-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-damon40
-rw-r--r--Documentation/ABI/testing/sysfs-memory-page-offline4
-rw-r--r--Documentation/admin-guide/cgroup-v1/memory.rst2
-rw-r--r--Documentation/admin-guide/mm/damon/usage.rst76
-rw-r--r--Documentation/admin-guide/mm/ksm.rst27
-rw-r--r--Documentation/admin-guide/mm/memory-hotplug.rst14
-rw-r--r--Documentation/admin-guide/mm/userfaultfd.rst15
-rw-r--r--Documentation/admin-guide/mm/zswap.rst14
-rw-r--r--Documentation/block/biovecs.rst1
-rw-r--r--Documentation/features/vm/TLB/arch-support.txt2
-rw-r--r--Documentation/mm/damon/design.rst24
-rw-r--r--Documentation/mm/frontswap.rst264
-rw-r--r--Documentation/mm/index.rst1
-rw-r--r--Documentation/mm/split_page_table_lock.rst12
-rw-r--r--Documentation/mm/vmemmap_dedup.rst1
-rw-r--r--Documentation/powerpc/index.rst1
-rw-r--r--Documentation/powerpc/vmemmap_dedup.rst101
-rw-r--r--Documentation/translations/zh_CN/mm/frontswap.rst196
-rw-r--r--Documentation/translations/zh_CN/mm/index.rst1
-rw-r--r--Documentation/translations/zh_CN/mm/split_page_table_lock.rst14
-rw-r--r--MAINTAINERS8
-rw-r--r--arch/arc/Kconfig1
-rw-r--r--arch/arc/include/asm/io.h7
-rw-r--r--arch/arc/mm/ioremap.c49
-rw-r--r--arch/arm/include/asm/hugetlb.h1
-rw-r--r--arch/arm/include/asm/tlb.h12
-rw-r--r--arch/arm/mm/fault-armv.c3
-rw-r--r--arch/arm/mm/mmu.c7
-rw-r--r--arch/arm64/Kconfig5
-rw-r--r--arch/arm64/include/asm/hugetlb.h16
-rw-r--r--arch/arm64/include/asm/io.h3
-rw-r--r--arch/arm64/include/asm/mte.h4
-rw-r--r--arch/arm64/include/asm/pgtable.h26
-rw-r--r--arch/arm64/include/asm/tlb.h14
-rw-r--r--arch/arm64/include/asm/tlbbatch.h12
-rw-r--r--arch/arm64/include/asm/tlbflush.h64
-rw-r--r--arch/arm64/kernel/mte.c37
-rw-r--r--arch/arm64/mm/fault.c2
-rw-r--r--arch/arm64/mm/ioremap.c10
-rw-r--r--arch/arm64/mm/mmu.c7
-rw-r--r--arch/csky/include/asm/pgalloc.h4
-rw-r--r--arch/hexagon/Kconfig1
-rw-r--r--arch/hexagon/include/asm/io.h11
-rw-r--r--arch/hexagon/include/asm/pgalloc.h8
-rw-r--r--arch/hexagon/kernel/hexagon_ksyms.c2
-rw-r--r--arch/hexagon/mm/Makefile2
-rw-r--r--arch/hexagon/mm/ioremap.c44
-rw-r--r--arch/ia64/Kconfig1
-rw-r--r--arch/ia64/include/asm/io.h13
-rw-r--r--arch/ia64/mm/ioremap.c41
-rw-r--r--arch/loongarch/Kconfig2
-rw-r--r--arch/loongarch/include/asm/io.h2
-rw-r--r--arch/loongarch/include/asm/pgalloc.h27
-rw-r--r--arch/loongarch/mm/pgtable.c7
-rw-r--r--arch/m68k/include/asm/io_mm.h2
-rw-r--r--arch/m68k/include/asm/kmap.h2
-rw-r--r--arch/m68k/include/asm/mcf_pgalloc.h47
-rw-r--r--arch/m68k/include/asm/sun3_pgalloc.h8
-rw-r--r--arch/m68k/mm/motorola.c4
-rw-r--r--arch/mips/include/asm/io.h5
-rw-r--r--arch/mips/include/asm/pgalloc.h32
-rw-r--r--arch/mips/mm/pgtable.c8
-rw-r--r--arch/nios2/include/asm/pgalloc.h8
-rw-r--r--arch/openrisc/Kconfig1
-rw-r--r--arch/openrisc/include/asm/io.h11
-rw-r--r--arch/openrisc/include/asm/pgalloc.h8
-rw-r--r--arch/openrisc/mm/ioremap.c82
-rw-r--r--arch/parisc/Kconfig1
-rw-r--r--arch/parisc/include/asm/io.h15
-rw-r--r--arch/parisc/mm/ioremap.c62
-rw-r--r--arch/powerpc/Kconfig3
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash.h9
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h155
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix.h49
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush-radix.h2
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush.h9
-rw-r--r--arch/powerpc/include/asm/io.h17
-rw-r--r--arch/powerpc/include/asm/pgalloc.h4
-rw-r--r--arch/powerpc/include/asm/pgtable.h27
-rw-r--r--arch/powerpc/kvm/book3s_hv_uvmem.c1
-rw-r--r--arch/powerpc/mm/book3s64/hash_pgtable.c2
-rw-r--r--arch/powerpc/mm/book3s64/mmu_context.c10
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c110
-rw-r--r--arch/powerpc/mm/book3s64/radix_hugetlbpage.c1
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c574
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c11
-rw-r--r--arch/powerpc/mm/fault.c4
-rw-r--r--arch/powerpc/mm/init_64.c37
-rw-r--r--arch/powerpc/mm/ioremap.c26
-rw-r--r--arch/powerpc/mm/ioremap_32.c19
-rw-r--r--arch/powerpc/mm/ioremap_64.c12
-rw-r--r--arch/powerpc/mm/pgtable-frag.c73
-rw-r--r--arch/powerpc/mm/pgtable.c8
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype1
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c2
-rw-r--r--arch/powerpc/xmon/xmon.c2
-rw-r--r--arch/riscv/Kconfig2
-rw-r--r--arch/riscv/include/asm/hugetlb.h1
-rw-r--r--arch/riscv/include/asm/pgalloc.h8
-rw-r--r--arch/riscv/include/asm/pgtable.h12
-rw-r--r--arch/riscv/mm/fault.c4
-rw-r--r--arch/riscv/mm/init.c16
-rw-r--r--arch/s390/Kconfig3
-rw-r--r--arch/s390/include/asm/io.h21
-rw-r--r--arch/s390/include/asm/pgalloc.h8
-rw-r--r--arch/s390/include/asm/tlb.h4
-rw-r--r--arch/s390/mm/fault.c2
-rw-r--r--arch/s390/mm/pgalloc.c176
-rw-r--r--arch/s390/pci/pci.c57
-rw-r--r--arch/sh/Kconfig1
-rw-r--r--arch/sh/include/asm/io.h89
-rw-r--r--arch/sh/include/asm/io_noioport.h7
-rw-r--r--arch/sh/include/asm/pgalloc.h9
-rw-r--r--arch/sh/mm/ioremap.c65
-rw-r--r--arch/sparc/include/asm/pgalloc_64.h4
-rw-r--r--arch/sparc/kernel/setup_32.c2
-rw-r--r--arch/sparc/mm/init_64.c33
-rw-r--r--arch/sparc/mm/srmmu.c5
-rw-r--r--arch/um/include/asm/pgalloc.h18
-rw-r--r--arch/x86/Kconfig7
-rw-r--r--arch/x86/include/asm/io.h5
-rw-r--r--arch/x86/include/asm/pgtable.h16
-rw-r--r--arch/x86/include/asm/tlbflush.h24
-rw-r--r--arch/x86/mm/fault.c4
-rw-r--r--arch/x86/mm/pgtable.c47
-rw-r--r--arch/x86/mm/tlb.c2
-rw-r--r--arch/x86/xen/mmu_pv.c2
-rw-r--r--arch/xtensa/Kconfig1
-rw-r--r--arch/xtensa/include/asm/io.h32
-rw-r--r--arch/xtensa/mm/ioremap.c58
-rw-r--r--drivers/acpi/acpi_memhotplug.c3
-rw-r--r--drivers/base/memory.c27
-rw-r--r--drivers/base/node.c4
-rw-r--r--drivers/dax/kmem.c4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_svm.c5
-rw-r--r--drivers/iommu/amd/iommu_v2.c10
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c29
-rw-r--r--drivers/iommu/intel/svm.c8
-rw-r--r--drivers/misc/ocxl/link.c8
-rw-r--r--drivers/net/ethernet/sfc/io.h2
-rw-r--r--drivers/net/ethernet/sfc/siena/io.h2
-rw-r--r--drivers/nvdimm/pfn_devs.c2
-rw-r--r--drivers/tty/sysrq.c2
-rw-r--r--drivers/tty/vt/keyboard.c2
-rw-r--r--fs/9p/cache.c2
-rw-r--r--fs/Kconfig7
-rw-r--r--fs/affs/file.c77
-rw-r--r--fs/affs/symlink.c12
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/buffer.c36
-rw-r--r--fs/cachefiles/namei.c2
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/exec.c1
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/move_extent.c19
-rw-r--r--fs/hugetlbfs/inode.c57
-rw-r--r--fs/jbd2/journal.c35
-rw-r--r--fs/nfs/fscache.c3
-rw-r--r--fs/ntfs3/inode.c10
-rw-r--r--fs/ocfs2/file.c7
-rw-r--r--fs/proc/base.c1
-rw-r--r--fs/proc/meminfo.c13
-rw-r--r--fs/proc/task_mmu.c26
-rw-r--r--fs/proc/task_nommu.c15
-rw-r--r--fs/smb/client/fscache.c2
-rw-r--r--fs/splice.c3
-rw-r--r--fs/udf/file.c6
-rw-r--r--fs/userfaultfd.c100
-rw-r--r--include/asm-generic/io.h31
-rw-r--r--include/asm-generic/iomap.h6
-rw-r--r--include/asm-generic/pgalloc.h88
-rw-r--r--include/asm-generic/tlb.h12
-rw-r--r--include/linux/backing-dev.h1
-rw-r--r--include/linux/bio.h5
-rw-r--r--include/linux/buffer_head.h4
-rw-r--r--include/linux/damon.h28
-rw-r--r--include/linux/frontswap.h91
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/highmem.h44
-rw-r--r--include/linux/hugetlb.h35
-rw-r--r--include/linux/ioremap.h30
-rw-r--r--include/linux/kfence.h11
-rw-r--r--include/linux/ksm.h20
-rw-r--r--include/linux/maple_tree.h44
-rw-r--r--include/linux/memblock.h14
-rw-r--r--include/linux/memcontrol.h11
-rw-r--r--include/linux/memory-tiers.h4
-rw-r--r--include/linux/memory.h8
-rw-r--r--include/linux/memory_hotplug.h3
-rw-r--r--include/linux/mm.h282
-rw-r--r--include/linux/mm_inline.h21
-rw-r--r--include/linux/mm_types.h102
-rw-r--r--include/linux/mm_types_task.h4
-rw-r--r--include/linux/mmap_lock.h18
-rw-r--r--include/linux/mmu_notifier.h104
-rw-r--r--include/linux/mmzone.h1
-rw-r--r--include/linux/net_mm.h17
-rw-r--r--include/linux/page-flags.h30
-rw-r--r--include/linux/page_ext.h9
-rw-r--r--include/linux/page_idle.h5
-rw-r--r--include/linux/page_table_check.h66
-rw-r--r--include/linux/pagemap.h21
-rw-r--r--include/linux/pgtable.h28
-rw-r--r--include/linux/pid_namespace.h39
-rw-r--r--include/linux/swap.h9
-rw-r--r--include/linux/swapfile.h5
-rw-r--r--include/linux/swapops.h15
-rw-r--r--include/linux/userfaultfd_k.h4
-rw-r--r--include/linux/zswap.h37
-rw-r--r--include/net/tcp.h1
-rw-r--r--include/trace/events/thp.h33
-rw-r--r--include/uapi/linux/userfaultfd.h25
-rw-r--r--init/initramfs.c2
-rw-r--r--kernel/events/core.c33
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/futex/core.c3
-rw-r--r--kernel/iomem.c13
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/pid.c3
-rw-r--r--kernel/pid_namespace.c6
-rw-r--r--kernel/pid_sysctl.h28
-rw-r--r--lib/maple_tree.c1090
-rw-r--r--lib/test_maple_tree.c141
-rw-r--r--lib/test_meminit.c2
-rw-r--r--mm/Kconfig15
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c6
-rw-r--r--mm/cma.c4
-rw-r--r--mm/compaction.c105
-rw-r--r--mm/damon/core-test.h74
-rw-r--r--mm/damon/core.c62
-rw-r--r--mm/damon/ops-common.c2
-rw-r--r--mm/damon/paddr.c2
-rw-r--r--mm/damon/sysfs-common.h2
-rw-r--r--mm/damon/sysfs-schemes.c107
-rw-r--r--mm/damon/sysfs.c26
-rw-r--r--mm/damon/vaddr.c23
-rw-r--r--mm/debug_vm_pgtable.c18
-rw-r--r--mm/filemap.c19
-rw-r--r--mm/frontswap.c283
-rw-r--r--mm/gup.c83
-rw-r--r--mm/huge_memory.c36
-rw-r--r--mm/hugetlb.c344
-rw-r--r--mm/hugetlb_vmemmap.c34
-rw-r--r--mm/init-mm.c2
-rw-r--r--mm/internal.h56
-rw-r--r--mm/ioremap.c41
-rw-r--r--mm/kfence/core.c123
-rw-r--r--mm/kfence/kfence.h5
-rw-r--r--mm/khugepaged.c474
-rw-r--r--mm/kmemleak.c15
-rw-r--r--mm/kmsan/hooks.c4
-rw-r--r--mm/kmsan/shadow.c8
-rw-r--r--mm/ksm.c44
-rw-r--r--mm/madvise.c11
-rw-r--r--mm/memblock.c5
-rw-r--r--mm/memcontrol.c33
-rw-r--r--mm/memfd.c58
-rw-r--r--mm/memory-failure.c130
-rw-r--r--mm/memory-tiers.c19
-rw-r--r--mm/memory.c233
-rw-r--r--mm/memory_hotplug.c192
-rw-r--r--mm/memtest.c22
-rw-r--r--mm/migrate.c5
-rw-r--r--mm/migrate_device.c30
-rw-r--r--mm/mlock.c3
-rw-r--r--mm/mm_init.c37
-rw-r--r--mm/mmap.c255
-rw-r--r--mm/mmu_notifier.c50
-rw-r--r--mm/mprotect.c7
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nommu.c55
-rw-r--r--mm/oom_kill.c3
-rw-r--r--mm/page_alloc.c109
-rw-r--r--mm/page_ext.c101
-rw-r--r--mm/page_io.c80
-rw-r--r--mm/page_isolation.c8
-rw-r--r--mm/page_owner.c2
-rw-r--r--mm/page_poison.c1
-rw-r--r--mm/page_table_check.c52
-rw-r--r--mm/page_vma_mapped.c12
-rw-r--r--mm/pgtable-generic.c97
-rw-r--r--mm/rmap.c83
-rw-r--r--mm/secretmem.c14
-rw-r--r--mm/shmem.c9
-rw-r--r--mm/show_mem.c10
-rw-r--r--mm/sparse-vmemmap.c3
-rw-r--r--mm/sparse.c3
-rw-r--r--mm/swap_state.c5
-rw-r--r--mm/swapfile.c63
-rw-r--r--mm/truncate.c8
-rw-r--r--mm/userfaultfd.c87
-rw-r--r--mm/util.c6
-rw-r--r--mm/vmscan.c42
-rw-r--r--mm/vmstat.c1
-rw-r--r--mm/z3fold.c27
-rw-r--r--mm/zsmalloc.c79
-rw-r--r--mm/zswap.c393
-rw-r--r--net/ipv4/tcp.c11
-rw-r--r--security/selinux/hooks.c7
-rw-r--r--tools/include/nolibc/stdio.h24
-rw-r--r--tools/testing/radix-tree/maple.c134
-rw-r--r--tools/testing/selftests/cgroup/.gitignore1
-rw-r--r--tools/testing/selftests/cgroup/Makefile2
-rw-r--r--tools/testing/selftests/cgroup/test_zswap.c286
-rw-r--r--tools/testing/selftests/damon/sysfs.sh6
-rw-r--r--tools/testing/selftests/kselftest.h9
-rw-r--r--tools/testing/selftests/kselftest/runner.sh7
-rw-r--r--tools/testing/selftests/memfd/memfd_test.c329
-rw-r--r--tools/testing/selftests/mm/.gitignore1
-rw-r--r--tools/testing/selftests/mm/Makefile81
-rw-r--r--tools/testing/selftests/mm/hugetlb-read-hwpoison.c322
-rw-r--r--tools/testing/selftests/mm/ksm_functional_tests.c98
-rw-r--r--tools/testing/selftests/mm/madv_populate.c26
-rw-r--r--tools/testing/selftests/mm/migration.c12
-rw-r--r--tools/testing/selftests/mm/mrelease_test.c1
-rwxr-xr-xtools/testing/selftests/mm/run_vmtests.sh80
-rw-r--r--tools/testing/selftests/mm/settings2
-rw-r--r--tools/testing/selftests/mm/thuge-gen.c4
-rw-r--r--tools/testing/selftests/mm/transhuge-stress.c12
-rw-r--r--tools/testing/selftests/mm/uffd-common.c5
-rw-r--r--tools/testing/selftests/mm/uffd-common.h3
-rw-r--r--tools/testing/selftests/mm/uffd-stress.c32
-rw-r--r--tools/testing/selftests/mm/uffd-unit-tests.c117
-rw-r--r--tools/testing/selftests/mm/va_high_addr_switch.c2
-rw-r--r--tools/testing/selftests/proc/proc-empty-vm.c4
327 files changed, 7383 insertions, 5415 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon
index 2744f21b5a6b..334352d198f8 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-damon
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@@ -29,8 +29,10 @@ Description: Writing 'on' or 'off' to this file makes the kdamond starts or
file updates contents of schemes stats files of the kdamond.
Writing 'update_schemes_tried_regions' to the file updates
contents of 'tried_regions' directory of every scheme directory
- of this kdamond. Writing 'clear_schemes_tried_regions' to the
- file removes contents of the 'tried_regions' directory.
+ of this kdamond. Writing 'update_schemes_tried_bytes' to the
+ file updates only '.../tried_regions/total_bytes' files of this
+ kdamond. Writing 'clear_schemes_tried_regions' to the file
+ removes contents of the 'tried_regions' directory.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/pid
Date: Mar 2022
@@ -269,8 +271,10 @@ What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/
Date: Dec 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Writing to and reading from this file sets and gets the type of
- the memory of the interest. 'anon' for anonymous pages, or
- 'memcg' for specific memory cgroup can be written and read.
+ the memory of the interest. 'anon' for anonymous pages,
+ 'memcg' for specific memory cgroup, 'addr' for address range
+ (an open-ended interval), or 'target' for DAMON monitoring
+ target can be written and read.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/memcg_path
Date: Dec 2022
@@ -279,6 +283,27 @@ Description: If 'memcg' is written to the 'type' file, writing to and
reading from this file sets and gets the path to the memory
cgroup of the interest.
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/addr_start
+Date: Jul 2023
+Contact: SeongJae Park <sj@kernel.org>
+Description: If 'addr' is written to the 'type' file, writing to or reading
+ from this file sets or gets the start address of the address
+ range for the filter.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/addr_end
+Date: Jul 2023
+Contact: SeongJae Park <sj@kernel.org>
+Description: If 'addr' is written to the 'type' file, writing to or reading
+ from this file sets or gets the end address of the address
+ range for the filter.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/target_idx
+Date: Dec 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: If 'target' is written to the 'type' file, writing to or
+ reading from this file sets or gets the index of the DAMON
+ monitoring target of the interest.
+
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/matching
Date: Dec 2022
Contact: SeongJae Park <sj@kernel.org>
@@ -317,6 +342,13 @@ Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the number of the exceed events of
the scheme's quotas.
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/total_bytes
+Date: Jul 2023
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the total amount of memory that the
+ corresponding DAMON-based Operation Scheme's action has tried to be
+ applied to.
+
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/start
Date: Oct 2022
Contact: SeongJae Park <sj@kernel.org>
diff --git a/Documentation/ABI/testing/sysfs-memory-page-offline b/Documentation/ABI/testing/sysfs-memory-page-offline
index e14703f12fdf..00f4e35f916f 100644
--- a/Documentation/ABI/testing/sysfs-memory-page-offline
+++ b/Documentation/ABI/testing/sysfs-memory-page-offline
@@ -10,7 +10,7 @@ Description:
dropping it if possible. The kernel will then be placed
on the bad page list and never be reused.
- The offlining is done in kernel specific granuality.
+ The offlining is done in kernel specific granularity.
Normally it's the base page size of the kernel, but
this might change.
@@ -35,7 +35,7 @@ Description:
to access this page assuming it's poisoned by the
hardware.
- The offlining is done in kernel specific granuality.
+ The offlining is done in kernel specific granularity.
Normally it's the base page size of the kernel, but
this might change.
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index fabaad3fd9c2..8d3afeede10e 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -92,8 +92,6 @@ Brief summary of control files.
memory.oom_control set/show oom controls.
memory.numa_stat show the number of memory usage per numa
node
- memory.kmem.limit_in_bytes This knob is deprecated and writing to
- it will return -ENOTSUPP.
memory.kmem.usage_in_bytes show current kernel memory allocation
memory.kmem.failcnt show the number of kernel memory usage
hits limits
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 2d495fa85a0e..084f0a32b421 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -87,7 +87,7 @@ comma (","). ::
│ │ │ │ │ │ │ filters/nr_filters
│ │ │ │ │ │ │ │ 0/type,matching,memcg_id
│ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
- │ │ │ │ │ │ │ tried_regions/
+ │ │ │ │ │ │ │ tried_regions/total_bytes
│ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age
│ │ │ │ │ │ │ │ ...
│ │ │ │ │ │ ...
@@ -127,14 +127,18 @@ in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the
user inputs in the sysfs files except ``state`` file again. Writing
``update_schemes_stats`` to ``state`` file updates the contents of stats files
for each DAMON-based operation scheme of the kdamond. For details of the
-stats, please refer to :ref:`stats section <sysfs_schemes_stats>`. Writing
-``update_schemes_tried_regions`` to ``state`` file updates the DAMON-based
-operation scheme action tried regions directory for each DAMON-based operation
-scheme of the kdamond. Writing ``clear_schemes_tried_regions`` to ``state``
-file clears the DAMON-based operating scheme action tried regions directory for
-each DAMON-based operation scheme of the kdamond. For details of the
-DAMON-based operation scheme action tried regions directory, please refer to
-:ref:`tried_regions section <sysfs_schemes_tried_regions>`.
+stats, please refer to :ref:`stats section <sysfs_schemes_stats>`.
+
+Writing ``update_schemes_tried_regions`` to ``state`` file updates the
+DAMON-based operation scheme action tried regions directory for each
+DAMON-based operation scheme of the kdamond. Writing
+``update_schemes_tried_bytes`` to ``state`` file updates only
+``.../tried_regions/total_bytes`` files. Writing
+``clear_schemes_tried_regions`` to ``state`` file clears the DAMON-based
+operation scheme action tried regions directory for each DAMON-based operation
+scheme of the kdamond. For details of the DAMON-based operation scheme action
+tried regions directory, please refer to :ref:`tried_regions section
+<sysfs_schemes_tried_regions>`.
If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread.
@@ -359,15 +363,21 @@ number (``N``) to the file creates the number of child directories named ``0``
to ``N-1``. Each directory represents each filter. The filters are evaluated
in the numeric order.
-Each filter directory contains three files, namely ``type``, ``matcing``, and
-``memcg_path``. You can write one of two special keywords, ``anon`` for
-anonymous pages, or ``memcg`` for specific memory cgroup filtering. In case of
-the memory cgroup filtering, you can specify the memory cgroup of the interest
-by writing the path of the memory cgroup from the cgroups mount point to
-``memcg_path`` file. You can write ``Y`` or ``N`` to ``matching`` file to
-filter out pages that does or does not match to the type, respectively. Then,
-the scheme's action will not be applied to the pages that specified to be
-filtered out.
+Each filter directory contains six files, namely ``type``, ``matching``,
+``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``. To the
+``type`` file, you can write one of four special keywords: ``anon`` for
+anonymous pages, ``memcg`` for a specific memory cgroup, ``addr`` for a
+specific address range (an open-ended interval), or ``target`` for a specific
+DAMON monitoring target. For memory cgroup filtering, specify the memory
+cgroup of interest by writing the path of the memory cgroup from the cgroups
+mount point to the ``memcg_path`` file. For address range filtering, write
+the start and end addresses of the range to the ``addr_start`` and
+``addr_end`` files, respectively. For DAMON monitoring target filtering,
+write the index of the target in the DAMON context's monitoring targets list
+to the ``target_idx`` file. You can write ``Y`` or ``N`` to the ``matching``
+file to filter out pages that do or do not match the type, respectively.
+Then, the scheme's action will not be applied to the pages specified to be
+filtered out.
For example, below restricts a DAMOS action to be applied to only non-anonymous
pages of all memory cgroups except ``/having_care_already``.::
@@ -381,8 +391,14 @@ pages of all memory cgroups except ``/having_care_already``.::
echo /having_care_already > 1/memcg_path
echo N > 1/matching
-Note that filters are currently supported only when ``paddr``
-`implementation <sysfs_contexts>` is being used.
+Note that ``anon`` and ``memcg`` filters are currently supported only when
+``paddr`` `implementation <sysfs_contexts>` is being used.
+
+Also, memory regions that are filtered out by ``addr`` or ``target`` filters
+are not counted as regions that the scheme has tried to apply its action to,
+while regions that are filtered out by the other filter types are counted as
+tried. This difference is reflected in the :ref:`stats <damos_stats>` and
+:ref:`tried regions <sysfs_schemes_tried_regions>`.
.. _sysfs_schemes_stats:
@@ -406,13 +422,21 @@ stats by writing a special keyword, ``update_schemes_stats`` to the relevant
schemes/<N>/tried_regions/
--------------------------
+This directory initially has one file, ``total_bytes``.
+
When a special keyword, ``update_schemes_tried_regions``, is written to the
-relevant ``kdamonds/<N>/state`` file, DAMON creates directories named integer
-starting from ``0`` under this directory. Each directory contains files
-exposing detailed information about each of the memory region that the
-corresponding scheme's ``action`` has tried to be applied under this directory,
-during next :ref:`aggregation interval <sysfs_monitoring_attrs>`. The
-information includes address range, ``nr_accesses``, and ``age`` of the region.
+relevant ``kdamonds/<N>/state`` file, DAMON updates the ``total_bytes`` file so
+that reading it returns the total size of the scheme's tried regions, and
+creates directories named as integers starting from ``0`` under this
+directory. Each directory contains files exposing detailed information about
+each of the memory regions that the corresponding scheme's ``action`` has
+tried to be applied to during the next :ref:`aggregation interval
+<sysfs_monitoring_attrs>`. The information includes the address range,
+``nr_accesses``, and ``age`` of the region.
+
+Writing ``update_schemes_tried_bytes`` to the relevant ``kdamonds/<N>/state``
+file will only update the ``total_bytes`` file, and will not create the
+subdirectories.
The directories will be removed when another special keyword,
``clear_schemes_tried_regions``, is written to the relevant
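
The address-range filter files described in this hunk lend themselves to a short
usage sketch. A minimal, hedged example (the ``kdamonds/0/contexts/0/schemes/0``
indices and the address values are made up for illustration) that drives the
same sysfs files from C::

    /* Sketch only: configure one DAMOS 'addr' filter through sysfs.
     * Adjust the kdamond/context/scheme indices to the actual setup. */
    #include <stdio.h>
    #include <stdlib.h>

    #define FILTERS \
        "/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/filters"

    static void write_sysfs(const char *path, const char *val)
    {
        FILE *f = fopen(path, "w");

        if (!f) {
            perror(path);
            exit(1);
        }
        fprintf(f, "%s", val);
        fclose(f);
    }

    int main(void)
    {
        write_sysfs(FILTERS "/nr_filters", "1");
        write_sysfs(FILTERS "/0/type", "addr");
        write_sysfs(FILTERS "/0/addr_start", "4096");
        write_sysfs(FILTERS "/0/addr_end", "8192");
        /* 'Y' filters out (skips) pages in [4096, 8192) */
        write_sysfs(FILTERS "/0/matching", "Y");
        return 0;
    }
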
diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst
index 7626392fe82c..776f244bdae4 100644
--- a/Documentation/admin-guide/mm/ksm.rst
+++ b/Documentation/admin-guide/mm/ksm.rst
@@ -159,6 +159,8 @@ The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``:
general_profit
how effective is KSM. The calculation is explained below.
+pages_scanned
+ how many pages are being scanned for ksm
pages_shared
how many shared pages are being used
pages_sharing
@@ -173,6 +175,13 @@ stable_node_chains
the number of KSM pages that hit the ``max_page_sharing`` limit
stable_node_dups
number of duplicated KSM pages
+ksm_zero_pages
+ how many zero pages, mapped by KSM when deduplicating, are still
+ mapped into processes.
+
+When ``use_zero_pages`` is/was enabled, the sum of ``pages_sharing`` +
+``ksm_zero_pages`` represents the actual number of pages saved by KSM.
+If ``use_zero_pages`` has never been enabled, ``ksm_zero_pages`` is 0.
A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good
sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing``
@@ -196,21 +205,25 @@ several times, which are unprofitable memory consumed.
1) How to determine whether KSM save memory or consume memory in system-wide
range? Here is a simple approximate calculation for reference::
- general_profit =~ pages_sharing * sizeof(page) - (all_rmap_items) *
+ general_profit =~ ksm_saved_pages * sizeof(page) - (all_rmap_items) *
sizeof(rmap_item);
- where all_rmap_items can be easily obtained by summing ``pages_sharing``,
- ``pages_shared``, ``pages_unshared`` and ``pages_volatile``.
+ where ksm_saved_pages equals the sum of ``pages_sharing`` +
+ ``ksm_zero_pages`` of the system, and all_rmap_items can be easily
+ obtained by summing ``pages_sharing``, ``pages_shared``, ``pages_unshared``
+ and ``pages_volatile``.
2) The KSM profit inner a single process can be similarly obtained by the
following approximate calculation::
- process_profit =~ ksm_merging_pages * sizeof(page) -
+ process_profit =~ ksm_saved_pages * sizeof(page) -
ksm_rmap_items * sizeof(rmap_item).
- where ksm_merging_pages is shown under the directory ``/proc/<pid>/``,
- and ksm_rmap_items is shown in ``/proc/<pid>/ksm_stat``. The process profit
- is also shown in ``/proc/<pid>/ksm_stat`` as ksm_process_profit.
+ where ksm_saved_pages equals the sum of ``ksm_merging_pages`` and
+ ``ksm_zero_pages``, both of which are shown under the directory
+ ``/proc/<pid>/ksm_stat``, and ksm_rmap_items is also shown in
+ ``/proc/<pid>/ksm_stat``. The process profit is also shown in
+ ``/proc/<pid>/ksm_stat`` as ksm_process_profit.
From the perspective of application, a high ratio of ``ksm_rmap_items`` to
``ksm_merging_pages`` means a bad madvise-applied policy, so developers or
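
To make the profit approximations above concrete, here is a small worked
example; the counter values are invented and the 4096/64 byte figures stand in
for sizeof(page) and sizeof(rmap_item), so treat the numbers as illustrative
only::

    /* Worked example of the general_profit approximation. */
    #include <stdio.h>

    int main(void)
    {
        long pages_sharing = 10000, ksm_zero_pages = 2000;
        long pages_shared = 500, pages_unshared = 3000, pages_volatile = 100;
        long page_size = 4096, rmap_item_size = 64;    /* assumed sizes */

        long ksm_saved_pages = pages_sharing + ksm_zero_pages;
        long all_rmap_items = pages_sharing + pages_shared +
                              pages_unshared + pages_volatile;
        long general_profit = ksm_saved_pages * page_size -
                              all_rmap_items * rmap_item_size;

        printf("approximate general_profit: %ld bytes\n", general_profit);
        return 0;
    }
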
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 1b02fe5807cc..2994958c7ce8 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -433,6 +433,18 @@ The following module parameters are currently defined:
memory in a way that huge pages in bigger
granularity cannot be formed on hotplugged
memory.
+
+ With value "force" it could result in memory
+ wastage due to memmap size limitations. For
+ example, if the memmap for a memory block
+ requires 1 MiB, but the pageblock size is 2
+ MiB, 1 MiB of hotplugged memory will be wasted.
+ Note that there are still cases where the
+ feature cannot be enforced: for example, if the
+ memmap is smaller than a single page, or if the
+ architecture does not support the forced mode
+ in all configurations.
+
``online_policy`` read-write: Set the basic policy used for
automatic zone selection when onlining memory
blocks without specifying a target zone.
@@ -669,7 +681,7 @@ when still encountering permanently unmovable pages within ZONE_MOVABLE
(-> BUG), memory offlining will keep retrying until it eventually succeeds.
When offlining is triggered from user space, the offlining context can be
-terminated by sending a fatal signal. A timeout based offlining can easily be
+terminated by sending a signal. A timeout based offlining can easily be
implemented via::
% timeout $TIMEOUT offline_block | failure_handling
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 7c304e432205..4349a8c2b978 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -244,6 +244,21 @@ write-protected (so future writes will also result in a WP fault). These ioctls
support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP``
respectively) to configure the mapping this way.
+Memory Poisoning Emulation
+--------------------------
+
+In response to a fault (either missing or minor), an action userspace can
+take to "resolve" it is to issue a ``UFFDIO_POISON``. This will cause any
+future faulters to either get a SIGBUS, or in KVM's case the guest will
+receive an MCE as if there were hardware memory poisoning.
+
+This is used to emulate hardware memory poisoning. Imagine a VM running on a
+machine which experiences a real hardware memory error. Later, we live migrate
+the VM to another physical machine. Since we want the migration to be
+transparent to the guest, we want that same address range to act as if it was
+still poisoned, even though it's on a new physical host which ostensibly
+doesn't have a memory error in the exact same spot.
+
QEMU/KVM
========
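
A minimal sketch of how userspace might use the ioctl described above. It
assumes ``uffd`` is a userfaultfd already registered over the faulting range,
and the struct and flag names follow the uAPI this series adds; treat it as an
illustration rather than a reference::

    /* Sketch: mark a range poisoned so future accesses raise SIGBUS. */
    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <stdio.h>

    static int poison_range(int uffd, unsigned long start, unsigned long len)
    {
        struct uffdio_poison args = {
            .range = { .start = start, .len = len },
            .mode = 0,    /* or UFFDIO_POISON_MODE_DONTWAKE */
        };

        if (ioctl(uffd, UFFDIO_POISON, &args) == -1) {
            perror("UFFDIO_POISON");
            return -1;
        }
        /* args.updated reports how much of the range was handled */
        return 0;
    }
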
diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst
index c5c2c7dbb155..45b98390e938 100644
--- a/Documentation/admin-guide/mm/zswap.rst
+++ b/Documentation/admin-guide/mm/zswap.rst
@@ -49,7 +49,7 @@ compressed pool.
Design
======
-Zswap receives pages for compression through the Frontswap API and is able to
+Zswap receives pages for compression from the swap subsystem and is able to
evict pages from its own compressed pool on an LRU basis and write them back to
the backing swap device in the case that the compressed pool is full.
@@ -70,19 +70,19 @@ means the compression ratio will always be 2:1 or worse (because of half-full
zbud pages). The zsmalloc type zpool has a more complex compressed page
storage method, and it can achieve greater storage densities.
-When a swap page is passed from frontswap to zswap, zswap maintains a mapping
+When a swap page is passed from swapout to zswap, zswap maintains a mapping
of the swap entry, a combination of the swap type and swap offset, to the zpool
handle that references that compressed swap page. This mapping is achieved
with a red-black tree per swap type. The swap offset is the search key for the
tree nodes.
-During a page fault on a PTE that is a swap entry, frontswap calls the zswap
-load function to decompress the page into the page allocated by the page fault
-handler.
+During a page fault on a PTE that is a swap entry, the swapin code calls the
+zswap load function to decompress the page into the page allocated by the page
+fault handler.
Once there are no PTEs referencing a swap page stored in zswap (i.e. the count
-in the swap_map goes to 0) the swap code calls the zswap invalidate function,
-via frontswap, to free the compressed entry.
+in the swap_map goes to 0) the swap code calls the zswap invalidate function
+to free the compressed entry.
Zswap seeks to be simple in its policies. Sysfs attributes allow for one user
controlled policy:
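
The swap-entry-to-zpool-handle mapping described in this hunk can be pictured
with a tiny userspace stand-in. Here a sorted array plus ``bsearch()`` plays
the role of the kernel's per-swap-type red-black tree, and the offsets and
handles are invented for the example::

    /* Illustration only: look up a "zpool handle" by swap offset. */
    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
        unsigned long offset;    /* search key: swap offset */
        unsigned long handle;    /* handle of the compressed page */
    };

    static int cmp_offset(const void *key, const void *elem)
    {
        unsigned long off = *(const unsigned long *)key;
        const struct entry *e = elem;

        return (off > e->offset) - (off < e->offset);
    }

    int main(void)
    {
        /* one table per swap type; entries kept sorted by offset */
        struct entry type0[] = {
            { .offset = 3,  .handle = 0x1000 },
            { .offset = 9,  .handle = 0x2000 },
            { .offset = 42, .handle = 0x3000 },
        };
        unsigned long want = 9;
        struct entry *e = bsearch(&want, type0, 3, sizeof(type0[0]),
                                  cmp_offset);

        if (e)
            printf("offset %lu -> handle 0x%lx\n", want, e->handle);
        return 0;
    }
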
diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst
index ddb867e0185b..b9dc0c9dbee4 100644
--- a/Documentation/block/biovecs.rst
+++ b/Documentation/block/biovecs.rst
@@ -134,6 +134,7 @@ Usage of helpers:
bio_for_each_bvec_all()
bio_first_bvec_all()
bio_first_page_all()
+ bio_first_folio_all()
bio_last_bvec_all()
* The following helpers iterate over single-page segment. The passed 'struct
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
index 7f049c251a79..76208db88f3b 100644
--- a/Documentation/features/vm/TLB/arch-support.txt
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | TODO |
| arc: | TODO |
| arm: | TODO |
- | arm64: | N/A |
+ | arm64: | ok |
| csky: | TODO |
| hexagon: | TODO |
| ia64: | TODO |
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 4bfdf1d30c4a..a20383d01a95 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -380,12 +380,24 @@ number of filters for each scheme. Each filter specifies the type of target
memory, and whether it should exclude the memory of the type (filter-out), or
all except the memory of the type (filter-in).
-As of this writing, anonymous page type and memory cgroup type are supported by
-the feature. Some filter target types can require additional arguments. For
-example, the memory cgroup filter type asks users to specify the file path of
-the memory cgroup for the filter. Hence, users can apply specific schemes to
-only anonymous pages, non-anonymous pages, pages of specific cgroups, all pages
-excluding those of specific cgroups, and any combination of those.
+Currently, anonymous page, memory cgroup, address range, and DAMON monitoring
+target type filters are supported by the feature. Some filter target types
+require additional arguments. The memory cgroup filter type asks users to
+specify the file path of the memory cgroup for the filter. The address range
+type asks for the start and end addresses of the range. The DAMON monitoring
+target type asks for the index of the target in the context's monitoring
+targets list. Hence, users can apply specific schemes to only anonymous
+pages, non-anonymous pages, pages of specific cgroups, all pages excluding
+those of specific cgroups, pages in a specific address range, pages in
+specific DAMON monitoring targets, and any combination of those.
+
+To handle filters efficiently, the address range and DAMON monitoring target
+type filters are handled by the core layer, while the others are handled by
+the operations set. If a memory region is filtered out by a core
+layer-handled filter, it is not counted as a region that the scheme has tried
+to apply its action to. In contrast, if a memory region is filtered out by an
+operations set layer-handled filter, it is counted as tried. This difference
+in accounting leads to changes in the statistics.
Application Programming Interface
diff --git a/Documentation/mm/frontswap.rst b/Documentation/mm/frontswap.rst
deleted file mode 100644
index c892412988af..000000000000
--- a/Documentation/mm/frontswap.rst
+++ /dev/null
@@ -1,264 +0,0 @@
-=========
-Frontswap
-=========
-
-Frontswap provides a "transcendent memory" interface for swap pages.
-In some environments, dramatic performance savings may be obtained because
-swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
-
-.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
-
-Frontswap is so named because it can be thought of as the opposite of
-a "backing" store for a swap device. The storage is assumed to be
-a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming
-to the requirements of transcendent memory (such as Xen's "tmem", or
-in-kernel compressed memory, aka "zcache", or future RAM-like devices);
-this pseudo-RAM device is not directly accessible or addressable by the
-kernel and is of unknown and possibly time-varying size. The driver
-links itself to frontswap by calling frontswap_register_ops to set the
-frontswap_ops funcs appropriately and the functions it provides must
-conform to certain policies as follows:
-
-An "init" prepares the device to receive frontswap pages associated
-with the specified swap device number (aka "type"). A "store" will
-copy the page to transcendent memory and associate it with the type and
-offset associated with the page. A "load" will copy the page, if found,
-from transcendent memory into kernel memory, but will NOT remove the page
-from transcendent memory. An "invalidate_page" will remove the page
-from transcendent memory and an "invalidate_area" will remove ALL pages
-associated with the swap type (e.g., like swapoff) and notify the "device"
-to refuse further stores with that swap type.
-
-Once a page is successfully stored, a matching load on the page will normally
-succeed. So when the kernel finds itself in a situation where it needs
-to swap out a page, it first attempts to use frontswap. If the store returns
-success, the data has been successfully saved to transcendent memory and
-a disk write and, if the data is later read back, a disk read are avoided.
-If a store returns failure, transcendent memory has rejected the data, and the
-page can be written to swap as usual.
-
-Note that if a page is stored and the page already exists in transcendent memory
-(a "duplicate" store), either the store succeeds and the data is overwritten,
-or the store fails AND the page is invalidated. This ensures stale data may
-never be obtained from frontswap.
-
-If properly configured, monitoring of frontswap is done via debugfs in
-the `/sys/kernel/debug/frontswap` directory. The effectiveness of
-frontswap can be measured (across all swap devices) with:
-
-``failed_stores``
- how many store attempts have failed
-
-``loads``
- how many loads were attempted (all should succeed)
-
-``succ_stores``
- how many store attempts have succeeded
-
-``invalidates``
- how many invalidates were attempted
-
-A backend implementation may provide additional metrics.
-
-FAQ
-===
-
-* Where's the value?
-
-When a workload starts swapping, performance falls through the floor.
-Frontswap significantly increases performance in many such workloads by
-providing a clean, dynamic interface to read and write swap pages to
-"transcendent memory" that is otherwise not directly addressable to the kernel.
-This interface is ideal when data is transformed to a different form
-and size (such as with compression) or secretly moved (as might be
-useful for write-balancing for some RAM-like devices). Swap pages (and
-evicted page-cache pages) are a great use for this kind of slower-than-RAM-
-but-much-faster-than-disk "pseudo-RAM device".
-
-Frontswap with a fairly small impact on the kernel,
-provides a huge amount of flexibility for more dynamic, flexible RAM
-utilization in various system configurations:
-
-In the single kernel case, aka "zcache", pages are compressed and
-stored in local memory, thus increasing the total anonymous pages
-that can be safely kept in RAM. Zcache essentially trades off CPU
-cycles used in compression/decompression for better memory utilization.
-Benchmarks have shown little or no impact when memory pressure is
-low while providing a significant performance improvement (25%+)
-on some workloads under high memory pressure.
-
-"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
-support for clustered systems. Frontswap pages are locally compressed
-as in zcache, but then "remotified" to another system's RAM. This
-allows RAM to be dynamically load-balanced back-and-forth as needed,
-i.e. when system A is overcommitted, it can swap to system B, and
-vice versa. RAMster can also be configured as a memory server so
-many servers in a cluster can swap, dynamically as needed, to a single
-server configured with a large amount of RAM... without pre-configuring
-how much of the RAM is available for each of the clients!
-
-In the virtual case, the whole point of virtualization is to statistically
-multiplex physical resources across the varying demands of multiple
-virtual machines. This is really hard to do with RAM and efforts to do
-it well with no kernel changes have essentially failed (except in some
-well-publicized special-case workloads).
-Specifically, the Xen Transcendent Memory backend allows otherwise
-"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
-virtual machines, but the pages can be compressed and deduplicated to
-optimize RAM utilization. And when guest OS's are induced to surrender
-underutilized RAM (e.g. with "selfballooning"), sudden unexpected
-memory pressure may result in swapping; frontswap allows those pages
-to be swapped to and from hypervisor RAM (if overall host system memory
-conditions allow), thus mitigating the potentially awful performance impact
-of unplanned swapping.
-
-A KVM implementation is underway and has been RFC'ed to lkml. And,
-using frontswap, investigation is also underway on the use of NVM as
-a memory extension technology.
-
-* Sure there may be performance advantages in some situations, but
- what's the space/time overhead of frontswap?
-
-If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
-nothingness and the only overhead is a few extra bytes per swapon'ed
-swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
-registers, there is one extra global variable compared to zero for
-every swap page read or written. If CONFIG_FRONTSWAP is enabled
-AND a frontswap backend registers AND the backend fails every "store"
-request (i.e. provides no memory despite claiming it might),
-CPU overhead is still negligible -- and since every frontswap fail
-precedes a swap page write-to-disk, the system is highly likely
-to be I/O bound and using a small fraction of a percent of a CPU
-will be irrelevant anyway.
-
-As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
-registers, one bit is allocated for every swap page for every swap
-device that is swapon'd. This is added to the EIGHT bits (which
-was sixteen until about 2.6.34) that the kernel already allocates
-for every swap page for every swap device that is swapon'd. (Hugh
-Dickins has observed that frontswap could probably steal one of
-the existing eight bits, but let's worry about that minor optimization
-later.) For very large swap disks (which are rare) on a standard
-4K pagesize, this is 1MB per 32GB swap.
-
-When swap pages are stored in transcendent memory instead of written
-out to disk, there is a side effect that this may create more memory
-pressure that can potentially outweigh the other advantages. A
-backend, such as zcache, must implement policies to carefully (but
-dynamically) manage memory limits to ensure this doesn't happen.
-
-* OK, how about a quick overview of what this frontswap patch does
- in terms that a kernel hacker can grok?
-
-Let's assume that a frontswap "backend" has registered during
-kernel initialization; this registration indicates that this
-frontswap backend has access to some "memory" that is not directly
-accessible by the kernel. Exactly how much memory it provides is
-entirely dynamic and random.
-
-Whenever a swap-device is swapon'd frontswap_init() is called,
-passing the swap device number (aka "type") as a parameter.
-This notifies frontswap to expect attempts to "store" swap pages
-associated with that number.
-
-Whenever the swap subsystem is readying a page to write to a swap
-device (c.f swap_writepage()), frontswap_store is called. Frontswap
-consults with the frontswap backend and if the backend says it does NOT
-have room, frontswap_store returns -1 and the kernel swaps the page
-to the swap device as normal. Note that the response from the frontswap
-backend is unpredictable to the kernel; it may choose to never accept a
-page, it could accept every ninth page, or it might accept every
-page. But if the backend does accept a page, the data from the page
-has already been copied and associated with the type and offset,
-and the backend guarantees the persistence of the data. In this case,
-frontswap sets a bit in the "frontswap_map" for the swap device
-corresponding to the page offset on the swap device to which it would
-otherwise have written the data.
-
-When the swap subsystem needs to swap-in a page (swap_readpage()),
-it first calls frontswap_load() which checks the frontswap_map to
-see if the page was earlier accepted by the frontswap backend. If
-it was, the page of data is filled from the frontswap backend and
-the swap-in is complete. If not, the normal swap-in code is
-executed to obtain the page of data from the real swap device.
-
-So every time the frontswap backend accepts a page, a swap device read
-and (potentially) a swap device write are replaced by a "frontswap backend
-store" and (possibly) a "frontswap backend loads", which are presumably much
-faster.
-
-* Can't frontswap be configured as a "special" swap device that is
- just higher priority than any real swap device (e.g. like zswap,
- or maybe swap-over-nbd/NFS)?
-
-No. First, the existing swap subsystem doesn't allow for any kind of
-swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy,
-but this would require fairly drastic changes. Even if it were
-rewritten, the existing swap subsystem uses the block I/O layer which
-assumes a swap device is fixed size and any page in it is linearly
-addressable. Frontswap barely touches the existing swap subsystem,
-and works around the constraints of the block I/O subsystem to provide
-a great deal of flexibility and dynamicity.
-
-For example, the acceptance of any swap page by the frontswap backend is
-entirely unpredictable. This is critical to the definition of frontswap
-backends because it grants completely dynamic discretion to the
-backend. In zcache, one cannot know a priori how compressible a page is.
-"Poorly" compressible pages can be rejected, and "poorly" can itself be
-defined dynamically depending on current memory constraints.
-
-Further, frontswap is entirely synchronous whereas a real swap
-device is, by definition, asynchronous and uses block I/O. The
-block I/O layer is not only unnecessary, but may perform "optimizations"
-that are inappropriate for a RAM-oriented device including delaying
-the write of some pages for a significant amount of time. Synchrony is
-required to ensure the dynamicity of the backend and to avoid thorny race
-conditions that would unnecessarily and greatly complicate frontswap
-and/or the block I/O subsystem. That said, only the initial "store"
-and "load" operations need be synchronous. A separate asynchronous thread
-is free to manipulate the pages stored by frontswap. For example,
-the "remotification" thread in RAMster uses standard asynchronous
-kernel sockets to move compressed frontswap pages to a remote machine.
-Similarly, a KVM guest-side implementation could do in-guest compression
-and use "batched" hypercalls.
-
-In a virtualized environment, the dynamicity allows the hypervisor
-(or host OS) to do "intelligent overcommit". For example, it can
-choose to accept pages only until host-swapping might be imminent,
-then force guests to do their own swapping.
-
-There is a downside to the transcendent memory specifications for
-frontswap: Since any "store" might fail, there must always be a real
-slot on a real swap device to swap the page. Thus frontswap must be
-implemented as a "shadow" to every swapon'd device with the potential
-capability of holding every page that the swap device might have held
-and the possibility that it might hold no pages at all. This means
-that frontswap cannot contain more pages than the total of swapon'd
-swap devices. For example, if NO swap device is configured on some
-installation, frontswap is useless. Swapless portable devices
-can still use frontswap but a backend for such devices must configure
-some kind of "ghost" swap device and ensure that it is never used.
-
-* Why this weird definition about "duplicate stores"? If a page
- has been previously successfully stored, can't it always be
- successfully overwritten?
-
-Nearly always it can, but no, sometimes it cannot. Consider an example
-where data is compressed and the original 4K page has been compressed
-to 1K. Now an attempt is made to overwrite the page with data that
-is non-compressible and so would take the entire 4K. But the backend
-has no more space. In this case, the store must be rejected. Whenever
-frontswap rejects a store that would overwrite, it also must invalidate
-the old data and ensure that it is no longer accessible. Since the
-swap subsystem then writes the new data to the read swap device,
-this is the correct course of action to ensure coherency.
-
-* Why does the frontswap patch create the new include file swapfile.h?
-
-The frontswap code depends on some swap-subsystem-internal data
-structures that have, over the years, moved back and forth between
-static and global. This seemed a reasonable compromise: Define
-them as global but declare them in a new include file that isn't
-included by the large number of source files that include swap.h.
-
-Dan Magenheimer, last updated April 9, 2012
diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst
index 5a94a921ea40..31d2ac306438 100644
--- a/Documentation/mm/index.rst
+++ b/Documentation/mm/index.rst
@@ -44,7 +44,6 @@ above structured documentation, or deleted if it has served its purpose.
balance
damon/index
free_page_reporting
- frontswap
hmm
hwpoison
hugetlbfs_reserv
diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst
index a834fad9de12..e4f6972eb6c0 100644
--- a/Documentation/mm/split_page_table_lock.rst
+++ b/Documentation/mm/split_page_table_lock.rst
@@ -58,7 +58,7 @@ Support of split page table lock by an architecture
===================================================
There's no need in special enabling of PTE split page table lock: everything
-required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which
+required is done by pagetable_pte_ctor() and pagetable_pte_dtor(), which
must be called on PTE table allocation / freeing.
Make sure the architecture doesn't use slab allocator for page table
@@ -68,8 +68,8 @@ This field shares storage with page->ptl.
PMD split lock only makes sense if you have more than two page table
levels.
-PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table
-allocation and pgtable_pmd_page_dtor() on freeing.
+PMD split lock enabling requires pagetable_pmd_ctor() call on PMD table
+allocation and pagetable_pmd_dtor() on freeing.
Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
@@ -77,7 +77,7 @@ paths: i.e X86_PAE preallocate few PMDs on pgd_alloc().
With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
-NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
+NOTE: pagetable_pte_ctor() and pagetable_pmd_ctor() can fail -- it must
be handled properly.
page->ptl
@@ -97,7 +97,7 @@ trick:
split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs
one more cache line for indirect access;
-The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in
-pgtable_pmd_page_ctor() for PMD table.
+The spinlock_t allocated in pagetable_pte_ctor() for PTE table and in
+pagetable_pmd_ctor() for PMD table.
Please, never access page->ptl directly -- use appropriate helper.
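
For reference, the renamed constructor/destructor pair mentioned above is used
roughly as sketched below. This is a simplified illustration modeled on the
asm-generic pgalloc helpers that this series converts, not a verbatim copy of
them::

    /* Simplified sketch of a PTE table alloc/free pair using the
     * pagetable_* helpers; kernel context, not standalone code. */
    #include <asm-generic/pgalloc.h>

    pgtable_t pte_alloc_one(struct mm_struct *mm)
    {
        struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_USER, 0);

        if (!ptdesc)
            return NULL;
        if (!pagetable_pte_ctor(ptdesc)) {    /* the ctor can fail */
            pagetable_free(ptdesc);
            return NULL;
        }
        return ptdesc_page(ptdesc);
    }

    void pte_free(struct mm_struct *mm, pgtable_t pte_page)
    {
        struct ptdesc *ptdesc = page_ptdesc(pte_page);

        pagetable_pte_dtor(ptdesc);
        pagetable_free(ptdesc);
    }
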
diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst
index a4b12ff906c4..c573e08b5043 100644
--- a/Documentation/mm/vmemmap_dedup.rst
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -210,6 +210,7 @@ the device (altmap).
The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
+For powerpc equivalent details see Documentation/powerpc/vmemmap_dedup.rst
The differences with HugeTLB are relatively minor.
diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst
index d33b554ca7ba..a50834798454 100644
--- a/Documentation/powerpc/index.rst
+++ b/Documentation/powerpc/index.rst
@@ -36,6 +36,7 @@ powerpc
ultravisor
vas-api
vcpudispatch_stats
+ vmemmap_dedup
features
diff --git a/Documentation/powerpc/vmemmap_dedup.rst b/Documentation/powerpc/vmemmap_dedup.rst
new file mode 100644
index 000000000000..dc4db59fdf87
--- /dev/null
+++ b/Documentation/powerpc/vmemmap_dedup.rst
@@ -0,0 +1,101 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+Device DAX
+==========
+
+The device-dax interface uses the tail deduplication technique explained in
+Documentation/mm/vmemmap_dedup.rst
+
+On powerpc, vmemmap deduplication is only used with radix MMU translation. Also
+with a 64K page size, only the devdax namespace with 1G alignment uses vmemmap
+deduplication.
+
+With 2M PMD level mapping, we require 32 struct pages and a single 64K vmemmap
+page can contain 1024 struct pages (64K/sizeof(struct page)). Hence there is no
+vmemmap deduplication possible.
+
+With 1G PUD level mapping, we require 16384 struct pages and a single 64K
+vmemmap page can contain 1024 struct pages (64K/sizeof(struct page)). Hence we
+require 16 64K pages in vmemmap to map the struct page for 1G PUD level mapping.
+
+Here's how things look on device-dax after the sections are populated::
+
+ +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
+ | | | 0 | -------------> | 0 |
+ | | +-----------+ +-----------+
+ | | | 1 | -------------> | 1 |
+ | | +-----------+ +-----------+
+ | | | 2 | ----------------^ ^ ^ ^ ^ ^
+ | | +-----------+ | | | | |
+ | | | 3 | ------------------+ | | | |
+ | | +-----------+ | | | |
+ | | | 4 | --------------------+ | | |
+ | PUD | +-----------+ | | |
+ | level | | . | ----------------------+ | |
+ | mapping | +-----------+ | |
+ | | | . | ------------------------+ |
+ | | +-----------+ |
+ | | | 15 | --------------------------+
+ | | +-----------+
+ | |
+ | |
+ | |
+ +-----------+
+
+
+With 4K page size, 2M PMD level mapping requires 512 struct pages and a single
+4K vmemmap page contains 64 struct pages (4K/sizeof(struct page)). Hence we
+require 8 4K pages in vmemmap to map the struct page for 2M pmd level mapping.
+
+Here's how things look on device-dax after the sections are populated::
+
+ +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
+ | | | 0 | -------------> | 0 |
+ | | +-----------+ +-----------+
+ | | | 1 | -------------> | 1 |
+ | | +-----------+ +-----------+
+ | | | 2 | ----------------^ ^ ^ ^ ^ ^
+ | | +-----------+ | | | | |
+ | | | 3 | ------------------+ | | | |
+ | | +-----------+ | | | |
+ | | | 4 | --------------------+ | | |
+ | PMD | +-----------+ | | |
+ | level | | 5 | ----------------------+ | |
+ | mapping | +-----------+ | |
+ | | | 6 | ------------------------+ |
+ | | +-----------+ |
+ | | | 7 | --------------------------+
+ | | +-----------+
+ | |
+ | |
+ | |
+ +-----------+
+
+With 1G PUD level mapping, we require 262144 struct pages and a single 4K
+vmemmap page can contain 64 struct pages (4K/sizeof(struct page)). Hence we
+require 4096 4K pages in vmemmap to map the struct pages for 1G PUD level
+mapping.
+
+Here's how things look on device-dax after the sections are populated::
+
+ +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
+ | | | 0 | -------------> | 0 |
+ | | +-----------+ +-----------+
+ | | | 1 | -------------> | 1 |
+ | | +-----------+ +-----------+
+ | | | 2 | ----------------^ ^ ^ ^ ^ ^
+ | | +-----------+ | | | | |
+ | | | 3 | ------------------+ | | | |
+ | | +-----------+ | | | |
+ | | | 4 | --------------------+ | | |
+ | PUD | +-----------+ | | |
+ | level | | . | ----------------------+ | |
+ | mapping | +-----------+ | |
+ | | | . | ------------------------+ |
+ | | +-----------+ |
+ | | | 4095 | --------------------------+
+ | | +-----------+
+ | |
+ | |
+ | |
+ +-----------+
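
The vmemmap page counts quoted in this new document follow from simple
arithmetic; the quick check below uses the same assumptions the text does (64K
base pages and a 64-byte struct page)::

    /* Recompute the 64K-page vmemmap sizing quoted above. */
    #include <stdio.h>

    int main(void)
    {
        long base_page = 64 * 1024;     /* 64K base page size */
        long struct_page = 64;          /* assumed sizeof(struct page) */
        long pmd_size = 2L << 20;       /* 2M PMD mapping */
        long pud_size = 1L << 30;       /* 1G PUD mapping */

        long pmd_structs = pmd_size / base_page;    /* 32 */
        long pud_structs = pud_size / base_page;    /* 16384 */

        printf("2M: %ld struct pages, %ld vmemmap page(s)\n", pmd_structs,
               (pmd_structs * struct_page + base_page - 1) / base_page);
        printf("1G: %ld struct pages, %ld vmemmap page(s)\n", pud_structs,
               (pud_structs * struct_page + base_page - 1) / base_page);
        return 0;
    }
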
diff --git a/Documentation/translations/zh_CN/mm/frontswap.rst b/Documentation/translations/zh_CN/mm/frontswap.rst
deleted file mode 100644
index 434975390b48..000000000000
--- a/Documentation/translations/zh_CN/mm/frontswap.rst
+++ /dev/null
@@ -1,196 +0,0 @@
-:Original: Documentation/mm/frontswap.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-=========
-Frontswap
-=========
-
-Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中,由
-于交换页被保存在RAM(或类似RAM的设备)中,而不是交换磁盘,因此可以获得巨大的性能
-节省(提高)。
-
-.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
-
-Frontswap之所以这么命名,是因为它可以被认为是与swap设备的“back”存储相反。存
-储器被认为是一个同步并发安全的面向页面的“伪RAM设备”,符合transcendent memory
-(如Xen的“tmem”,或内核内压缩内存,又称“zcache”,或未来的类似RAM的设备)的要
-求;这个伪RAM设备不能被内核直接访问或寻址,其大小未知且可能随时间变化。驱动程序通过
-调用frontswap_register_ops将自己与frontswap链接起来,以适当地设置frontswap_ops
-的功能,它提供的功能必须符合某些策略,如下所示:
-
-一个 “init” 将设备准备好接收与指定的交换设备编号(又称“类型”)相关的frontswap
-交换页。一个 “store” 将把该页复制到transcendent memory,并与该页的类型和偏移
-量相关联。一个 “load” 将把该页,如果找到的话,从transcendent memory复制到内核
-内存,但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从
-transcendent memory中删除该页,一个 “invalidate_area” 将删除所有与交换类型
-相关的页(例如,像swapoff)并通知 “device” 拒绝进一步存储该交换类型。
-
-一旦一个页面被成功存储,在该页面上的匹配加载通常会成功。因此,当内核发现自己处于需
-要交换页面的情况时,它首先尝试使用frontswap。如果存储的结果是成功的,那么数据就已
-经成功的保存到了transcendent memory中,并且避免了磁盘写入,如果后来再读回数据,
-也避免了磁盘读取。如果存储返回失败,transcendent memory已经拒绝了该数据,且该页
-可以像往常一样被写入交换空间。
-
-请注意,如果一个页面被存储,而该页面已经存在于transcendent memory中(一个 “重复”
-的存储),要么存储成功,数据被覆盖,要么存储失败,该页面被废止。这确保了旧的数据永远
-不会从frontswap中获得。
-
-如果配置正确,对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的
-debugfs完成的。frontswap的有效性可以通过以下方式测量(在所有交换设备中):
-
-``failed_stores``
- 有多少次存储的尝试是失败的
-
-``loads``
- 尝试了多少次加载(应该全部成功)
-
-``succ_stores``
- 有多少次存储的尝试是成功的
-
-``invalidates``
- 尝试了多少次作废
-
-后台实现可以提供额外的指标。
-
-经常问到的问题
-==============
-
-* 价值在哪里?
-
-当一个工作负载开始交换时,性能就会下降。Frontswap通过提供一个干净的、动态的接口来
-读取和写入交换页到 “transcendent memory”,从而大大增加了许多这样的工作负载的性
-能,否则内核是无法直接寻址的。当数据被转换为不同的形式和大小(比如压缩)或者被秘密
-移动(对于一些类似RAM的设备来说,这可能对写平衡很有用)时,这个接口是理想的。交换
-页(和被驱逐的页面缓存页)是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。
-
-Frontswap对内核的影响相当小,为各种系统配置中更动态、更灵活的RAM利用提供了巨大的
-灵活性:
-
-在单一内核的情况下,又称“zcache”,页面被压缩并存储在本地内存中,从而增加了可以安
-全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利
-用率。Benchmarks测试显示,当内存压力较低时,几乎没有影响,而在高内存压力下的一些
-工作负载上,则有明显的性能改善(25%以上)。
-
-“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory
-的支持。Frontswap页面像zcache一样被本地压缩,但随后被“remotified” 到另一个系
-统的RAM。这使得RAM可以根据需要动态地来回负载平衡,也就是说,当系统A超载时,它可以
-交换到系统B,反之亦然。RAMster也可以被配置成一个内存服务器,因此集群中的许多服务器
-可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户
-有多少内存可用
-
-在虚拟情况下,虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复
-用。对于RAM来说,这真的很难做到,而且在不改变内核的情况下,要做好这一点的努力基本上
-是失败的(除了一些广为人知的特殊情况下的工作负载)。具体来说,Xen Transcendent Memory
-后端允许管理器拥有的RAM “fallow”,不仅可以在多个虚拟机之间进行“time-shared”,
-而且页面可以被压缩和重复利用,以优化RAM的利用率。当客户操作系统被诱导交出未充分利用
-的RAM时(如 “selfballooning”),突然出现的意外内存压力可能会导致交换;frontswap
-允许这些页面被交换到管理器RAM中或从管理器RAM中交换(如果整体主机系统内存条件允许),
-从而减轻计划外交换可能带来的可怕的性能影响。
-
-一个KVM的实现正在进行中,并且已经被RFC'ed到lkml。而且,利用frontswap,对NVM作为
-内存扩展技术的调查也在进行中。
-
-* 当然,在某些情况下可能有性能上的优势,但frontswap的空间/时间开销是多少?
-
-如果 CONFIG_FRONTSWAP 被禁用,每个 frontswap 钩子都会编译成空,唯一的开销是每
-个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用,但没有
-frontswap的 “backend” 寄存器,每读或写一个交换页就会有一个额外的全局变量,而不
-是零。如果 CONFIG_FRONTSWAP 被启用,并且有一个frontswap的backend寄存器,并且
-后端每次 “store” 请求都失败(即尽管声称可能,但没有提供内存),CPU 的开销仍然可以
-忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前,系统很可能是 I/O 绑定
-的,无论如何使用一小部分的 CPU 都是不相关的。
-
-至于空间,如果CONFIG_FRONTSWAP被启用,并且有一个frontswap的backend注册,那么
-每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换
-页分配的8位(在2.6.34之前是16位)上增加的。(Hugh Dickins观察到,frontswap可能
-会偷取现有的8个比特,但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的
-非常大的交换盘(这很罕见),这是每32GB交换盘1MB开销。
-
-当交换页存储在transcendent memory中而不是写到磁盘上时,有一个副作用,即这可能会
-产生更多的内存压力,有可能超过其他的优点。一个backend,比如zcache,必须实现策略
-来仔细(但动态地)管理内存限制,以确保这种情况不会发生。
-
-* 好吧,那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何?
-
-我们假设在内核初始化过程中,一个frontswap 的 “backend” 已经注册了;这个注册表
-明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提
-供了多少内存是完全动态和随机的。
-
-每当一个交换设备被交换时,就会调用frontswap_init(),把交换设备的编号(又称“类
-型”)作为一个参数传给它。这就通知了frontswap,以期待 “store” 与该号码相关的交
-换页的尝试。
-
-每当交换子系统准备将一个页面写入交换设备时(参见swap_writepage()),就会调用
-frontswap_store。Frontswap与frontswap backend协商,如果backend说它没有空
-间,frontswap_store返回-1,内核就会照常把页换到交换设备上。注意,来自frontswap
-backend的响应对内核来说是不可预测的;它可能选择从不接受一个页面,可能接受每九个
-页面,也可能接受每一个页面。但是如果backend确实接受了一个页面,那么这个页面的数
-据已经被复制并与类型和偏移量相关联了,而且backend保证了数据的持久性。在这种情况
-下,frontswap在交换设备的“frontswap_map” 中设置了一个位,对应于交换设备上的
-页面偏移量,否则它就会将数据写入该设备。
-
-当交换子系统需要交换一个页面时(swap_readpage()),它首先调用frontswap_load(),
-检查frontswap_map,看这个页面是否早先被frontswap backend接受。如果是,该页
-的数据就会从frontswap后端填充,换入就完成了。如果不是,正常的交换代码将被执行,
-以便从真正的交换设备上获得这一页的数据。
-
-所以每次frontswap backend接受一个页面时,交换设备的读取和(可能)交换设备的写
-入都被 “frontswap backend store” 和(可能)“frontswap backend loads”
-所取代,这可能会快得多。
-
-* frontswap不能被配置为一个 “特殊的” 交换设备,它的优先级要高于任何真正的交换
- 设备(例如像zswap,或者可能是swap-over-nbd/NFS)?
-
-首先,现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次
-结构,但这将需要相当大的改变。即使它被重写,现有的交换子系统也使用了块I/O层,它
-假定交换设备是固定大小的,其中的任何页面都是可线性寻址的。Frontswap几乎没有触
-及现有的交换子系统,而是围绕着块I/O子系统的限制,提供了大量的灵活性和动态性。
-
-例如,frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend
-的定义至关重要,因为它赋予了backend完全动态的决定权。在zcache中,人们无法预
-先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝,而 “差” 本身也可
-以根据当前的内存限制动态地定义。
-
-此外,frontswap是完全同步的,而真正的交换设备,根据定义,是异步的,并且使用
-块I/O。块I/O层不仅是不必要的,而且可能进行 “优化”,这对面向RAM的设备来说是
-不合适的,包括将一些页面的写入延迟相当长的时间。同步是必须的,以确保后端的动
-态性,并避免棘手的竞争条件,这将不必要地大大增加frontswap和/或块I/O子系统的
-复杂性。也就是说,只有最初的 “store” 和 “load” 操作是需要同步的。一个独立
-的异步线程可以自由地操作由frontswap存储的页面。例如,RAMster中的 “remotification”
-线程使用标准的异步内核套接字,将压缩的frontswap页面移动到远程机器。同样,
-KVM的客户方实现可以进行客户内压缩,并使用 “batched” hypercalls。
-
-在虚拟化环境中,动态性允许管理程序(或主机操作系统)做“intelligent overcommit”。
-例如,它可以选择只接受页面,直到主机交换可能即将发生,然后强迫客户机做他们
-自己的交换。
-
-transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可
-能失败,所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此,
-frontswap必须作为每个交换设备的 “影子” 来实现,它有可能容纳交换设备可能
-容纳的每一个页面,也有可能根本不容纳任何页面。这意味着frontswap不能包含比
-swap设备总数更多的页面。例如,如果在某些安装上没有配置交换设备,frontswap
-就没有用。无交换设备的便携式设备仍然可以使用frontswap,但是这种设备的
-backend必须配置某种 “ghost” 交换设备,并确保它永远不会被使用。
-
-
-* 为什么会有这种关于 “重复存储” 的奇怪定义?如果一个页面以前被成功地存储过,
- 难道它不能总是被成功地覆盖吗?
-
-几乎总是可以的,不,有时不能。考虑一个例子,数据被压缩了,原来的4K页面被压
-缩到了1K。现在,有人试图用不可压缩的数据覆盖该页,因此会占用整个4K。但是
-backend没有更多的空间了。在这种情况下,这个存储必须被拒绝。每当frontswap
-拒绝一个会覆盖的存储时,它也必须使旧的数据作废,并确保它不再被访问。因为交
-换子系统会把新的数据写到读交换设备上,这是确保一致性的正确做法。
-
-* 为什么frontswap补丁会创建新的头文件swapfile.h?
-
-frontswap代码依赖于一些swap子系统内部的数据结构,这些数据结构多年来一直
-在静态和全局之间来回移动。这似乎是一个合理的妥协:将它们定义为全局,但在一
-个新的包含文件中声明它们,该文件不被包含swap.h的大量源文件所包含。
-
-Dan Magenheimer,最后更新于2012年4月9日
diff --git a/Documentation/translations/zh_CN/mm/index.rst b/Documentation/translations/zh_CN/mm/index.rst
index 2f53e37b8049..b950dd118be7 100644
--- a/Documentation/translations/zh_CN/mm/index.rst
+++ b/Documentation/translations/zh_CN/mm/index.rst
@@ -42,7 +42,6 @@ Linux内存管理文档
damon/index
free_page_reporting
ksm
- frontswap
hmm
hwpoison
hugetlbfs_reserv
diff --git a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
index 4fb7aa666037..a2c288670a24 100644
--- a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
+++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
@@ -56,16 +56,16 @@ Hugetlb特定的辅助函数:
架构对分页表锁的支持
====================
-没有必要特别启用PTE分页表锁:所有需要的东西都由pgtable_pte_page_ctor()
-和pgtable_pte_page_dtor()完成,它们必须在PTE表分配/释放时被调用。
+没有必要特别启用PTE分页表锁:所有需要的东西都由pagetable_pte_ctor()
+和pagetable_pte_dtor()完成,它们必须在PTE表分配/释放时被调用。
确保架构不使用slab分配器来分配页表:slab使用page->slab_cache来分配其页
面。这个区域与page->ptl共享存储。
PMD分页锁只有在你有两个以上的页表级别时才有意义。
-启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor(),在释放时调
-用pgtable_pmd_page_dtor()。
+启用PMD分页锁需要在PMD表分配时调用pagetable_pmd_ctor(),在释放时调
+用pagetable_pmd_dtor()。
分配通常发生在pmd_alloc_one()中,释放发生在pmd_free()和pmd_free_tlb()
中,但要确保覆盖所有的PMD表分配/释放路径:即X86_PAE在pgd_alloc()中预先
@@ -73,7 +73,7 @@ PMD分页锁只有在你有两个以上的页表级别时才有意义。
一切就绪后,你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。
-注意:pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必
+注意:pagetable_pte_ctor()和pagetable_pmd_ctor()可能失败--必
须正确处理。
page->ptl
@@ -90,7 +90,7 @@ page->ptl用于访问分割页表锁,其中'page'是包含该表的页面struc
的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的
情况下使用分页锁,但由于间接访问而多花了一个缓存行。
-PTE表的spinlock_t分配在pgtable_pte_page_ctor()中,PMD表的spinlock_t
-分配在pgtable_pmd_page_ctor()中。
+PTE表的spinlock_t分配在pagetable_pte_ctor()中,PMD表的spinlock_t
+分配在pagetable_pmd_ctor()中。
请不要直接访问page->ptl - -使用适当的辅助函数。
diff --git a/MAINTAINERS b/MAINTAINERS
index 8355ec45452b..6849f4994787 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8404,13 +8404,6 @@ F: Documentation/power/freezing-of-tasks.rst
F: include/linux/freezer.h
F: kernel/freezer.c
-FRONTSWAP API
-M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
-L: linux-kernel@vger.kernel.org
-S: Maintained
-F: include/linux/frontswap.h
-F: mm/frontswap.c
-
FS-CACHE: LOCAL CACHING FOR NETWORK FILESYSTEMS
M: David Howells <dhowells@redhat.com>
L: linux-cachefs@redhat.com (moderated for non-subscribers)
@@ -14830,7 +14823,6 @@ NETWORKING [TCP]
M: Eric Dumazet <edumazet@google.com>
L: netdev@vger.kernel.org
S: Maintained
-F: include/linux/net_mm.h
F: include/linux/tcp.h
F: include/net/tcp.h
F: include/trace/events/tcp.h
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 96cf8720bb93..6f4995ad9873 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -26,6 +26,7 @@ config ARC
select GENERIC_PENDING_IRQ if SMP
select GENERIC_SCHED_CLOCK
select GENERIC_SMP_IDLE_THREAD
+ select GENERIC_IOREMAP
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARC_MMU_V4
diff --git a/arch/arc/include/asm/io.h b/arch/arc/include/asm/io.h
index 80347382a380..4fdb7350636c 100644
--- a/arch/arc/include/asm/io.h
+++ b/arch/arc/include/asm/io.h
@@ -21,8 +21,9 @@
#endif
extern void __iomem *ioremap(phys_addr_t paddr, unsigned long size);
-extern void __iomem *ioremap_prot(phys_addr_t paddr, unsigned long size,
- unsigned long flags);
+#define ioremap ioremap
+#define ioremap_prot ioremap_prot
+#define iounmap iounmap
static inline void __iomem *ioport_map(unsigned long port, unsigned int nr)
{
return (void __iomem *)port;
@@ -32,8 +33,6 @@ static inline void ioport_unmap(void __iomem *addr)
{
}
-extern void iounmap(const volatile void __iomem *addr);
-
/*
* io{read,write}{16,32}be() macros
*/
diff --git a/arch/arc/mm/ioremap.c b/arch/arc/mm/ioremap.c
index 712c2311daef..b07004d53267 100644
--- a/arch/arc/mm/ioremap.c
+++ b/arch/arc/mm/ioremap.c
@@ -8,7 +8,6 @@
#include <linux/module.h>
#include <linux/io.h>
#include <linux/mm.h>
-#include <linux/slab.h>
#include <linux/cache.h>
static inline bool arc_uncached_addr_space(phys_addr_t paddr)
@@ -25,13 +24,6 @@ static inline bool arc_uncached_addr_space(phys_addr_t paddr)
void __iomem *ioremap(phys_addr_t paddr, unsigned long size)
{
- phys_addr_t end;
-
- /* Don't allow wraparound or zero size */
- end = paddr + size - 1;
- if (!size || (end < paddr))
- return NULL;
-
/*
* If the region is h/w uncached, MMU mapping can be elided as optim
* The cast to u32 is fine as this region can only be inside 4GB
@@ -51,55 +43,22 @@ EXPORT_SYMBOL(ioremap);
* ARC hardware uncached region, this one still goes thru the MMU as caller
* might need finer access control (R/W/X)
*/
-void __iomem *ioremap_prot(phys_addr_t paddr, unsigned long size,
+void __iomem *ioremap_prot(phys_addr_t paddr, size_t size,
unsigned long flags)
{
- unsigned int off;
- unsigned long vaddr;
- struct vm_struct *area;
- phys_addr_t end;
pgprot_t prot = __pgprot(flags);
- /* Don't allow wraparound, zero size */
- end = paddr + size - 1;
- if ((!size) || (end < paddr))
- return NULL;
-
- /* An early platform driver might end up here */
- if (!slab_is_available())
- return NULL;
-
/* force uncached */
- prot = pgprot_noncached(prot);
-
- /* Mappings have to be page-aligned */
- off = paddr & ~PAGE_MASK;
- paddr &= PAGE_MASK_PHYS;
- size = PAGE_ALIGN(end + 1) - paddr;
-
- /*
- * Ok, go for it..
- */
- area = get_vm_area(size, VM_IOREMAP);
- if (!area)
- return NULL;
- area->phys_addr = paddr;
- vaddr = (unsigned long)area->addr;
- if (ioremap_page_range(vaddr, vaddr + size, paddr, prot)) {
- vunmap((void __force *)vaddr);
- return NULL;
- }
- return (void __iomem *)(off + (char __iomem *)vaddr);
+ return generic_ioremap_prot(paddr, size, pgprot_noncached(prot));
}
EXPORT_SYMBOL(ioremap_prot);
-
-void iounmap(const volatile void __iomem *addr)
+void iounmap(volatile void __iomem *addr)
{
/* weird double cast to handle phys_addr_t > 32 bits */
if (arc_uncached_addr_space((phys_addr_t)(u32)addr))
return;
- vfree((void *)(PAGE_MASK & (unsigned long __force)addr));
+ generic_iounmap(addr);
}
EXPORT_SYMBOL(iounmap);
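
Several conversions in this series have the same shape as the ARC change above:
the architecture keeps thin ioremap()/ioremap_prot()/iounmap() wrappers and
delegates the vm_area bookkeeping to generic_ioremap_prot()/generic_iounmap().
Nothing changes for callers; the sketch below is editorial (the function and the
register layout are hypothetical) and only illustrates the unchanged driver-side
usage:

	#include <linux/errno.h>
	#include <linux/io.h>
	#include <linux/printk.h>

	/* Hypothetical driver helper: map a register block and read one register. */
	static int example_probe_regs(phys_addr_t base, size_t len)
	{
		void __iomem *regs;

		regs = ioremap(base, len);	/* still an uncached mapping on ARC */
		if (!regs)
			return -ENOMEM;

		pr_info("id reg: %#x\n", readl(regs));	/* assumed ID register at offset 0 */

		iounmap(regs);
		return 0;
	}
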
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index d02d6ca88e92..a3a82b7158d4 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -10,6 +10,7 @@
#ifndef _ASM_ARM_HUGETLB_H
#define _ASM_ARM_HUGETLB_H
+#include <asm/cacheflush.h>
#include <asm/page.h>
#include <asm/hugetlb-3level.h>
#include <asm-generic/hugetlb.h>
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index b8cbe03ad260..f40d06ad5d2a 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -39,7 +39,9 @@ static inline void __tlb_remove_table(void *_table)
static inline void
__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
{
- pgtable_pte_page_dtor(pte);
+ struct ptdesc *ptdesc = page_ptdesc(pte);
+
+ pagetable_pte_dtor(ptdesc);
#ifndef CONFIG_ARM_LPAE
/*
@@ -50,17 +52,17 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
__tlb_adjust_range(tlb, addr - PAGE_SIZE, 2 * PAGE_SIZE);
#endif
- tlb_remove_table(tlb, pte);
+ tlb_remove_ptdesc(tlb, ptdesc);
}
static inline void
__pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr)
{
#ifdef CONFIG_ARM_LPAE
- struct page *page = virt_to_page(pmdp);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);
- pgtable_pmd_page_dtor(page);
- tlb_remove_table(tlb, page);
+ pagetable_pmd_dtor(ptdesc);
+ tlb_remove_ptdesc(tlb, ptdesc);
#endif
}
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index ca5302b0b7ee..7cb125497976 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -117,11 +117,10 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
* must use the nested version. This also means we need to
* open-code the spin-locking.
*/
- pte = pte_offset_map(pmd, address);
+ pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl);
if (!pte)
return 0;
- ptl = pte_lockptr(vma->vm_mm, pmd);
do_pte_lock(ptl);
ret = do_adjust_pte(vma, address, pfn, pte);
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 13fc4bb5f792..fdeaee30d167 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -737,11 +737,12 @@ static void __init *early_alloc(unsigned long sz)
static void *__init late_alloc(unsigned long sz)
{
- void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz));
+ void *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM,
+ get_order(sz));
- if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr)))
+ if (!ptdesc || !pagetable_pte_ctor(ptdesc))
BUG();
- return ptr;
+ return ptdesc_to_virt(ptdesc);
}
static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr,
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2511b30d0f6..a7a88322bf46 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -78,6 +78,7 @@ config ARM64
select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION
select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_KEEP_MEMBLOCK
+ select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_GNU_PROPERTY
select ARCH_USE_MEMTEST
@@ -96,6 +97,7 @@ config ARM64
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_SUPPORTS_PAGE_TABLE_CHECK
select ARCH_SUPPORTS_PER_VMA_LOCK
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
@@ -348,9 +350,6 @@ config GENERIC_CSUM
config GENERIC_CALIBRATE_DELAY
def_bool y
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
- def_bool y
-
config SMP
def_bool y
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 6a4a1ab8eb23..f43a38ac1779 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -10,6 +10,7 @@
#ifndef __ASM_HUGETLB_H
#define __ASM_HUGETLB_H
+#include <asm/cacheflush.h>
#include <asm/page.h>
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
@@ -60,4 +61,19 @@ extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
#include <asm-generic/hugetlb.h>
+#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
+static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long stride = huge_page_size(hstate_vma(vma));
+
+ if (stride == PMD_SIZE)
+ __flush_tlb_range(vma, start, end, stride, false, 2);
+ else if (stride == PUD_SIZE)
+ __flush_tlb_range(vma, start, end, stride, false, 1);
+ else
+ __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0);
+}
+
#endif /* __ASM_HUGETLB_H */
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 51d92abf945e..3b694511b98f 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -139,8 +139,7 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
* I/O memory mapping functions.
*/
-bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot);
-#define ioremap_allowed ioremap_allowed
+#define ioremap_prot ioremap_prot
#define _PAGE_IOREMAP PROT_DEVICE_nGnRE
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index c028afb1cd0b..4cedbaa16f41 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -90,7 +90,7 @@ static inline bool try_page_mte_tagging(struct page *page)
}
void mte_zero_clear_page_tags(void *addr);
-void mte_sync_tags(pte_t old_pte, pte_t pte);
+void mte_sync_tags(pte_t pte);
void mte_copy_page_tags(void *kto, const void *kfrom);
void mte_thread_init_user(void);
void mte_thread_switch(struct task_struct *next);
@@ -122,7 +122,7 @@ static inline bool try_page_mte_tagging(struct page *page)
static inline void mte_zero_clear_page_tags(void *addr)
{
}
-static inline void mte_sync_tags(pte_t old_pte, pte_t pte)
+static inline void mte_sync_tags(pte_t pte)
{
}
static inline void mte_copy_page_tags(void *kto, const void *kfrom)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0bd18de9fd97..fe4b913589ee 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -337,18 +337,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
* don't expose tags (instruction fetches don't check tags).
*/
if (system_supports_mte() && pte_access_permitted(pte, false) &&
- !pte_special(pte)) {
- pte_t old_pte = READ_ONCE(*ptep);
- /*
- * We only need to synchronise if the new PTE has tags enabled
- * or if swapping in (in which case another mapping may have
- * set tags in the past even if this PTE isn't tagged).
- * (!pte_none() && !pte_present()) is an open coded version of
- * is_swap_pte()
- */
- if (pte_tagged(pte) || (!pte_none(old_pte) && !pte_present(old_pte)))
- mte_sync_tags(old_pte, pte);
- }
+ !pte_special(pte) && pte_tagged(pte))
+ mte_sync_tags(pte);
__check_safe_pte_update(mm, ptep, pte);
@@ -358,7 +348,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
- page_table_check_pte_set(mm, addr, ptep, pte);
+ page_table_check_pte_set(mm, ptep, pte);
return __set_pte_at(mm, addr, ptep, pte);
}
@@ -534,14 +524,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
- page_table_check_pmd_set(mm, addr, pmdp, pmd);
+ page_table_check_pmd_set(mm, pmdp, pmd);
return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd));
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
- page_table_check_pud_set(mm, addr, pudp, pud);
+ page_table_check_pud_set(mm, pudp, pud);
return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud));
}
@@ -938,7 +928,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
{
pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0));
- page_table_check_pte_clear(mm, address, pte);
+ page_table_check_pte_clear(mm, pte);
return pte;
}
@@ -950,7 +940,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
{
pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0));
- page_table_check_pmd_clear(mm, address, pmd);
+ page_table_check_pmd_clear(mm, pmd);
return pmd;
}
@@ -986,7 +976,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
- page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
+ page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);
return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd)));
}
#endif
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index c995d1f4594f..2c29239d05c3 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -75,18 +75,20 @@ static inline void tlb_flush(struct mmu_gather *tlb)
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
unsigned long addr)
{
- pgtable_pte_page_dtor(pte);
- tlb_remove_table(tlb, pte);
+ struct ptdesc *ptdesc = page_ptdesc(pte);
+
+ pagetable_pte_dtor(ptdesc);
+ tlb_remove_ptdesc(tlb, ptdesc);
}
#if CONFIG_PGTABLE_LEVELS > 2
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
unsigned long addr)
{
- struct page *page = virt_to_page(pmdp);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);
- pgtable_pmd_page_dtor(page);
- tlb_remove_table(tlb, page);
+ pagetable_pmd_dtor(ptdesc);
+ tlb_remove_ptdesc(tlb, ptdesc);
}
#endif
@@ -94,7 +96,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
unsigned long addr)
{
- tlb_remove_table(tlb, virt_to_page(pudp));
+ tlb_remove_ptdesc(tlb, virt_to_ptdesc(pudp));
}
#endif
diff --git a/arch/arm64/include/asm/tlbbatch.h b/arch/arm64/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..fedb0b87b8db
--- /dev/null
+++ b/arch/arm64/include/asm/tlbbatch.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_ARM64_TLBBATCH_H
+#define _ARCH_ARM64_TLBBATCH_H
+
+struct arch_tlbflush_unmap_batch {
+ /*
+ * For arm64, HW can do tlb shootdown, so we don't
+ * need to record cpumask for sending IPI
+ */
+};
+
+#endif /* _ARCH_ARM64_TLBBATCH_H */
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 412a3b9a3c25..55b50e1d4a84 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -13,6 +13,7 @@
#include <linux/bitfield.h>
#include <linux/mm_types.h>
#include <linux/sched.h>
+#include <linux/mmu_notifier.h>
#include <asm/cputype.h>
#include <asm/mmu.h>
@@ -252,17 +253,26 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
__tlbi(aside1is, asid);
__tlbi_user(aside1is, asid);
dsb(ish);
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
-static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
- unsigned long uaddr)
+static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
+ unsigned long uaddr)
{
unsigned long addr;
dsb(ishst);
- addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm));
+ addr = __TLBI_VADDR(uaddr, ASID(mm));
__tlbi(vale1is, addr);
__tlbi_user(vale1is, addr);
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK,
+ (uaddr & PAGE_MASK) + PAGE_SIZE);
+}
+
+static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
+ unsigned long uaddr)
+{
+ return __flush_tlb_page_nosync(vma->vm_mm, uaddr);
}
static inline void flush_tlb_page(struct vm_area_struct *vma,
@@ -272,6 +282,53 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
dsb(ish);
}
+static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
+{
+#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
+ /*
+ * TLB flush deferral is not required on systems affected by
+ * ARM64_WORKAROUND_REPEAT_TLBI, as the __tlbi()/__tlbi_user() implementation
+ * will issue two consecutive TLBI instructions with a dsb(ish) in between,
+ * defeating the purpose (i.e. saving the overall 'dsb ish' cost).
+ */
+ if (unlikely(cpus_have_const_cap(ARM64_WORKAROUND_REPEAT_TLBI)))
+ return false;
+#endif
+ return true;
+}
+
+static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
+{
+ __flush_tlb_page_nosync(mm, uaddr);
+}
+
+/*
+ * If mprotect/munmap/etc occurs during TLB batched flushing, we need to
+ * synchronise all the TLBI issued with a DSB to avoid the race mentioned in
+ * flush_tlb_batched_pending().
+ */
+static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
+{
+ dsb(ish);
+}
+
+/*
+ * To support batched TLB flushing when unmapping multiple pages, we only send
+ * the TLBI for each page in arch_tlbbatch_add_pending() and wait for
+ * completion at the end in arch_tlbbatch_flush(). Since a TLBI has already
+ * been issued for each page, only a DSB is needed to synchronise its effect
+ * on the other CPUs.
+ *
+ * This saves the time spent waiting on the DSB, compared with issuing a
+ * TLBI;DSB sequence for each page.
+ */
+static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+ dsb(ish);
+}
+
/*
* This is meant to avoid soft lock-ups on large TLB flushing ranges and not
* necessarily a performance improvement.
@@ -358,6 +415,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
scale++;
}
dsb(ish);
+ mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
}
static inline void flush_tlb_range(struct vm_area_struct *vma,
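
For context, the hooks added to this header are meant to be driven as a batch:
arch_tlbbatch_add_pending() issues one nosync TLBI per page while pages are
unmapped, and arch_tlbbatch_flush() pays for a single dsb(ish) once the whole
batch is done. The sketch below is an editorial paraphrase of that call pattern
(the function and its parameters are hypothetical, not the core mm code):

	/* Editorial sketch of the batched-unmap call pattern. */
	static void example_flush_batch(struct arch_tlbflush_unmap_batch *batch,
					struct mm_struct *mm,
					unsigned long *uaddrs, int nr)
	{
		int i;

		/* The core code only batches when the arch says deferral helps. */
		if (!arch_tlbbatch_should_defer(mm))
			return;

		for (i = 0; i < nr; i++)
			arch_tlbbatch_add_pending(batch, mm, uaddrs[i]); /* TLBI, no DSB yet */

		arch_tlbbatch_flush(batch);	/* one dsb(ish) covers every queued TLBI */
	}
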
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 4c5ef9b20065..4edecaac8f91 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -35,41 +35,18 @@ DEFINE_STATIC_KEY_FALSE(mte_async_or_asymm_mode);
EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode);
#endif
-static void mte_sync_page_tags(struct page *page, pte_t old_pte,
- bool check_swap, bool pte_is_tagged)
-{
- if (check_swap && is_swap_pte(old_pte)) {
- swp_entry_t entry = pte_to_swp_entry(old_pte);
-
- if (!non_swap_entry(entry))
- mte_restore_tags(entry, page);
- }
-
- if (!pte_is_tagged)
- return;
-
- if (try_page_mte_tagging(page)) {
- mte_clear_page_tags(page_address(page));
- set_page_mte_tagged(page);
- }
-}
-
-void mte_sync_tags(pte_t old_pte, pte_t pte)
+void mte_sync_tags(pte_t pte)
{
struct page *page = pte_page(pte);
long i, nr_pages = compound_nr(page);
- bool check_swap = nr_pages == 1;
- bool pte_is_tagged = pte_tagged(pte);
-
- /* Early out if there's nothing to do */
- if (!check_swap && !pte_is_tagged)
- return;
/* if PG_mte_tagged is set, tags have already been initialised */
- for (i = 0; i < nr_pages; i++, page++)
- if (!page_mte_tagged(page))
- mte_sync_page_tags(page, old_pte, check_swap,
- pte_is_tagged);
+ for (i = 0; i < nr_pages; i++, page++) {
+ if (try_page_mte_tagging(page)) {
+ mte_clear_page_tags(page_address(page));
+ set_page_mte_tagged(page);
+ }
+ }
/* ensure the tags are visible before the PTE is set */
smp_wmb();
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 3fe516b32577..103fcbdc6552 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -587,7 +587,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
-#ifdef CONFIG_PER_VMA_LOCK
if (!(mm_flags & FAULT_FLAG_USER))
goto lock_mmap;
@@ -615,7 +614,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
return 0;
}
lock_mmap:
-#endif /* CONFIG_PER_VMA_LOCK */
retry:
vma = lock_mm_and_find_vma(mm, addr, regs);
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index c5af103d4ad4..269f2f63ab7d 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -3,20 +3,22 @@
#include <linux/mm.h>
#include <linux/io.h>
-bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
{
unsigned long last_addr = phys_addr + size - 1;
/* Don't allow outside PHYS_MASK */
if (last_addr & ~PHYS_MASK)
- return false;
+ return NULL;
/* Don't allow RAM to be mapped. */
if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr))))
- return false;
+ return NULL;
- return true;
+ return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
}
+EXPORT_SYMBOL(ioremap_prot);
/*
* Must be called after early_fixmap_init
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 95d360805f8a..47781bec6171 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -426,6 +426,7 @@ static phys_addr_t __pgd_pgtable_alloc(int shift)
static phys_addr_t pgd_pgtable_alloc(int shift)
{
phys_addr_t pa = __pgd_pgtable_alloc(shift);
+ struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
/*
* Call proper page table ctor in case later we need to
@@ -433,12 +434,12 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
* this pre-allocated page table.
*
* We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
- * folded, and if so pgtable_pmd_page_ctor() becomes nop.
+ * folded, and if so pagetable_pmd_ctor() becomes nop.
*/
if (shift == PAGE_SHIFT)
- BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa)));
+ BUG_ON(!pagetable_pte_ctor(ptdesc));
else if (shift == PMD_SHIFT)
- BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
+ BUG_ON(!pagetable_pmd_ctor(ptdesc));
return pa;
}
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index 7d57e5da0914..9c84c9012e53 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -63,8 +63,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
#define __pte_free_tlb(tlb, pte, address) \
do { \
- pgtable_pte_page_dtor(pte); \
- tlb_remove_page(tlb, pte); \
+ pagetable_pte_dtor(page_ptdesc(pte)); \
+ tlb_remove_page_ptdesc(tlb, page_ptdesc(pte)); \
} while (0)
extern void pagetable_init(void);
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 6726f4941015..a880ee067d2e 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -25,6 +25,7 @@ config HEXAGON
select NEED_SG_DMA_LENGTH
select NO_IOPORT_MAP
select GENERIC_IOMAP
+ select GENERIC_IOREMAP
select GENERIC_SMP_IDLE_THREAD
select STACKTRACE_SUPPORT
select GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/hexagon/include/asm/io.h b/arch/hexagon/include/asm/io.h
index 46a099de85b7..e2b308e32a37 100644
--- a/arch/hexagon/include/asm/io.h
+++ b/arch/hexagon/include/asm/io.h
@@ -27,8 +27,6 @@
extern int remap_area_pages(unsigned long start, unsigned long phys_addr,
unsigned long end, unsigned long flags);
-extern void iounmap(const volatile void __iomem *addr);
-
/* Defined in lib/io.c, needed for smc91x driver. */
extern void __raw_readsw(const void __iomem *addr, void *data, int wordlen);
extern void __raw_writesw(void __iomem *addr, const void *data, int wordlen);
@@ -170,8 +168,13 @@ static inline void writel(u32 data, volatile void __iomem *addr)
#define writew_relaxed __raw_writew
#define writel_relaxed __raw_writel
-void __iomem *ioremap(unsigned long phys_addr, unsigned long size);
-#define ioremap_uc(X, Y) ioremap((X), (Y))
+/*
+ * I/O memory mapping functions.
+ */
+#define _PAGE_IOREMAP (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
+ (__HEXAGON_C_DEV << 6))
+
+#define ioremap_uc(addr, size) ioremap((addr), (size))
#define __raw_writel writel
diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h
index f0c47e6a7427..55988625e6fb 100644
--- a/arch/hexagon/include/asm/pgalloc.h
+++ b/arch/hexagon/include/asm/pgalloc.h
@@ -87,10 +87,10 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
max_kernel_seg = pmdindex;
}
-#define __pte_free_tlb(tlb, pte, addr) \
-do { \
- pgtable_pte_page_dtor((pte)); \
- tlb_remove_page((tlb), (pte)); \
+#define __pte_free_tlb(tlb, pte, addr) \
+do { \
+ pagetable_pte_dtor((page_ptdesc(pte))); \
+ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
} while (0)
#endif
diff --git a/arch/hexagon/kernel/hexagon_ksyms.c b/arch/hexagon/kernel/hexagon_ksyms.c
index ec56ce2d92a2..36a80e31d187 100644
--- a/arch/hexagon/kernel/hexagon_ksyms.c
+++ b/arch/hexagon/kernel/hexagon_ksyms.c
@@ -14,12 +14,10 @@
EXPORT_SYMBOL(__clear_user_hexagon);
EXPORT_SYMBOL(raw_copy_from_user);
EXPORT_SYMBOL(raw_copy_to_user);
-EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__vmgetie);
EXPORT_SYMBOL(__vmsetie);
EXPORT_SYMBOL(__vmyield);
EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(ioremap);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memset);
diff --git a/arch/hexagon/mm/Makefile b/arch/hexagon/mm/Makefile
index 49911a906fd0..ba4b04d962d6 100644
--- a/arch/hexagon/mm/Makefile
+++ b/arch/hexagon/mm/Makefile
@@ -3,5 +3,5 @@
# Makefile for Hexagon memory management subsystem
#
-obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o
+obj-y := init.o uaccess.o vm_fault.o cache.o
obj-y += copy_to_user.o copy_from_user.o vm_tlb.o
diff --git a/arch/hexagon/mm/ioremap.c b/arch/hexagon/mm/ioremap.c
deleted file mode 100644
index 255c5b1ee1a7..000000000000
--- a/arch/hexagon/mm/ioremap.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * I/O remap functions for Hexagon
- *
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
- */
-
-#include <linux/io.h>
-#include <linux/vmalloc.h>
-#include <linux/mm.h>
-
-void __iomem *ioremap(unsigned long phys_addr, unsigned long size)
-{
- unsigned long last_addr, addr;
- unsigned long offset = phys_addr & ~PAGE_MASK;
- struct vm_struct *area;
-
- pgprot_t prot = __pgprot(_PAGE_PRESENT|_PAGE_READ|_PAGE_WRITE
- |(__HEXAGON_C_DEV << 6));
-
- last_addr = phys_addr + size - 1;
-
- /* Wrapping not allowed */
- if (!size || (last_addr < phys_addr))
- return NULL;
-
- /* Rounds up to next page size, including whole-page offset */
- size = PAGE_ALIGN(offset + size);
-
- area = get_vm_area(size, VM_IOREMAP);
- addr = (unsigned long)area->addr;
-
- if (ioremap_page_range(addr, addr+size, phys_addr, prot)) {
- vunmap((void *)addr);
- return NULL;
- }
-
- return (void __iomem *) (offset + addr);
-}
-
-void iounmap(const volatile void __iomem *addr)
-{
- vunmap((void *) ((unsigned long) addr & PAGE_MASK));
-}
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 2cd93e6bf0fe..3ab75f36c037 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -47,6 +47,7 @@ config IA64
select GENERIC_IRQ_LEGACY
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_IOMAP
+ select GENERIC_IOREMAP
select GENERIC_SMP_IDLE_THREAD
select ARCH_TASK_STRUCT_ON_STACK
select ARCH_TASK_STRUCT_ALLOCATOR
diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h
index 83a492c8d298..eedc0afa8cad 100644
--- a/arch/ia64/include/asm/io.h
+++ b/arch/ia64/include/asm/io.h
@@ -243,15 +243,12 @@ static inline void outsl(unsigned long port, const void *src,
# ifdef __KERNEL__
-extern void __iomem * ioremap(unsigned long offset, unsigned long size);
+#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL)
+
extern void __iomem * ioremap_uc(unsigned long offset, unsigned long size);
-extern void iounmap (volatile void __iomem *addr);
-static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned long size)
-{
- return ioremap(phys_addr, size);
-}
-#define ioremap ioremap
-#define ioremap_cache ioremap_cache
+
+#define ioremap_prot ioremap_prot
+#define ioremap_cache ioremap
#define ioremap_uc ioremap_uc
#define iounmap iounmap
diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c
index 92b81bc91397..711b6abc822e 100644
--- a/arch/ia64/mm/ioremap.c
+++ b/arch/ia64/mm/ioremap.c
@@ -29,13 +29,9 @@ early_ioremap (unsigned long phys_addr, unsigned long size)
return __ioremap_uc(phys_addr);
}
-void __iomem *
-ioremap (unsigned long phys_addr, unsigned long size)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long flags)
{
- void __iomem *addr;
- struct vm_struct *area;
- unsigned long offset;
- pgprot_t prot;
u64 attr;
unsigned long gran_base, gran_size;
unsigned long page_base;
@@ -68,36 +64,12 @@ ioremap (unsigned long phys_addr, unsigned long size)
*/
page_base = phys_addr & PAGE_MASK;
size = PAGE_ALIGN(phys_addr + size) - page_base;
- if (efi_mem_attribute(page_base, size) & EFI_MEMORY_WB) {
- prot = PAGE_KERNEL;
-
- /*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
-
- /*
- * Ok, go for it..
- */
- area = get_vm_area(size, VM_IOREMAP);
- if (!area)
- return NULL;
-
- area->phys_addr = phys_addr;
- addr = (void __iomem *) area->addr;
- if (ioremap_page_range((unsigned long) addr,
- (unsigned long) addr + size, phys_addr, prot)) {
- vunmap((void __force *) addr);
- return NULL;
- }
-
- return (void __iomem *) (offset + (char __iomem *)addr);
- }
+ if (efi_mem_attribute(page_base, size) & EFI_MEMORY_WB)
+ return generic_ioremap_prot(phys_addr, size, __pgprot(flags));
return __ioremap_uc(phys_addr);
}
-EXPORT_SYMBOL(ioremap);
+EXPORT_SYMBOL(ioremap_prot);
void __iomem *
ioremap_uc(unsigned long phys_addr, unsigned long size)
@@ -114,8 +86,7 @@ early_iounmap (volatile void __iomem *addr, unsigned long size)
{
}
-void
-iounmap (volatile void __iomem *addr)
+void iounmap(volatile void __iomem *addr)
{
if (REGION_NUMBER(addr) == RGN_GATE)
vunmap((void *) ((unsigned long) addr & PAGE_MASK));
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index e71d5bf2cee0..dc56ddf9ba03 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -60,7 +60,7 @@ config LOONGARCH
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select ARCH_WANT_LD_ORPHAN_WARN
- select ARCH_WANT_OPTIMIZE_VMEMMAP
+ select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select ARCH_WANTS_NO_INSTR
select BUILDTIME_TABLE_SORT
select COMMON_CLK
diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h
index 1c9410220040..0dcb36b32cb2 100644
--- a/arch/loongarch/include/asm/io.h
+++ b/arch/loongarch/include/asm/io.h
@@ -5,8 +5,6 @@
#ifndef _ASM_IO_H
#define _ASM_IO_H
-#define ARCH_HAS_IOREMAP_WC
-
#include <linux/kernel.h>
#include <linux/types.h>
diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h
index af1d1e4a6965..23f5b1107246 100644
--- a/arch/loongarch/include/asm/pgalloc.h
+++ b/arch/loongarch/include/asm/pgalloc.h
@@ -45,9 +45,9 @@ extern void pagetable_init(void);
extern pgd_t *pgd_alloc(struct mm_struct *mm);
#define __pte_free_tlb(tlb, pte, address) \
-do { \
- pgtable_pte_page_dtor(pte); \
- tlb_remove_page((tlb), pte); \
+do { \
+ pagetable_pte_dtor(page_ptdesc(pte)); \
+ tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
} while (0)
#ifndef __PAGETABLE_PMD_FOLDED
@@ -55,18 +55,18 @@ do { \
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
{
pmd_t *pmd;
- struct page *pg;
+ struct ptdesc *ptdesc;
- pg = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!pg)
+ ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pmd_page_ctor(pg)) {
- __free_page(pg);
+ if (!pagetable_pmd_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- pmd = (pmd_t *)page_address(pg);
+ pmd = ptdesc_address(ptdesc);
pmd_init(pmd);
return pmd;
}
@@ -80,10 +80,13 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
{
pud_t *pud;
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
- pud = (pud_t *) __get_free_page(GFP_KERNEL);
- if (pud)
- pud_init(pud);
+ if (!ptdesc)
+ return NULL;
+ pud = ptdesc_address(ptdesc);
+
+ pud_init(pud);
return pud;
}
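
The LoongArch hunks above, like the csky, m68k, MIPS, nios2 and OpenRISC ones
elsewhere in this series, all follow the same ptdesc conversion:
pagetable_alloc()/pagetable_free() replace the bare page allocators, and
pagetable_pte_ctor()/pagetable_pmd_ctor() replace the old pgtable_*_page_ctor()
helpers. A composite, editorial sketch of the allocation side (not a hunk from
the series; the GFP flags are illustrative):

	static pgtable_t example_pte_alloc_one(struct mm_struct *mm)
	{
		struct ptdesc *ptdesc;

		ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 0);
		if (!ptdesc)
			return NULL;

		/* split-ptlock and page-table accounting setup */
		if (!pagetable_pte_ctor(ptdesc)) {
			pagetable_free(ptdesc);
			return NULL;
		}

		/* callers still see a pgtable_t, i.e. a struct page pointer here */
		return ptdesc_page(ptdesc);
	}
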
diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c
index 36a6dc0148ae..5bd102b51f7c 100644
--- a/arch/loongarch/mm/pgtable.c
+++ b/arch/loongarch/mm/pgtable.c
@@ -11,10 +11,11 @@
pgd_t *pgd_alloc(struct mm_struct *mm)
{
- pgd_t *ret, *init;
+ pgd_t *init, *ret = NULL;
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
- ret = (pgd_t *) __get_free_page(GFP_KERNEL);
- if (ret) {
+ if (ptdesc) {
+ ret = (pgd_t *)ptdesc_address(ptdesc);
init = pgd_offset(&init_mm, 0UL);
pgd_init(ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
diff --git a/arch/m68k/include/asm/io_mm.h b/arch/m68k/include/asm/io_mm.h
index d41fa488453b..6a0abd4846c6 100644
--- a/arch/m68k/include/asm/io_mm.h
+++ b/arch/m68k/include/asm/io_mm.h
@@ -26,8 +26,6 @@
#include <asm/virtconvert.h>
#include <asm/kmap.h>
-#include <asm-generic/iomap.h>
-
#ifdef CONFIG_ATARI
#define atari_readb raw_inb
#define atari_writeb raw_outb
diff --git a/arch/m68k/include/asm/kmap.h b/arch/m68k/include/asm/kmap.h
index dec05743d426..4efb3efa593a 100644
--- a/arch/m68k/include/asm/kmap.h
+++ b/arch/m68k/include/asm/kmap.h
@@ -4,8 +4,6 @@
#ifdef CONFIG_MMU
-#define ARCH_HAS_IOREMAP_WT
-
/* Values for nocacheflag and cmode */
#define IOMAP_FULL_CACHING 0
#define IOMAP_NOCACHE_SER 1
diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h
index 5c2c0a864524..302c5bf67179 100644
--- a/arch/m68k/include/asm/mcf_pgalloc.h
+++ b/arch/m68k/include/asm/mcf_pgalloc.h
@@ -5,22 +5,22 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-extern inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
- free_page((unsigned long) pte);
+ pagetable_free(virt_to_ptdesc(pte));
}
extern const char bad_pmd_string[];
-extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
- unsigned long page = __get_free_page(GFP_DMA);
+ struct ptdesc *ptdesc = pagetable_alloc((GFP_DMA | __GFP_ZERO) &
+ ~__GFP_HIGHMEM, 0);
- if (!page)
+ if (!ptdesc)
return NULL;
- memset((void *)page, 0, PAGE_SIZE);
- return (pte_t *) (page);
+ return ptdesc_address(ptdesc);
}
extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address)
@@ -35,36 +35,34 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address)
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable,
unsigned long address)
{
- struct page *page = virt_to_page(pgtable);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
- struct page *page = alloc_pages(GFP_DMA, 0);
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_DMA | __GFP_ZERO, 0);
pte_t *pte;
- if (!page)
+ if (!ptdesc)
return NULL;
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
+ if (!pagetable_pte_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- pte = page_address(page);
- clear_page(pte);
-
+ pte = ptdesc_address(ptdesc);
return pte;
}
static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)
{
- struct page *page = virt_to_page(pgtable);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
/*
@@ -75,16 +73,19 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- free_page((unsigned long) pgd);
+ pagetable_free(virt_to_ptdesc(pgd));
}
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *new_pgd;
+ struct ptdesc *ptdesc = pagetable_alloc((GFP_DMA | __GFP_NOWARN) &
+ ~__GFP_HIGHMEM, 0);
- new_pgd = (pgd_t *)__get_free_page(GFP_DMA | __GFP_NOWARN);
- if (!new_pgd)
+ if (!ptdesc)
return NULL;
+ new_pgd = ptdesc_address(ptdesc);
+
memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t));
memset(new_pgd, 0, PAGE_OFFSET >> PGDIR_SHIFT);
return new_pgd;
diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h
index 198036aff519..ff48573db2c0 100644
--- a/arch/m68k/include/asm/sun3_pgalloc.h
+++ b/arch/m68k/include/asm/sun3_pgalloc.h
@@ -17,10 +17,10 @@
extern const char bad_pmd_string[];
-#define __pte_free_tlb(tlb,pte,addr) \
-do { \
- pgtable_pte_page_dtor(pte); \
- tlb_remove_page((tlb), pte); \
+#define __pte_free_tlb(tlb, pte, addr) \
+do { \
+ pagetable_pte_dtor(page_ptdesc(pte)); \
+ tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
} while (0)
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c
index c75984e2d86b..594575a0780c 100644
--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@@ -161,7 +161,7 @@ void *get_pointer_table(int type)
* m68k doesn't have SPLIT_PTE_PTLOCKS for not having
* SMP.
*/
- pgtable_pte_page_ctor(virt_to_page(page));
+ pagetable_pte_ctor(virt_to_ptdesc(page));
}
mmu_page_ctor(page);
@@ -201,7 +201,7 @@ int free_pointer_table(void *table, int type)
list_del(dp);
mmu_page_dtor((void *)page);
if (type == TABLE_PTE)
- pgtable_pte_page_dtor(virt_to_page((void *)page));
+ pagetable_pte_dtor(virt_to_ptdesc((void *)page));
free_page (page);
return 1;
} else if (ptable_list[type].next != dp) {
diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h
index affd21e9c20b..062dd4e6b954 100644
--- a/arch/mips/include/asm/io.h
+++ b/arch/mips/include/asm/io.h
@@ -12,8 +12,6 @@
#ifndef _ASM_IO_H
#define _ASM_IO_H
-#define ARCH_HAS_IOREMAP_WC
-
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/types.h>
@@ -25,7 +23,6 @@
#include <asm/byteorder.h>
#include <asm/cpu.h>
#include <asm/cpu-features.h>
-#include <asm-generic/iomap.h>
#include <asm/page.h>
#include <asm/pgtable-bits.h>
#include <asm/processor.h>
@@ -210,6 +207,8 @@ void iounmap(const volatile void __iomem *addr);
#define ioremap_wc(offset, size) \
ioremap_prot((offset), (size), boot_cpu_data.writecombine)
+#include <asm-generic/iomap.h>
+
#if defined(CONFIG_CPU_CAVIUM_OCTEON)
#define war_io_reorder_wmb() wmb()
#else
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index f72e737dda21..40e40a7eb94a 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -51,13 +51,13 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm);
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- free_pages((unsigned long)pgd, PGD_TABLE_ORDER);
+ pagetable_free(virt_to_ptdesc(pgd));
}
-#define __pte_free_tlb(tlb,pte,address) \
-do { \
- pgtable_pte_page_dtor(pte); \
- tlb_remove_page((tlb), pte); \
+#define __pte_free_tlb(tlb, pte, address) \
+do { \
+ pagetable_pte_dtor(page_ptdesc(pte)); \
+ tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \
} while (0)
#ifndef __PAGETABLE_PMD_FOLDED
@@ -65,18 +65,18 @@ do { \
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
{
pmd_t *pmd;
- struct page *pg;
+ struct ptdesc *ptdesc;
- pg = alloc_pages(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER);
- if (!pg)
+ ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pmd_page_ctor(pg)) {
- __free_pages(pg, PMD_TABLE_ORDER);
+ if (!pagetable_pmd_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- pmd = (pmd_t *)page_address(pg);
+ pmd = ptdesc_address(ptdesc);
pmd_init(pmd);
return pmd;
}
@@ -90,10 +90,14 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
{
pud_t *pud;
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM,
+ PUD_TABLE_ORDER);
- pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_TABLE_ORDER);
- if (pud)
- pud_init(pud);
+ if (!ptdesc)
+ return NULL;
+ pud = ptdesc_address(ptdesc);
+
+ pud_init(pud);
return pud;
}
diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c
index b13314be5d0e..1506e458040d 100644
--- a/arch/mips/mm/pgtable.c
+++ b/arch/mips/mm/pgtable.c
@@ -10,10 +10,12 @@
pgd_t *pgd_alloc(struct mm_struct *mm)
{
- pgd_t *ret, *init;
+ pgd_t *init, *ret = NULL;
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM,
+ PGD_TABLE_ORDER);
- ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER);
- if (ret) {
+ if (ptdesc) {
+ ret = ptdesc_address(ptdesc);
init = pgd_offset(&init_mm, 0UL);
pgd_init(ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index ecd1657bb2ce..ce6bb8e74271 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -28,10 +28,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
extern pgd_t *pgd_alloc(struct mm_struct *mm);
-#define __pte_free_tlb(tlb, pte, addr) \
- do { \
- pgtable_pte_page_dtor(pte); \
- tlb_remove_page((tlb), (pte)); \
+#define __pte_free_tlb(tlb, pte, addr) \
+ do { \
+ pagetable_pte_dtor(page_ptdesc(pte)); \
+ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
} while (0)
#endif /* _ASM_NIOS2_PGALLOC_H */
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index c7f282f60f64..fd9bb76a610b 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -21,6 +21,7 @@ config OPENRISC
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
select GENERIC_PCI_IOMAP
+ select GENERIC_IOREMAP
select GENERIC_CPU_DEVICES
select HAVE_PCI
select HAVE_UID16
diff --git a/arch/openrisc/include/asm/io.h b/arch/openrisc/include/asm/io.h
index ee6043a03173..5a6f0f16a5ce 100644
--- a/arch/openrisc/include/asm/io.h
+++ b/arch/openrisc/include/asm/io.h
@@ -15,6 +15,8 @@
#define __ASM_OPENRISC_IO_H
#include <linux/types.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
/*
* PCI: We do not use IO ports in OpenRISC
@@ -27,11 +29,10 @@
#define PIO_OFFSET 0
#define PIO_MASK 0
-#define ioremap ioremap
-void __iomem *ioremap(phys_addr_t offset, unsigned long size);
-
-#define iounmap iounmap
-extern void iounmap(volatile void __iomem *addr);
+/*
+ * I/O memory mapping functions.
+ */
+#define _PAGE_IOREMAP (pgprot_val(PAGE_KERNEL) | _PAGE_CI)
#include <asm-generic/io.h>
diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h
index b7b2b8d16fad..c6a73772a546 100644
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@@ -66,10 +66,10 @@ extern inline pgd_t *pgd_alloc(struct mm_struct *mm)
extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
-#define __pte_free_tlb(tlb, pte, addr) \
-do { \
- pgtable_pte_page_dtor(pte); \
- tlb_remove_page((tlb), (pte)); \
+#define __pte_free_tlb(tlb, pte, addr) \
+do { \
+ pagetable_pte_dtor(page_ptdesc(pte)); \
+ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
} while (0)
#endif
diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c
index 8ec0dafecf25..91c8259d4b7e 100644
--- a/arch/openrisc/mm/ioremap.c
+++ b/arch/openrisc/mm/ioremap.c
@@ -22,88 +22,6 @@
extern int mem_init_done;
-static unsigned int fixmaps_used __initdata;
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem *__ref ioremap(phys_addr_t addr, unsigned long size)
-{
- phys_addr_t p;
- unsigned long v;
- unsigned long offset, last_addr;
- struct vm_struct *area = NULL;
-
- /* Don't allow wraparound or zero size */
- last_addr = addr + size - 1;
- if (!size || last_addr < addr)
- return NULL;
-
- /*
- * Mappings have to be page-aligned
- */
- offset = addr & ~PAGE_MASK;
- p = addr & PAGE_MASK;
- size = PAGE_ALIGN(last_addr + 1) - p;
-
- if (likely(mem_init_done)) {
- area = get_vm_area(size, VM_IOREMAP);
- if (!area)
- return NULL;
- v = (unsigned long)area->addr;
- } else {
- if ((fixmaps_used + (size >> PAGE_SHIFT)) > FIX_N_IOREMAPS)
- return NULL;
- v = fix_to_virt(FIX_IOREMAP_BEGIN + fixmaps_used);
- fixmaps_used += (size >> PAGE_SHIFT);
- }
-
- if (ioremap_page_range(v, v + size, p,
- __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_CI))) {
- if (likely(mem_init_done))
- vfree(area->addr);
- else
- fixmaps_used -= (size >> PAGE_SHIFT);
- return NULL;
- }
-
- return (void __iomem *)(offset + (char *)v);
-}
-EXPORT_SYMBOL(ioremap);
-
-void iounmap(volatile void __iomem *addr)
-{
- /* If the page is from the fixmap pool then we just clear out
- * the fixmap mapping.
- */
- if (unlikely((unsigned long)addr > FIXADDR_START)) {
- /* This is a bit broken... we don't really know
- * how big the area is so it's difficult to know
- * how many fixed pages to invalidate...
- * just flush tlb and hope for the best...
- * consider this a FIXME
- *
- * Really we should be clearing out one or more page
- * table entries for these virtual addresses so that
- * future references cause a page fault... for now, we
- * rely on two things:
- * i) this code never gets called on known boards
- * ii) invalid accesses to the freed areas aren't made
- */
- flush_tlb_all();
- return;
- }
-
- return vfree((void *)(PAGE_MASK & (unsigned long)addr));
-}
-EXPORT_SYMBOL(iounmap);
-
/**
* OK, this one's a bit tricky... ioremap can get called before memory is
* initialized (early serial console does this) and will want to alloc a page
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 4cb46d5c64a2..4cda9f0a3277 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -36,6 +36,7 @@ config PARISC
select GENERIC_ATOMIC64 if !64BIT
select GENERIC_IRQ_PROBE
select GENERIC_PCI_IOMAP
+ select GENERIC_IOREMAP
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_SMP_IDLE_THREAD
select GENERIC_ARCH_TOPOLOGY if SMP
diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h
index c05e781be2f5..366537042465 100644
--- a/arch/parisc/include/asm/io.h
+++ b/arch/parisc/include/asm/io.h
@@ -125,12 +125,17 @@ static inline void gsc_writeq(unsigned long long val, unsigned long addr)
/*
* The standard PCI ioremap interfaces
*/
-void __iomem *ioremap(unsigned long offset, unsigned long size);
-#define ioremap_wc ioremap
-#define ioremap_uc ioremap
-#define pci_iounmap pci_iounmap
+#define ioremap_prot ioremap_prot
+
+#define _PAGE_IOREMAP (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | \
+ _PAGE_ACCESSED | _PAGE_NO_CACHE)
-extern void iounmap(const volatile void __iomem *addr);
+#define ioremap_wc(addr, size) \
+ ioremap_prot((addr), (size), _PAGE_IOREMAP)
+#define ioremap_uc(addr, size) \
+ ioremap_prot((addr), (size), _PAGE_IOREMAP)
+
+#define pci_iounmap pci_iounmap
void memset_io(volatile void __iomem *addr, unsigned char val, int count);
void memcpy_fromio(void *dst, const volatile void __iomem *src, int count);
diff --git a/arch/parisc/mm/ioremap.c b/arch/parisc/mm/ioremap.c
index 345ff0b66499..fd996472dfe7 100644
--- a/arch/parisc/mm/ioremap.c
+++ b/arch/parisc/mm/ioremap.c
@@ -13,25 +13,9 @@
#include <linux/io.h>
#include <linux/mm.h>
-/*
- * Generic mapping function (not visible outside):
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem *ioremap(unsigned long phys_addr, unsigned long size)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
{
- void __iomem *addr;
- struct vm_struct *area;
- unsigned long offset, last_addr;
- pgprot_t pgprot;
-
#ifdef CONFIG_EISA
unsigned long end = phys_addr + size - 1;
/* Support EISA addresses */
@@ -40,11 +24,6 @@ void __iomem *ioremap(unsigned long phys_addr, unsigned long size)
phys_addr |= F_EXTEND(0xfc000000);
#endif
- /* Don't allow wraparound or zero size */
- last_addr = phys_addr + size - 1;
- if (!size || last_addr < phys_addr)
- return NULL;
-
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
@@ -62,39 +41,6 @@ void __iomem *ioremap(unsigned long phys_addr, unsigned long size)
}
}
- pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY |
- _PAGE_ACCESSED | _PAGE_NO_CACHE);
-
- /*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr + 1) - phys_addr;
-
- /*
- * Ok, go for it..
- */
- area = get_vm_area(size, VM_IOREMAP);
- if (!area)
- return NULL;
-
- addr = (void __iomem *) area->addr;
- if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
- phys_addr, pgprot)) {
- vunmap(addr);
- return NULL;
- }
-
- return (void __iomem *) (offset + (char __iomem *)addr);
-}
-EXPORT_SYMBOL(ioremap);
-
-void iounmap(const volatile void __iomem *io_addr)
-{
- unsigned long addr = (unsigned long)io_addr & PAGE_MASK;
-
- if (is_vmalloc_addr((void *)addr))
- vunmap((void *)addr);
+ return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
}
-EXPORT_SYMBOL(iounmap);
+EXPORT_SYMBOL(ioremap_prot);
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 0b1172cbeccb..938294c996dc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -157,6 +157,7 @@ config PPC
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK
+ select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
@@ -174,6 +175,7 @@ config PPC
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
select ARCH_WANT_LD_ORPHAN_WARN
+ select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if PPC_RADIX_MMU
select ARCH_WANTS_MODULES_DATA_IN_VMALLOC if PPC_BOOK3S_32 || PPC_8xx
select ARCH_WEAK_RELEASE_ACQUIRE
select BINFMT_ELF
@@ -193,6 +195,7 @@ config PPC
select GENERIC_CPU_VULNERABILITIES if PPC_BARRIER_NOSPEC
select GENERIC_EARLY_IOREMAP
select GENERIC_GETTIMEOFDAY
+ select GENERIC_IOREMAP
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
select GENERIC_PCI_IOMAP if PCI
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index d4a19e6547ac..6e70ae511631 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -138,7 +138,16 @@ static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
}
#define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS)
+
+/*
+ * pud comparison that will work with both pte and page table pointer.
+ */
+static inline int hash__pud_same(pud_t pud_a, pud_t pud_b)
+{
+ return (((pud_raw(pud_a) ^ pud_raw(pud_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
+}
#define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS)
+
static inline int hash__p4d_bad(p4d_t p4d)
{
return (p4d_val(p4d) == 0);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 4acc9690f599..a8204566cfd0 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte)
{
return __pud_raw(pte_raw(pte));
}
+
+static inline pte_t *pudp_ptep(pud_t *pud)
+{
+ return (pte_t *)pud;
+}
+
+#define pud_pfn(pud) pte_pfn(pud_pte(pud))
+#define pud_dirty(pud) pte_dirty(pud_pte(pud))
+#define pud_young(pud) pte_young(pud_pte(pud))
+#define pud_mkold(pud) pte_pud(pte_mkold(pud_pte(pud)))
+#define pud_wrprotect(pud) pte_pud(pte_wrprotect(pud_pte(pud)))
+#define pud_mkdirty(pud) pte_pud(pte_mkdirty(pud_pte(pud)))
+#define pud_mkclean(pud) pte_pud(pte_mkclean(pud_pte(pud)))
+#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud)))
+#define pud_mkwrite(pud) pte_pud(pte_mkwrite(pud_pte(pud)))
#define pud_write(pud) pte_write(pud_pte(pud))
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#define pud_soft_dirty(pud) pte_soft_dirty(pud_pte(pud))
+#define pud_mksoft_dirty(pud) pte_pud(pte_mksoft_dirty(pud_pte(pud)))
+#define pud_clear_soft_dirty(pud) pte_pud(pte_clear_soft_dirty(pud_pte(pud)))
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
static inline int pud_bad(pud_t pud)
{
if (radix_enabled())
@@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot);
extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd);
+extern void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud);
+
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd)
{
}
+static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud)
+{
+}
+
extern int hash__has_transparent_hugepage(void);
static inline int has_transparent_hugepage(void)
{
@@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void)
}
#define has_transparent_hugepage has_transparent_hugepage
+static inline int has_transparent_pud_hugepage(void)
+{
+ if (radix_enabled())
+ return radix__has_transparent_pud_hugepage();
+ return 0;
+}
+#define has_transparent_pud_hugepage has_transparent_pud_hugepage
+
static inline unsigned long
pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
unsigned long clr, unsigned long set)
@@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
}
+static inline unsigned long
+pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp,
+ unsigned long clr, unsigned long set)
+{
+ if (radix_enabled())
+ return radix__pud_hugepage_update(mm, addr, pudp, clr, set);
+ BUG();
+ return pud_val(*pudp);
+}
+
/*
* returns true for pmd migration entries, THP, devmap, hugetlb
* But compile time dependent on THP config
@@ -1151,6 +1199,11 @@ static inline int pmd_large(pmd_t pmd)
return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
}
+static inline int pud_large(pud_t pud)
+{
+ return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
+}
+
/*
* For radix we should always find H_PAGE_HASHPTE zero. Hence
* the below will work for radix too
@@ -1166,6 +1219,17 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
return ((old & _PAGE_ACCESSED) != 0);
}
+static inline int __pudp_test_and_clear_young(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ unsigned long old;
+
+ if ((pud_raw(*pudp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0)
+ return 0;
+ old = pud_hugepage_update(mm, addr, pudp, _PAGE_ACCESSED, 0);
+ return ((old & _PAGE_ACCESSED) != 0);
+}
+
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
@@ -1174,6 +1238,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0);
}
+#define __HAVE_ARCH_PUDP_SET_WRPROTECT
+static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp)
+{
+ if (pud_write(*pudp))
+ pud_hugepage_update(mm, addr, pudp, _PAGE_WRITE, 0);
+}
+
/*
* Only returns true for a THP. False for pmd migration entry.
* We also need to return true when we come across a pte that
@@ -1195,6 +1267,17 @@ static inline int pmd_trans_huge(pmd_t pmd)
return hash__pmd_trans_huge(pmd);
}
+static inline int pud_trans_huge(pud_t pud)
+{
+ if (!pud_present(pud))
+ return false;
+
+ if (radix_enabled())
+ return radix__pud_trans_huge(pud);
+ return 0;
+}
+
+
#define __HAVE_ARCH_PMD_SAME
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
@@ -1203,6 +1286,15 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
return hash__pmd_same(pmd_a, pmd_b);
}
+#define pud_same pud_same
+static inline int pud_same(pud_t pud_a, pud_t pud_b)
+{
+ if (radix_enabled())
+ return radix__pud_same(pud_a, pud_b);
+ return hash__pud_same(pud_a, pud_b);
+}
+
+
static inline pmd_t __pmd_mkhuge(pmd_t pmd)
{
if (radix_enabled())
@@ -1210,6 +1302,14 @@ static inline pmd_t __pmd_mkhuge(pmd_t pmd)
return hash__pmd_mkhuge(pmd);
}
+static inline pud_t __pud_mkhuge(pud_t pud)
+{
+ if (radix_enabled())
+ return radix__pud_mkhuge(pud);
+ BUG();
+ return pud;
+}
+
/*
* pfn_pmd return a pmd_t that can be used as pmd pte entry.
*/
@@ -1225,14 +1325,34 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
return pmd;
}
+static inline pud_t pud_mkhuge(pud_t pud)
+{
+#ifdef CONFIG_DEBUG_VM
+ if (radix_enabled())
+ WARN_ON((pud_raw(pud) & cpu_to_be64(_PAGE_PTE)) == 0);
+ else
+ WARN_ON(1);
+#endif
+ return pud;
+}
+
+
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty);
+#define __HAVE_ARCH_PUDP_SET_ACCESS_FLAGS
+extern int pudp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp,
+ pud_t entry, int dirty);
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
+#define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG
+extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp);
+
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
@@ -1243,6 +1363,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
}
+#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
+static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ if (radix_enabled())
+ return radix__pudp_huge_get_and_clear(mm, addr, pudp);
+ BUG();
+ return *pudp;
+}
+
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
@@ -1257,6 +1387,11 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmdp, int full);
+#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
+pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr,
+ pud_t *pudp, int full);
+
#define __HAVE_ARCH_PGTABLE_DEPOSIT
static inline void pgtable_trans_huge_deposit(struct mm_struct *mm,
pmd_t *pmdp, pgtable_t pgtable)
@@ -1305,6 +1440,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
return hash__pmd_mkdevmap(pmd);
}
+static inline pud_t pud_mkdevmap(pud_t pud)
+{
+ if (radix_enabled())
+ return radix__pud_mkdevmap(pud);
+ BUG();
+ return pud;
+}
+
static inline int pmd_devmap(pmd_t pmd)
{
return pte_devmap(pmd_pte(pmd));
@@ -1312,7 +1455,7 @@ static inline int pmd_devmap(pmd_t pmd)
static inline int pud_devmap(pud_t pud)
{
- return 0;
+ return pte_devmap(pud_pte(pud));
}
static inline int pgd_devmap(pgd_t pgd)
@@ -1321,16 +1464,6 @@ static inline int pgd_devmap(pgd_t pgd)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline int pud_pfn(pud_t pud)
-{
- /*
- * Currently all calls to pud_pfn() are gated around a pud_devmap()
- * check so this should never be used. If it grows another user we
- * want to know about it.
- */
- BUILD_BUG();
- return 0;
-}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 686001eda936..357e23a403d3 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -250,6 +250,10 @@ static inline int radix__pud_bad(pud_t pud)
return !!(pud_val(pud) & RADIX_PUD_BAD_BITS);
}
+static inline int radix__pud_same(pud_t pud_a, pud_t pud_b)
+{
+ return ((pud_raw(pud_a) ^ pud_raw(pud_b)) == 0);
+}
static inline int radix__p4d_bad(p4d_t p4d)
{
@@ -268,9 +272,22 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
return __pmd(pmd_val(pmd) | _PAGE_PTE);
}
+static inline int radix__pud_trans_huge(pud_t pud)
+{
+ return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE;
+}
+
+static inline pud_t radix__pud_mkhuge(pud_t pud)
+{
+ return __pud(pud_val(pud) | _PAGE_PTE);
+}
+
extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, unsigned long clr,
unsigned long set);
+extern unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, unsigned long clr,
+ unsigned long set);
extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
@@ -278,6 +295,9 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp);
+pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp);
+
static inline int radix__has_transparent_hugepage(void)
{
/* For radix 2M at PMD level means thp */
@@ -285,6 +305,14 @@ static inline int radix__has_transparent_hugepage(void)
return 1;
return 0;
}
+
+static inline int radix__has_transparent_pud_hugepage(void)
+{
+ /* For radix 1G at PUD level means pud hugepage support */
+ if (mmu_psize_defs[MMU_PAGE_1G].shift == PUD_SHIFT)
+ return 1;
+ return 0;
+}
#endif
static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
@@ -292,9 +320,20 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
}
+static inline pud_t radix__pud_mkdevmap(pud_t pud)
+{
+ return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP));
+}
+
+struct vmem_altmap;
+struct dev_pagemap;
extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
unsigned long phys);
+int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap);
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap);
extern void radix__vmemmap_remove_mapping(unsigned long start,
unsigned long page_size);
@@ -325,5 +364,15 @@ int radix__remove_section_mapping(unsigned long start, unsigned long end);
void radix__kernel_map_pages(struct page *page, int numpages, int enable);
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+#define vmemmap_can_optimize vmemmap_can_optimize
+bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
+#endif
+
+#define vmemmap_populate_compound_pages vmemmap_populate_compound_pages
+int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+ unsigned long start,
+ unsigned long end, int node,
+ struct dev_pagemap *pgmap);
#endif /* __ASSEMBLY__ */
#endif
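
As a quick sanity check of the 1G-at-PUD condition above, here is a small userspace arithmetic sketch. It assumes the radix 4K geometry (12-bit page offset, 9 bits per translation level), which is an assumption for illustration rather than something taken from this header.

#include <stdio.h>

int main(void)
{
	int page_shift = 12;         /* 4K base pages (assumption) */
	int bits_per_level = 9;      /* 512 entries per table on radix 4K */
	int pmd_shift = page_shift + bits_per_level;   /* 21 -> 2M */
	int pud_shift = pmd_shift + bits_per_level;    /* 30 -> 1G */

	printf("PMD maps %luM, PUD maps %luG\n",
	       (1UL << pmd_shift) >> 20, (1UL << pud_shift) >> 30);
	return 0;
}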
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 77797a2a82eb..a38542259fab 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long end, int psize);
extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
+extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end);
extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 0d0c1447ecf0..1950c1b825b4 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -5,6 +5,7 @@
#define MMU_NO_CONTEXT ~0UL
#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
#include <asm/book3s/64/tlbflush-hash.h>
#include <asm/book3s/64/tlbflush-radix.h>
@@ -50,6 +51,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma,
radix__flush_pmd_tlb_range(vma, start, end);
}
+#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE
+static inline void flush_pud_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ if (radix_enabled())
+ radix__flush_pud_tlb_range(vma, start, end);
+}
+
#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
unsigned long start,
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index f1e657c9bbe8..0732b743e099 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -3,11 +3,6 @@
#define _ASM_POWERPC_IO_H
#ifdef __KERNEL__
-#define ARCH_HAS_IOREMAP_WC
-#ifdef CONFIG_PPC32
-#define ARCH_HAS_IOREMAP_WT
-#endif
-
/*
*/
@@ -732,9 +727,7 @@ static inline void name at \
#define writel_relaxed(v, addr) writel(v, addr)
#define writeq_relaxed(v, addr) writeq(v, addr)
-#ifdef CONFIG_GENERIC_IOMAP
-#include <asm-generic/iomap.h>
-#else
+#ifndef CONFIG_GENERIC_IOMAP
/*
* Here comes the implementation of the IOMAP interfaces.
*/
@@ -896,8 +889,8 @@ static inline void iosync(void)
*
*/
extern void __iomem *ioremap(phys_addr_t address, unsigned long size);
-extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size,
- unsigned long flags);
+#define ioremap ioremap
+#define ioremap_prot ioremap_prot
extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size);
#define ioremap_wc ioremap_wc
@@ -911,14 +904,12 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size);
#define ioremap_cache(addr, size) \
ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL))
-extern void iounmap(volatile void __iomem *addr);
+#define iounmap iounmap
void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size);
int early_ioremap_range(unsigned long ea, phys_addr_t pa,
unsigned long size, pgprot_t prot);
-void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
- pgprot_t prot, void *caller);
extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size,
pgprot_t prot, void *caller);
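
With ioremap(), ioremap_wc() and iounmap() now routed through the generic helpers, driver-side usage is unchanged. A hypothetical sketch (device addresses and names such as MY_FB_BASE are invented for illustration, not taken from this patch) might look like:

#include <linux/io.h>
#include <linux/sizes.h>
#include <linux/errno.h>

/* Hypothetical addresses, purely for illustration */
#define MY_FB_BASE	0xc0000000UL
#define MY_FB_SIZE	SZ_8M
#define MY_REG_BASE	0xc1000000UL
#define MY_REG_SIZE	SZ_4K

static void __iomem *my_fb;
static void __iomem *my_regs;

static int my_map_device(void)
{
	my_fb = ioremap_wc(MY_FB_BASE, MY_FB_SIZE);	/* write-combined frame buffer */
	my_regs = ioremap(MY_REG_BASE, MY_REG_SIZE);	/* uncached control registers */
	if (!my_fb || !my_regs) {
		if (my_fb)
			iounmap(my_fb);
		if (my_regs)
			iounmap(my_regs);
		return -ENOMEM;
	}

	writel(0x1, my_regs);				/* example register poke */
	return 0;
}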
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index 3360cad78ace..3a971e2a8c73 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -45,6 +45,10 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
pte_fragment_free((unsigned long *)ptepage, 0);
}
+/* arch uses the pte_free_defer() implementation in arch/powerpc/mm/pgtable-frag.c */
+#define pte_free_defer pte_free_defer
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
/*
* Functions that deal with pagetables that could be at any level of
* the table need to be passed an "index_size" so they know how to
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 6a88bfdaa69b..33464e6d6431 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -158,13 +158,30 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd)
}
#ifdef CONFIG_PPC64
-#define is_ioremap_addr is_ioremap_addr
-static inline bool is_ioremap_addr(const void *x)
+int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size);
+bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+ unsigned long page_size);
+/*
+ * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into details of
+ * some of the restrictions. We don't check for PMD_SIZE because our
+ * vmemmap allocation code can fall back correctly. The pageblock
+ * alignment requirement is met using altmap->reserve blocks.
+ */
+#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
{
- unsigned long addr = (unsigned long)x;
-
- return addr >= IOREMAP_BASE && addr < IOREMAP_END;
+ if (!radix_enabled())
+ return false;
+ /*
+ * With 4K page size and 2M PMD_SIZE, memory block sizes
+ * starting from 128MB align nicely with PMD_SIZE. Hence
+ * require the vmemmap size to be PMD_SIZE aligned.
+ */
+ if (IS_ENABLED(CONFIG_PPC_4K_PAGES))
+ return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+ return true;
}
+
#endif /* CONFIG_PPC64 */
#endif /* __ASSEMBLY__ */
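
A small userspace sketch of the alignment check above, assuming 4K base pages and a 64-byte struct page (both assumptions, not values taken from this patch): a 256MB memory block yields a 4MB memmap, a multiple of the 2M PMD_SIZE, so memmap_on_memory is allowed.

#include <stdio.h>

#define SZ_1M            (1UL << 20)
#define PAGE_SZ          4096UL          /* CONFIG_PPC_4K_PAGES case */
#define PMD_SZ           (2UL * SZ_1M)   /* 2M PMD with 4K pages */
#define STRUCT_PAGE_SZ   64UL            /* assumed sizeof(struct page) */

int main(void)
{
	unsigned long block = 256 * SZ_1M;   /* a typical ppc64 memory block */
	unsigned long vmemmap_size = (block / PAGE_SZ) * STRUCT_PAGE_SZ;

	printf("memmap for a %luMB block is %luKB, PMD_SIZE aligned: %s\n",
	       block / SZ_1M, vmemmap_size >> 10,
	       (vmemmap_size % PMD_SZ) == 0 ? "yes" : "no");
	return 0;
}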
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 709ebd578394..e2d6f9327f77 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -410,6 +410,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm,
ret = H_STATE;
break;
}
+ vma_start_write(vma);
/* Copy vm_flags to avoid partial modifications in ksm_madvise */
vm_flags = vma->vm_flags;
ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 51f48984abca..988948d69bc1 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -214,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr
old = be64_to_cpu(old_be);
- trace_hugepage_update(addr, old, clr, set);
+ trace_hugepage_update_pmd(addr, old, clr, set);
if (old & H_PAGE_HASHPTE)
hpte_do_hugepage_flush(mm, addr, pmdp, old);
return old;
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index c766e4c26e42..1715b07c630c 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -246,15 +246,15 @@ static void destroy_contexts(mm_context_t *ctx)
static void pmd_frag_destroy(void *pmd_frag)
{
int count;
- struct page *page;
+ struct ptdesc *ptdesc;
- page = virt_to_page(pmd_frag);
+ ptdesc = virt_to_ptdesc(pmd_frag);
/* drop all the pending references */
count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
- if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
+ if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 85c84e89e3ea..1498ccd08367 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
return changed;
}
+int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp, pud_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pud_devmap(*pudp));
+ assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
+#endif
+ changed = !pud_same(*(pudp), entry);
+ if (changed) {
+ /*
+ * We can use MMU_PAGE_1G here, because only the radix
+ * path looks at the psize.
+ */
+ __ptep_set_access_flags(vma, pudp_ptep(pudp),
+ pud_pte(entry), address, MMU_PAGE_1G);
+ }
+ return changed;
+}
+
+
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}
+
+int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp)
+{
+ return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
+}
+
/*
* set a new huge pmd. We should not be called for updating
* an existing pmd entry. That should go via pmd_hugepage_update.
@@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}
+void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
+{
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+
+ WARN_ON(pte_hw_valid(pud_pte(*pudp)));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+ WARN_ON(!(pud_large(pud)));
+#endif
+ trace_hugepage_set_pud(addr, pud_val(pud));
+ return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
+}
+
static void do_serialize(void *arg)
{
/* We've taken the IPI, so try to trim the mask while here */
@@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
return pmd;
}
+pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp, int full)
+{
+ pud_t pud;
+
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
+ !pud_present(*pudp));
+ pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
+ /*
+ * if it is not a fullmm flush, then we can possibly end up converting
+ * this PUD pte entry to a regular level 0 PTE by a parallel page fault.
+ * Make sure we flush the tlb in this case.
+ */
+ if (!full)
+ flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
+ return pud;
+}
+
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}
+static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
+{
+ return __pud(pud_val(pud) | pgprot_val(pgprot));
+}
+
/*
* At some point we should be able to get rid of
* pmd_mkhuge() and mk_huge_pmd() when we update all the
@@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
}
+pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
+{
+ unsigned long pudv;
+
+ pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+
+ return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
+}
+
pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
return pfn_pmd(page_to_pfn(page), pgprot);
@@ -306,22 +384,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
{
void *ret = NULL;
- struct page *page;
+ struct ptdesc *ptdesc;
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
- page = alloc_page(gfp);
- if (!page)
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_pages(page, 0);
+ if (!pagetable_pmd_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- atomic_set(&page->pt_frag_refcount, 1);
+ atomic_set(&ptdesc->pt_frag_refcount, 1);
- ret = page_address(page);
+ ret = ptdesc_address(ptdesc);
/*
* if we support only one fragment just return the
* allocated page.
@@ -331,12 +409,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
spin_lock(&mm->page_table_lock);
/*
- * If we find pgtable_page set, we return
+ * If we find ptdesc_page set, we return
* the allocated page with single fragment
* count.
*/
if (likely(!mm->context.pmd_frag)) {
- atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
+ atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR);
mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
}
spin_unlock(&mm->page_table_lock);
@@ -357,15 +435,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
void pmd_fragment_free(unsigned long *pmd)
{
- struct page *page = virt_to_page(pmd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
- if (PageReserved(page))
- return free_reserved_page(page);
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
- BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
- if (atomic_dec_and_test(&page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
+ BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
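
The pt_frag_refcount handling that this hunk converts from struct page to struct ptdesc follows a simple shared-refcount scheme: one backing page is carved into PMD_FRAG_NR fragments and only freed when the last fragment is released. A userspace simulation of that idea (sizes and names are assumptions for illustration, not taken from the patch) is:

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

#define SIM_PAGE_SIZE 65536UL   /* 64K backing page, as on ppc64 (assumption) */
#define SIM_FRAG_SIZE 16384UL   /* fragment size => 4 fragments per page */
#define SIM_FRAG_NR   (SIM_PAGE_SIZE / SIM_FRAG_SIZE)

struct sim_ptdesc {
	atomic_int pt_frag_refcount;   /* mirrors ptdesc->pt_frag_refcount */
	void *backing;
};

static void sim_frag_free(struct sim_ptdesc *pt)
{
	/* the last put frees the whole backing page, like pmd_fragment_free() */
	if (atomic_fetch_sub(&pt->pt_frag_refcount, 1) == 1) {
		free(pt->backing);
		free(pt);
	}
}

int main(void)
{
	struct sim_ptdesc *pt = malloc(sizeof(*pt));

	if (!pt)
		return 1;
	pt->backing = aligned_alloc(SIM_PAGE_SIZE, SIM_PAGE_SIZE);
	atomic_init(&pt->pt_frag_refcount, SIM_FRAG_NR);

	for (unsigned long i = 0; i < SIM_FRAG_NR; i++)
		sim_frag_free(pt);   /* backing page released on the final put */

	printf("released %lu fragments and the backing page\n", SIM_FRAG_NR);
	return 0;
}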
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index 5e3195568525..17075c78d4bc 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -39,6 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st
radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize);
else
radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+ mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
}
void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index e7ea492ac510..96679018e7fb 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -601,17 +601,6 @@ void __init radix__early_init_mmu(void)
#else
mmu_virtual_psize = MMU_PAGE_4K;
#endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* vmemmap mapping */
- if (mmu_psize_defs[MMU_PAGE_2M].shift) {
- /*
- * map vmemmap using 2M if available
- */
- mmu_vmemmap_psize = MMU_PAGE_2M;
- } else
- mmu_vmemmap_psize = mmu_virtual_psize;
-#endif
#endif
/*
* initialize page table size
@@ -744,8 +733,58 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
p4d_clear(p4d);
}
-static void remove_pte_table(pte_t *pte_start, unsigned long addr,
- unsigned long end, bool direct)
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+ return !vmemmap_populated(start, PMD_SIZE);
+}
+
+static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
+
+ return !vmemmap_populated(start, PAGE_SIZE);
+
+}
+#endif
+
+static void __meminit free_vmemmap_pages(struct page *page,
+ struct vmem_altmap *altmap,
+ int order)
+{
+ unsigned int nr_pages = 1 << order;
+
+ if (altmap) {
+ unsigned long alt_start, alt_end;
+ unsigned long base_pfn = page_to_pfn(page);
+
+ /*
+ * with 2M vmemmap mapping we can have things set up
+ * such that even though an altmap is specified we never
+ * used the altmap.
+ */
+ alt_start = altmap->base_pfn;
+ alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
+
+ if (base_pfn >= alt_start && base_pfn < alt_end) {
+ vmem_altmap_free(altmap, nr_pages);
+ return;
+ }
+ }
+
+ if (PageReserved(page)) {
+ /* allocated from memblock */
+ while (nr_pages--)
+ free_reserved_page(page++);
+ } else
+ free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long next, pages = 0;
pte_t *pte;
@@ -759,24 +798,26 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
if (!pte_present(*pte))
continue;
- if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
- /*
- * The vmemmap_free() and remove_section_mapping()
- * codepaths call us with aligned addresses.
- */
- WARN_ONCE(1, "%s: unaligned range\n", __func__);
- continue;
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+ if (!direct)
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ pages++;
}
-
- pte_clear(&init_mm, addr, pte);
- pages++;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_page_is_unused(addr, next)) {
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ }
+#endif
}
if (direct)
update_page_count(mmu_virtual_psize, -pages);
}
static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
- unsigned long end, bool direct)
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long next, pages = 0;
pte_t *pte_base;
@@ -790,18 +831,24 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
continue;
if (pmd_is_leaf(*pmd)) {
- if (!IS_ALIGNED(addr, PMD_SIZE) ||
- !IS_ALIGNED(next, PMD_SIZE)) {
- WARN_ONCE(1, "%s: unaligned range\n", __func__);
- continue;
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ pages++;
}
- pte_clear(&init_mm, addr, (pte_t *)pmd);
- pages++;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ }
+#endif
continue;
}
pte_base = (pte_t *)pmd_page_vaddr(*pmd);
- remove_pte_table(pte_base, addr, next, direct);
+ remove_pte_table(pte_base, addr, next, direct, altmap);
free_pte_table(pte_base, pmd);
}
if (direct)
@@ -809,7 +856,8 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
}
static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
- unsigned long end, bool direct)
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long next, pages = 0;
pmd_t *pmd_base;
@@ -834,15 +882,16 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
}
pmd_base = pud_pgtable(*pud);
- remove_pmd_table(pmd_base, addr, next, direct);
+ remove_pmd_table(pmd_base, addr, next, direct, altmap);
free_pmd_table(pmd_base, pud);
}
if (direct)
update_page_count(MMU_PAGE_1G, -pages);
}
-static void __meminit remove_pagetable(unsigned long start, unsigned long end,
- bool direct)
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long addr, next;
pud_t *pud_base;
@@ -871,7 +920,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end,
}
pud_base = p4d_pgtable(*p4d);
- remove_pud_table(pud_base, addr, next, direct);
+ remove_pud_table(pud_base, addr, next, direct, altmap);
free_pud_table(pud_base, p4d);
}
@@ -894,7 +943,7 @@ int __meminit radix__create_section_mapping(unsigned long start,
int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
- remove_pagetable(start, end, true);
+ remove_pagetable(start, end, true, NULL);
return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -926,10 +975,429 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
return 0;
}
+
+bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+{
+ if (radix_enabled())
+ return __vmemmap_can_optimize(altmap, pgmap);
+
+ return false;
+}
+
+int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
+ unsigned long addr, unsigned long next)
+{
+ int large = pmd_large(*pmdp);
+
+ if (large)
+ vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
+
+ return large;
+}
+
+void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+ pte_t entry;
+ pte_t *ptep = pmdp_ptep(pmdp);
+
+ VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, ptep, entry);
+ asm volatile("ptesync": : :"memory");
+
+ vmemmap_verify(ptep, node, addr, next);
+}
+
+static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
+ int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pte_t *pte = pte_offset_kernel(pmdp, addr);
+
+ if (pte_none(*pte)) {
+ pte_t entry;
+ void *p;
+
+ if (!reuse) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
+ altmap = NULL;
+
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+ if (!p && altmap)
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
+ if (!p)
+ return NULL;
+ pr_debug("PAGE_SIZE vmemmap mapping\n");
+ } else {
+ /*
+ * When a PTE/PMD entry is freed from the init_mm
+ * there's a free_pages() call to this page allocated
+ * above. Thus this get_page() is paired with the
+ * put_page_testzero() on the freeing path.
+ * This can only be called by certain ZONE_DEVICE paths,
+ * and through vmemmap_populate_compound_pages() when
+ * slab is available.
+ */
+ get_page(reuse);
+ p = page_to_virt(reuse);
+ pr_debug("Tail page reuse vmemmap mapping\n");
+ }
+
+ VM_BUG_ON(!PAGE_ALIGNED(addr));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ asm volatile("ptesync": : :"memory");
+ }
+ return pte;
+}
+
+static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
+ unsigned long address)
+{
+ pud_t *pud;
+
+ /* To keep things simple, all early vmemmap mappings are done at PAGE_SIZE */
+ if (unlikely(p4d_none(*p4dp))) {
+ if (unlikely(!slab_is_available())) {
+ pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ p4d_populate(&init_mm, p4dp, pud);
+ /* go to the pud_offset */
+ } else
+ return pud_alloc(&init_mm, p4dp, address);
+ }
+ return pud_offset(p4dp, address);
+}
+
+static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
+ unsigned long address)
+{
+ pmd_t *pmd;
+
+ /* To keep things simple, all early vmemmap mappings are done at PAGE_SIZE */
+ if (unlikely(pud_none(*pudp))) {
+ if (unlikely(!slab_is_available())) {
+ pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pud_populate(&init_mm, pudp, pmd);
+ } else
+ return pmd_alloc(&init_mm, pudp, address);
+ }
+ return pmd_offset(pudp, address);
+}
+
+static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
+ unsigned long address)
+{
+ pte_t *pte;
+
+ /* To keep things simple, all early vmemmap mappings are done at PAGE_SIZE */
+ if (unlikely(pmd_none(*pmdp))) {
+ if (unlikely(!slab_is_available())) {
+ pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pmd_populate(&init_mm, pmdp, pte);
+ } else
+ return pte_alloc_kernel(pmdp, address);
+ }
+ return pte_offset_kernel(pmdp, address);
+}
+
+
+
+int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_none(READ_ONCE(*pmd))) {
+ void *p;
+
+ /*
+ * Keep it simple by checking the PMD_SIZE alignment of addr
+ * and verifying the device boundary condition.
+ * For us to use a pmd mapping, both addr and pfn should
+ * be aligned. We skip if addr is not aligned, and for
+ * pfn we hope the extra area in the altmap helps us find
+ * an aligned block. This can result in altmap block
+ * allocation failures, in which case we fall back to RAM
+ * for the vmemmap allocation.
+ */
+ if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
+ altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ goto base_mapping;
+ }
+
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+ if (p) {
+ vmemmap_set_pmd(pmd, p, node, addr, next);
+ pr_debug("PMD_SIZE vmemmap mapping\n");
+ continue;
+ } else if (altmap) {
+ /*
+ * A vmemmap block allocation can fail due to
+ * alignment requirements: trying to align things
+ * aggressively can run us out of altmap space.
+ * Try a base mapping on failure.
+ */
+ goto base_mapping;
+ }
+ } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
+ /*
+ * If a huge mapping exists due to an early call to
+ * vmemmap_populate(), let's try to use that.
+ */
+ continue;
+ }
+base_mapping:
+ /*
+ * Not able to allocate higher order memory to back the memmap,
+ * or we found a pointer to a pte page. Allocate base page
+ * size vmemmap.
+ */
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+ next = addr + PAGE_SIZE;
+ }
+ return 0;
+}
+
+static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return NULL;
+ radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ return pte;
+}
+
+static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
+ unsigned long pfn_offset, int node)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long map_addr;
+
+ /* the second vmemmap page which we use for duplication */
+ map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
+ pgd = pgd_offset_k(map_addr);
+ p4d = p4d_offset(pgd, map_addr);
+ pud = vmemmap_pud_alloc(p4d, node, map_addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, map_addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, map_addr);
+ if (!pte)
+ return NULL;
+ /*
+ * Check if there exists a mapping to the left
+ */
+ if (pte_none(*pte)) {
+ /*
+ * Populate the head page vmemmap page.
+ * It can fall in different pmd, hence
+ * vmemmap_populate_address()
+ */
+ pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ /*
+ * Populate the tail pages vmemmap page
+ */
+ pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
+ return pte;
+ }
+ return pte;
+}
+
+int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+ unsigned long start,
+ unsigned long end, int node,
+ struct dev_pagemap *pgmap)
+{
+ /*
+ * We want to map things with base page size mappings so that
+ * we can save space in the vmemmap. We could have a huge mapping
+ * covering both edges.
+ */
+ unsigned long addr;
+ unsigned long addr_pfn = start_pfn;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr = next) {
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_leaf(READ_ONCE(*pmd))) {
+ /* existing huge mapping. Skip the range */
+ addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
+ next = pmd_addr_end(addr, end);
+ continue;
+ }
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+ if (!pte_none(*pte)) {
+ /*
+ * This could be because we already have a compound
+ * page whose VMEMMAP_RESERVE_NR pages were mapped and
+ * this request falls within those pages.
+ */
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ } else {
+ unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+ unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
+ pte_t *tail_page_pte;
+
+ /*
+ * if the address is aligned to huge page size it is the
+ * head mapping.
+ */
+ if (pfn_offset == 0) {
+ /* Populate the head page vmemmap page */
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ /*
+ * Populate the tail pages vmemmap page
+ * It can fall in different pmd, hence
+ * vmemmap_populate_address()
+ */
+ pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ addr_pfn += 2;
+ next = addr + 2 * PAGE_SIZE;
+ continue;
+ }
+ /*
+ * Get the 2nd mapping's details.
+ * Also create it if it doesn't exist.
+ */
+ tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
+ if (!tail_page_pte) {
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+ }
+ return 0;
+}
+
+
#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
- remove_pagetable(start, start + page_size, false);
+ remove_pagetable(start, start + page_size, true, NULL);
+}
+
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ remove_pagetable(start, end, false, altmap);
}
#endif
#endif
@@ -962,7 +1430,24 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
#endif
old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
- trace_hugepage_update(addr, old, clr, set);
+ trace_hugepage_update_pmd(addr, old, clr, set);
+
+ return old;
+}
+
+unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, unsigned long clr,
+ unsigned long set)
+{
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pud_devmap(*pudp));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+#endif
+
+ old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
+ trace_hugepage_update_pud(addr, old, clr, set);
return old;
}
@@ -1043,6 +1528,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
return old_pmd;
}
+pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t old_pud;
+ unsigned long old;
+
+ old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
+ old_pud = __pud(old);
+ return old_pud;
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
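
To put the compound-page vmemmap deduplication above in perspective, here is a rough userspace estimate of the saving for a 1G device page. It assumes 4K base pages and a 64-byte struct page, and approximates the deduplicated case as one head plus one shared tail vmemmap page; none of these numbers come from the patch itself.

#include <stdio.h>

#define SZ_1G            (1UL << 30)
#define BASE_PAGE        4096UL   /* 4K base pages (assumption; ppc64 often uses 64K) */
#define STRUCT_PAGE_SZ   64UL     /* assumed sizeof(struct page) */

int main(void)
{
	unsigned long nr_pages = SZ_1G / BASE_PAGE;        /* base pages in one 1G compound page */
	unsigned long full_memmap = nr_pages * STRUCT_PAGE_SZ;
	/* roughly: one head vmemmap page plus one shared tail vmemmap page stay backed */
	unsigned long dedup_memmap = 2 * BASE_PAGE;

	printf("memmap per 1G compound page: %lu KB without dedup, ~%lu KB with dedup\n",
	       full_memmap >> 10, dedup_memmap >> 10);
	return 0;
}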
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 0bd4866d9824..3020a8b38572 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -987,6 +987,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
}
}
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -1020,6 +1021,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
}
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
void radix__flush_all_mm(struct mm_struct *mm)
@@ -1228,6 +1230,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
}
out:
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
@@ -1392,6 +1395,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
}
out:
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
@@ -1461,6 +1465,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
}
EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
+}
+EXPORT_SYMBOL(radix__flush_pud_tlb_range);
+
void radix__flush_tlb_all(void)
{
unsigned long rb,prs,r,rs;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 5bfdf6ecfa96..fafce6bdeff0 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -469,7 +469,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (is_exec)
flags |= FAULT_FLAG_INSTRUCTION;
-#ifdef CONFIG_PER_VMA_LOCK
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
@@ -501,7 +500,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
return user_mode(regs) ? 0 : SIGBUS;
lock_mmap:
-#endif /* CONFIG_PER_VMA_LOCK */
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
@@ -551,9 +549,7 @@ retry:
mmap_read_unlock(current->mm);
-#ifdef CONFIG_PER_VMA_LOCK
done:
-#endif
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index fe1b83020e0d..f7930360406e 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -92,7 +92,7 @@ static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_ad
* a page table lookup here because with the hash translation we don't keep
* vmemmap details in linux page table.
*/
-static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
+int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
{
struct page *start;
unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size;
@@ -183,8 +183,8 @@ static __meminit int vmemmap_list_populate(unsigned long phys,
return 0;
}
-static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
- unsigned long page_size)
+bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+ unsigned long page_size)
{
unsigned long nr_pfn = page_size / sizeof(struct page);
unsigned long start_pfn = page_to_pfn((struct page *)start);
@@ -198,8 +198,8 @@ static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long star
return false;
}
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
- struct vmem_altmap *altmap)
+static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
{
bool altmap_alloc;
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
@@ -272,6 +272,18 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
return 0;
}
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (radix_enabled())
+ return radix__vmemmap_populate(start, end, node, altmap);
+#endif
+
+ return __vmemmap_populate(start, end, node, altmap);
+}
+
#ifdef CONFIG_MEMORY_HOTPLUG
static unsigned long vmemmap_list_free(unsigned long start)
{
@@ -303,8 +315,8 @@ static unsigned long vmemmap_list_free(unsigned long start)
return vmem_back->phys;
}
-void __ref vmemmap_free(unsigned long start, unsigned long end,
- struct vmem_altmap *altmap)
+static void __ref __vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
{
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
unsigned long page_order = get_order(page_size);
@@ -362,6 +374,17 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
vmemmap_remove_mapping(start, page_size);
}
}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (radix_enabled())
+ return radix__vmemmap_free(start, end, altmap);
+#endif
+ return __vmemmap_free(start, end, altmap);
+}
+
#endif
void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index 4f12504fb405..705e8e8ffde4 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -41,7 +41,7 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
return __ioremap_caller(addr, size, prot, caller);
}
-void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags)
{
pte_t pte = __pte(flags);
void *caller = __builtin_return_address(0);
@@ -74,27 +74,3 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa,
return 0;
}
-
-void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
- pgprot_t prot, void *caller)
-{
- struct vm_struct *area;
- int ret;
- unsigned long va;
-
- area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller);
- if (area == NULL)
- return NULL;
-
- area->phys_addr = pa;
- va = (unsigned long)area->addr;
-
- ret = ioremap_page_range(va, va + size, pa, prot);
- if (!ret)
- return (void __iomem *)area->addr + offset;
-
- vunmap_range(va, va + size);
- free_vm_area(area);
-
- return NULL;
-}
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
index 9d13143b8be4..ca5bc6be3e6f 100644
--- a/arch/powerpc/mm/ioremap_32.c
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -22,6 +22,13 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
int err;
/*
+ * If the address lies within the first 16 MB, assume it's in ISA
+ * memory space
+ */
+ if (addr < SZ_16M)
+ addr += _ISA_MEM_BASE;
+
+ /*
* Choose an address to map it to.
* Once the vmalloc system is running, we use it.
* Before then, we use space going down from IOREMAP_TOP
@@ -31,13 +38,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
offset = addr & ~PAGE_MASK;
size = PAGE_ALIGN(addr + size) - p;
- /*
- * If the address lies within the first 16 MB, assume it's in ISA
- * memory space
- */
- if (p < 16 * 1024 * 1024)
- p += _ISA_MEM_BASE;
-
#ifndef CONFIG_CRASH_DUMP
/*
* Don't allow anybody to remap normal RAM that we're using.
@@ -63,7 +63,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
return (void __iomem *)v + offset;
if (slab_is_available())
- return do_ioremap(p, offset, size, prot, caller);
+ return generic_ioremap_prot(addr, size, prot);
/*
* Should check if it is a candidate for a BAT mapping
@@ -87,7 +87,6 @@ void iounmap(volatile void __iomem *addr)
if (v_block_mapped((unsigned long)addr))
return;
- if (addr > high_memory && (unsigned long)addr < ioremap_bot)
- vunmap((void *)(PAGE_MASK & (unsigned long)addr));
+ generic_iounmap(addr);
}
EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
index 3acece00b33e..d24e5f166723 100644
--- a/arch/powerpc/mm/ioremap_64.c
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -29,7 +29,7 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
return NULL;
if (slab_is_available())
- return do_ioremap(paligned, offset, size, prot, caller);
+ return generic_ioremap_prot(addr, size, prot);
pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller);
@@ -49,17 +49,9 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
*/
void iounmap(volatile void __iomem *token)
{
- void *addr;
-
if (!slab_is_available())
return;
- addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK);
-
- if ((unsigned long)addr < ioremap_bot) {
- pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr);
- return;
- }
- vunmap(addr);
+ generic_iounmap(PCI_FIX_ADDR(token));
}
EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index 20652daa1d7e..8c31802f97e8 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -18,15 +18,15 @@
void pte_frag_destroy(void *pte_frag)
{
int count;
- struct page *page;
+ struct ptdesc *ptdesc;
- page = virt_to_page(pte_frag);
+ ptdesc = virt_to_ptdesc(pte_frag);
/* drop all the pending references */
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
- if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) {
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
@@ -55,25 +55,25 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm)
static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
{
void *ret = NULL;
- struct page *page;
+ struct ptdesc *ptdesc;
if (!kernel) {
- page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT);
- if (!page)
+ ptdesc = pagetable_alloc(PGALLOC_GFP | __GFP_ACCOUNT, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
+ if (!pagetable_pte_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
} else {
- page = alloc_page(PGALLOC_GFP);
- if (!page)
+ ptdesc = pagetable_alloc(PGALLOC_GFP, 0);
+ if (!ptdesc)
return NULL;
}
- atomic_set(&page->pt_frag_refcount, 1);
+ atomic_set(&ptdesc->pt_frag_refcount, 1);
- ret = page_address(page);
+ ret = ptdesc_address(ptdesc);
/*
* if we support only one fragment just return the
* allocated page.
@@ -82,12 +82,12 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
return ret;
spin_lock(&mm->page_table_lock);
/*
- * If we find pgtable_page set, we return
+ * If we find ptdesc_page set, we return
* the allocated page with single fragment
* count.
*/
if (likely(!pte_frag_get(&mm->context))) {
- atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR);
+ atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR);
pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE);
}
spin_unlock(&mm->page_table_lock);
@@ -106,17 +106,40 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel)
return __alloc_for_ptecache(mm, kernel);
}
-void pte_fragment_free(unsigned long *table, int kernel)
+static void pte_free_now(struct rcu_head *head)
{
- struct page *page = virt_to_page(table);
+ struct ptdesc *ptdesc;
- if (PageReserved(page))
- return free_reserved_page(page);
+ ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
- BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
- if (atomic_dec_and_test(&page->pt_frag_refcount)) {
- if (!kernel)
- pgtable_pte_page_dtor(page);
- __free_page(page);
+void pte_fragment_free(unsigned long *table, int kernel)
+{
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
+
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
+
+ BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+ if (kernel)
+ pagetable_free(ptdesc);
+ else if (folio_test_clear_active(ptdesc_folio(ptdesc)))
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
+ else
+ pte_free_now(&ptdesc->pt_rcu_head);
}
}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+ struct page *page;
+
+ page = virt_to_page(pgtable);
+ SetPageActive(page);
+ pte_fragment_free((unsigned long *)pgtable, 0);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
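[Annotation, not part of the patch] The hunk above is representative of the ptdesc conversion repeated across architectures in this series: page-table pages are allocated and freed through struct ptdesc helpers instead of raw struct page calls, and pte_free_defer() reuses the ptdesc RCU head so a detached table stays visible to lockless walkers until a grace period has passed. A condensed sketch of that lifecycle, using only the helpers visible in these hunks; the example_*() names and GFP flags are illustrative, not part of the patch.
/* Illustrative sketch only: the ptdesc lifecycle used by the hunk above
 * and by the later riscv/s390/sparc/x86 hunks.  example_*() names and
 * the GFP flags are made up; the helpers are the ones this series adds. */
#include <linux/mm.h>
static pte_t *example_pte_alloc(void)
{
	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
	if (!ptdesc)
		return NULL;
	if (!pagetable_pte_ctor(ptdesc)) {	/* split ptlock + accounting */
		pagetable_free(ptdesc);
		return NULL;
	}
	return ptdesc_address(ptdesc);		/* kernel virtual address */
}
static void example_pte_free_now(struct rcu_head *head)
{
	struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
	pagetable_pte_dtor(ptdesc);
	pagetable_free(ptdesc);
}
static void example_pte_free(pte_t *pte, bool deferred)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pte);
	if (deferred)	/* pte_free_defer(): wait for an RCU grace period */
		call_rcu(&ptdesc->pt_rcu_head, example_pte_free_now);
	else
		example_pte_free_now(&ptdesc->pt_rcu_head);
}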
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index cb2dcdb18f8e..a3dcdb2d5b4b 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -311,6 +311,8 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
if (mm == &init_mm)
return;
@@ -329,8 +331,10 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
*/
if (pmd_none(*pmd))
return;
- BUG_ON(!pmd_present(*pmd));
- assert_spin_locked(pte_lockptr(mm, pmd));
+ pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
+ BUG_ON(!pte);
+ assert_spin_locked(ptl);
+ pte_unmap(pte);
}
#endif /* CONFIG_DEBUG_VM */
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 45fd975ef521..340b86ef7284 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -94,6 +94,7 @@ config PPC_BOOK3S_64
select PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
select ARCH_ENABLE_SPLIT_PMD_PTLOCK
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9c62c2c3b3d0..4f3d6a2f9065 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -637,7 +637,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
nid = first_online_node;
/* Add the memory */
- rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE);
+ rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_MEMMAP_ON_MEMORY);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index fae747cc57d2..ee17270d35d0 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1084,7 +1084,7 @@ cmds(struct pt_regs *excp)
memzcan();
break;
case 'i':
- show_mem(0, NULL);
+ show_mem();
break;
default:
termch = cmd;
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 4c07b9189c86..6943d34c1ec1 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -53,7 +53,7 @@ config RISCV
select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL
- select ARCH_WANT_OPTIMIZE_VMEMMAP
+ select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
select BUILDTIME_TABLE_SORT if MMU
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index ce1ebda1a49a..34e24f078cc1 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -2,6 +2,7 @@
#ifndef _ASM_RISCV_HUGETLB_H
#define _ASM_RISCV_HUGETLB_H
+#include <asm/cacheflush.h>
#include <asm/page.h>
static inline void arch_clear_hugepage_flags(struct page *page)
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 59dc12b5b7e8..d169a4f41a2e 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -153,10 +153,10 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
#endif /* __PAGETABLE_PMD_FOLDED */
-#define __pte_free_tlb(tlb, pte, buf) \
-do { \
- pgtable_pte_page_dtor(pte); \
- tlb_remove_page((tlb), pte); \
+#define __pte_free_tlb(tlb, pte, buf) \
+do { \
+ pagetable_pte_dtor(page_ptdesc(pte)); \
+ tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
} while (0)
#endif /* CONFIG_MMU */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 75970ee2bda2..44377f0d7c35 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -499,7 +499,7 @@ static inline void __set_pte_at(struct mm_struct *mm,
static inline void set_pte_at(struct mm_struct *mm,
unsigned long addr, pte_t *ptep, pte_t pteval)
{
- page_table_check_pte_set(mm, addr, ptep, pteval);
+ page_table_check_pte_set(mm, ptep, pteval);
__set_pte_at(mm, addr, ptep, pteval);
}
@@ -529,7 +529,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
{
pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0));
- page_table_check_pte_clear(mm, address, pte);
+ page_table_check_pte_clear(mm, pte);
return pte;
}
@@ -687,14 +687,14 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
- page_table_check_pmd_set(mm, addr, pmdp, pmd);
+ page_table_check_pmd_set(mm, pmdp, pmd);
return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd));
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
- page_table_check_pud_set(mm, addr, pudp, pud);
+ page_table_check_pud_set(mm, pudp, pud);
return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud));
}
@@ -742,7 +742,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
{
pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0));
- page_table_check_pmd_clear(mm, address, pmd);
+ page_table_check_pmd_clear(mm, pmd);
return pmd;
}
@@ -758,7 +758,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
- page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
+ page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);
return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd)));
}
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 6ea2cce4cc17..046732fcb48c 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -283,7 +283,6 @@ void handle_page_fault(struct pt_regs *regs)
flags |= FAULT_FLAG_WRITE;
else if (cause == EXC_INST_PAGE_FAULT)
flags |= FAULT_FLAG_INSTRUCTION;
-#ifdef CONFIG_PER_VMA_LOCK
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
@@ -311,7 +310,6 @@ void handle_page_fault(struct pt_regs *regs)
return;
}
lock_mmap:
-#endif /* CONFIG_PER_VMA_LOCK */
retry:
vma = lock_mm_and_find_vma(mm, addr, regs);
@@ -368,9 +366,7 @@ retry:
mmap_read_unlock(mm);
-#ifdef CONFIG_PER_VMA_LOCK
done:
-#endif
if (unlikely(fault & VM_FAULT_ERROR)) {
tsk->thread.bad_cause = cause;
mm_fault_error(regs, addr, fault);
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 9ce504737d18..430a3d05a841 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -353,12 +353,10 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va)
static phys_addr_t __init alloc_pte_late(uintptr_t va)
{
- unsigned long vaddr;
-
- vaddr = __get_free_page(GFP_KERNEL);
- BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page((void *)vaddr)));
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
- return __pa(vaddr);
+ BUG_ON(!ptdesc || !pagetable_pte_ctor(ptdesc));
+ return __pa((pte_t *)ptdesc_address(ptdesc));
}
static void __init create_pte_mapping(pte_t *ptep,
@@ -436,12 +434,10 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va)
static phys_addr_t __init alloc_pmd_late(uintptr_t va)
{
- unsigned long vaddr;
-
- vaddr = __get_free_page(GFP_KERNEL);
- BUG_ON(!vaddr || !pgtable_pmd_page_ctor(virt_to_page((void *)vaddr)));
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
- return __pa(vaddr);
+ BUG_ON(!ptdesc || !pagetable_pmd_ctor(ptdesc));
+ return __pa((pmd_t *)ptdesc_address(ptdesc));
}
static void __init create_pmd_mapping(pmd_t *pmdp,
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 5b39918b7042..8ff6d1c21e38 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -127,7 +127,7 @@ config S390
select ARCH_WANTS_NO_INSTR
select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_IPC_PARSE_VERSION
- select ARCH_WANT_OPTIMIZE_VMEMMAP
+ select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select BUILDTIME_TABLE_SORT
select CLONE_BACKWARDS2
select DMA_OPS if PCI
@@ -143,6 +143,7 @@ config S390
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
select GENERIC_VDSO_TIME_NS
+ select GENERIC_IOREMAP if PCI
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL
diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h
index e3882b012bfa..4453ad7c11ac 100644
--- a/arch/s390/include/asm/io.h
+++ b/arch/s390/include/asm/io.h
@@ -22,11 +22,18 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
#define IO_SPACE_LIMIT 0
-void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot);
-void __iomem *ioremap(phys_addr_t addr, size_t size);
-void __iomem *ioremap_wc(phys_addr_t addr, size_t size);
-void __iomem *ioremap_wt(phys_addr_t addr, size_t size);
-void iounmap(volatile void __iomem *addr);
+/*
+ * I/O memory mapping functions.
+ */
+#define ioremap_prot ioremap_prot
+#define iounmap iounmap
+
+#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL)
+
+#define ioremap_wc(addr, size) \
+ ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL)))
+#define ioremap_wt(addr, size) \
+ ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL)))
static inline void __iomem *ioport_map(unsigned long port, unsigned int nr)
{
@@ -51,10 +58,6 @@ static inline void ioport_unmap(void __iomem *p)
#define pci_iomap_wc pci_iomap_wc
#define pci_iomap_wc_range pci_iomap_wc_range
-#define ioremap ioremap
-#define ioremap_wt ioremap_wt
-#define ioremap_wc ioremap_wc
-
#define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count)
#define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count)
#define memset_io(dst, val, count) zpci_memset_io(dst, val, count)
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 17eb618f1348..376b4b23bdaa 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -86,7 +86,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
if (!table)
return NULL;
crst_table_init(table, _SEGMENT_ENTRY_EMPTY);
- if (!pgtable_pmd_page_ctor(virt_to_page(table))) {
+ if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) {
crst_table_free(mm, table);
return NULL;
}
@@ -97,7 +97,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
if (mm_pmd_folded(mm))
return;
- pgtable_pmd_page_dtor(virt_to_page(pmd));
+ pagetable_pmd_dtor(virt_to_ptdesc(pmd));
crst_table_free(mm, (unsigned long *) pmd);
}
@@ -143,6 +143,10 @@ static inline void pmd_populate(struct mm_struct *mm,
#define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte)
#define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte)
+/* arch use pte_free_defer() implementation in arch/s390/mm/pgalloc.c */
+#define pte_free_defer pte_free_defer
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
void vmem_map_init(void);
void *vmem_crst_alloc(unsigned long val);
pte_t *vmem_pte_alloc(void);
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index b91f4a9b044c..383b1f91442c 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -89,12 +89,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
{
if (mm_pmd_folded(tlb->mm))
return;
- pgtable_pmd_page_dtor(virt_to_page(pmd));
+ pagetable_pmd_dtor(virt_to_ptdesc(pmd));
__tlb_adjust_range(tlb, address, PAGE_SIZE);
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
tlb->cleared_puds = 1;
- tlb_remove_table(tlb, pmd);
+ tlb_remove_ptdesc(tlb, pmd);
}
/*
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 2f123429a291..6f6b9881e55e 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -407,7 +407,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
access = VM_WRITE;
if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
-#ifdef CONFIG_PER_VMA_LOCK
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
vma = lock_vma_under_rcu(mm, address);
@@ -432,7 +431,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
goto out;
}
lock_mmap:
-#endif /* CONFIG_PER_VMA_LOCK */
mmap_read_lock(mm);
gmap = NULL;
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 66ab68db9842..07fc660a24aa 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -43,17 +43,17 @@ __initcall(page_table_register_sysctl);
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
- struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
- if (!page)
+ if (!ptdesc)
return NULL;
- arch_set_page_dat(page, CRST_ALLOC_ORDER);
- return (unsigned long *) page_to_virt(page);
+ arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER);
+ return (unsigned long *) ptdesc_to_virt(ptdesc);
}
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
- free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+ pagetable_free(virt_to_ptdesc(table));
}
static void __crst_table_upgrade(void *arg)
@@ -140,21 +140,21 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
struct page *page_table_alloc_pgste(struct mm_struct *mm)
{
- struct page *page;
+ struct ptdesc *ptdesc;
u64 *table;
- page = alloc_page(GFP_KERNEL);
- if (page) {
- table = (u64 *)page_to_virt(page);
+ ptdesc = pagetable_alloc(GFP_KERNEL, 0);
+ if (ptdesc) {
+ table = (u64 *)ptdesc_to_virt(ptdesc);
memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
}
- return page;
+ return ptdesc_page(ptdesc);
}
void page_table_free_pgste(struct page *page)
{
- __free_page(page);
+ pagetable_free(page_ptdesc(page));
}
#endif /* CONFIG_PGSTE */
@@ -229,11 +229,20 @@ void page_table_free_pgste(struct page *page)
* logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
* while the PP bits are never used, nor such a page is added to or removed
* from mm_context_t::pgtable_list.
+ *
+ * pte_free_defer() overrides those rules: it takes the page off pgtable_list,
+ * and prevents both 2K fragments from being reused. pte_free_defer() has to
+ * guarantee that its pgtable cannot be reused before the RCU grace period
+ * has elapsed (which page_table_free_rcu() does not actually guarantee).
+ * But for simplicity, because page->rcu_head overlays page->lru, and because
+ * the RCU callback might not be called before the mm_context_t has been freed,
+ * pte_free_defer() in this implementation prevents both fragments from being
+ * reused, and delays making the call to RCU until both fragments are freed.
*/
unsigned long *page_table_alloc(struct mm_struct *mm)
{
unsigned long *table;
- struct page *page;
+ struct ptdesc *ptdesc;
unsigned int mask, bit;
/* Try to get a fragment of a 4K page as a 2K page table */
@@ -241,9 +250,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
table = NULL;
spin_lock_bh(&mm->context.lock);
if (!list_empty(&mm->context.pgtable_list)) {
- page = list_first_entry(&mm->context.pgtable_list,
- struct page, lru);
- mask = atomic_read(&page->_refcount) >> 24;
+ ptdesc = list_first_entry(&mm->context.pgtable_list,
+ struct ptdesc, pt_list);
+ mask = atomic_read(&ptdesc->_refcount) >> 24;
/*
* The pending removal bits must also be checked.
* Failure to do so might lead to an impossible
@@ -255,13 +264,13 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
*/
mask = (mask | (mask >> 4)) & 0x03U;
if (mask != 0x03U) {
- table = (unsigned long *) page_to_virt(page);
+ table = (unsigned long *) ptdesc_to_virt(ptdesc);
bit = mask & 1; /* =1 -> second 2K */
if (bit)
table += PTRS_PER_PTE;
- atomic_xor_bits(&page->_refcount,
+ atomic_xor_bits(&ptdesc->_refcount,
0x01U << (bit + 24));
- list_del(&page->lru);
+ list_del_init(&ptdesc->pt_list);
}
}
spin_unlock_bh(&mm->context.lock);
@@ -269,27 +278,28 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
return table;
}
/* Allocate a fresh page */
- page = alloc_page(GFP_KERNEL);
- if (!page)
+ ptdesc = pagetable_alloc(GFP_KERNEL, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
+ if (!pagetable_pte_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- arch_set_page_dat(page, 0);
+ arch_set_page_dat(ptdesc_page(ptdesc), 0);
/* Initialize page table */
- table = (unsigned long *) page_to_virt(page);
+ table = (unsigned long *) ptdesc_to_virt(ptdesc);
if (mm_alloc_pgste(mm)) {
/* Return 4K page table with PGSTEs */
- atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ INIT_LIST_HEAD(&ptdesc->pt_list);
+ atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
} else {
/* Return the first 2K fragment of the page */
- atomic_xor_bits(&page->_refcount, 0x01U << 24);
+ atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24);
memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
spin_lock_bh(&mm->context.lock);
- list_add(&page->lru, &mm->context.pgtable_list);
+ list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
spin_unlock_bh(&mm->context.lock);
}
return table;
@@ -300,7 +310,9 @@ static void page_table_release_check(struct page *page, void *table,
{
char msg[128];
- if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
+ if (!IS_ENABLED(CONFIG_DEBUG_VM))
+ return;
+ if (!mask && list_empty(&page->lru))
return;
snprintf(msg, sizeof(msg),
"Invalid pgtable %p release half 0x%02x mask 0x%02x",
@@ -308,12 +320,20 @@ static void page_table_release_check(struct page *page, void *table,
dump_page(page, msg);
}
+static void pte_free_now(struct rcu_head *head)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
+
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
unsigned int mask, bit, half;
- struct page *page;
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
- page = virt_to_page(table);
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
@@ -323,42 +343,50 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
* will happen outside of the critical section from this
* function or from __tlb_remove_table()
*/
- mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+ mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
mask >>= 24;
- if (mask & 0x03U)
- list_add(&page->lru, &mm->context.pgtable_list);
- else
- list_del(&page->lru);
+ if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
+ /*
+ * Other half is allocated, and neither half has had
+ * its free deferred: add page to head of list, to make
+ * this freed half available for immediate reuse.
+ */
+ list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
+ } else {
+ /* If page is on list, now remove it. */
+ list_del_init(&ptdesc->pt_list);
+ }
spin_unlock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+ mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24));
mask >>= 24;
if (mask != 0x00U)
return;
half = 0x01U << bit;
} else {
half = 0x03U;
- mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
mask >>= 24;
}
- page_table_release_check(page, table, half, mask);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
+ if (folio_test_clear_active(ptdesc_folio(ptdesc)))
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
+ else
+ pte_free_now(&ptdesc->pt_rcu_head);
}
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
unsigned long vmaddr)
{
struct mm_struct *mm;
- struct page *page;
unsigned int bit, mask;
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
mm = tlb->mm;
- page = virt_to_page(table);
if (mm_alloc_pgste(mm)) {
gmap_unlink(mm, table, vmaddr);
table = (unsigned long *) ((unsigned long)table | 0x03U);
- tlb_remove_table(tlb, table);
+ tlb_remove_ptdesc(tlb, table);
return;
}
bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
@@ -368,12 +396,20 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
* outside of the critical section from __tlb_remove_table() or from
* page_table_free()
*/
- mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+ mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
mask >>= 24;
- if (mask & 0x03U)
- list_add_tail(&page->lru, &mm->context.pgtable_list);
- else
- list_del(&page->lru);
+ if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
+ /*
+ * Other half is allocated, and neither half has had
+ * its free deferred: add page to end of list, to make
+ * this freed half available for reuse once its pending
+ * bit has been cleared by __tlb_remove_table().
+ */
+ list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list);
+ } else {
+ /* If page is on list, now remove it. */
+ list_del_init(&ptdesc->pt_list);
+ }
spin_unlock_bh(&mm->context.lock);
table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
tlb_remove_table(tlb, table);
@@ -383,30 +419,48 @@ void __tlb_remove_table(void *_table)
{
unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
void *table = (void *)((unsigned long) _table ^ mask);
- struct page *page = virt_to_page(table);
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
switch (half) {
case 0x00U: /* pmd, pud, or p4d */
- free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+ pagetable_free(ptdesc);
return;
case 0x01U: /* lower 2K of a 4K page table */
case 0x02U: /* higher 2K of a 4K page table */
- mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
+ mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24));
mask >>= 24;
if (mask != 0x00U)
return;
break;
case 0x03U: /* 4K page table with pgstes */
- mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
mask >>= 24;
break;
}
- page_table_release_check(page, table, half, mask);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
+ if (folio_test_clear_active(ptdesc_folio(ptdesc)))
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
+ else
+ pte_free_now(&ptdesc->pt_rcu_head);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+ struct page *page;
+
+ page = virt_to_page(pgtable);
+ SetPageActive(page);
+ page_table_free(mm, (unsigned long *)pgtable);
+ /*
+ * page_table_free() does not do the pgste gmap_unlink() which
+ * page_table_free_rcu() does: warn us if pgste ever reaches here.
+ */
+ WARN_ON_ONCE(mm_has_pgste(mm));
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/*
* Base infrastructure required to generate basic asces, region, segment,
* and page tables that do not make use of enhanced features like EDAT1.
@@ -432,16 +486,20 @@ static void base_pgt_free(unsigned long *table)
static unsigned long *base_crst_alloc(unsigned long val)
{
unsigned long *table;
+ struct ptdesc *ptdesc;
- table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
- if (table)
- crst_table_init(table, val);
+ ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, CRST_ALLOC_ORDER);
+ if (!ptdesc)
+ return NULL;
+ table = ptdesc_address(ptdesc);
+
+ crst_table_init(table, val);
return table;
}
static void base_crst_free(unsigned long *table)
{
- free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+ pagetable_free(virt_to_ptdesc(table));
}
#define BASE_ADDR_END_FUNC(NAME, SIZE) \
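[Annotation, not part of the patch] Reading aid for the s390 fragment accounting in the hunks above: the upper byte of the ptdesc _refcount carries the per-2K-half state that the atomic_xor_bits() calls toggle. The names below are not kernel macros; they only spell out the constants used above (the AA/PP terminology comes from the existing comment block in this file).
/* Reading aid only, not kernel macros: bit layout of the _refcount
 * upper byte after the "mask >>= 24" shifts in the hunks above. */
#define EXAMPLE_AA_LOWER	0x01U	/* lower 2K half allocated */
#define EXAMPLE_AA_UPPER	0x02U	/* upper 2K half allocated */
#define EXAMPLE_PP_LOWER	0x10U	/* lower half pending removal */
#define EXAMPLE_PP_UPPER	0x20U	/* upper half pending removal */
/*
 * page_table_free() XORs 0x11U << (bit + 24): one atomic step clears the
 * half's AA bit and sets its PP bit, and the page is only released once
 * both nibbles read zero.  pte_free_defer() additionally sets PG_active
 * so the final release goes through call_rcu(), and such a page is kept
 * off pgtable_list so neither half is reused early.
 */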
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index afc3f33788da..d34d5813d006 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -244,62 +244,25 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
zpci_memcpy_toio(to, from, count);
}
-static void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
{
- unsigned long offset, vaddr;
- struct vm_struct *area;
- phys_addr_t last_addr;
-
- last_addr = addr + size - 1;
- if (!size || last_addr < addr)
- return NULL;
-
+ /*
+ * When PCI MIO instructions are unavailable the "physical" address
+ * encodes a hint for accessing the PCI memory space it represents.
+ * Just pass it unchanged such that ioread/iowrite can decode it.
+ */
if (!static_branch_unlikely(&have_mio))
- return (void __iomem *) addr;
+ return (void __iomem *)phys_addr;
- offset = addr & ~PAGE_MASK;
- addr &= PAGE_MASK;
- size = PAGE_ALIGN(size + offset);
- area = get_vm_area(size, VM_IOREMAP);
- if (!area)
- return NULL;
-
- vaddr = (unsigned long) area->addr;
- if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) {
- free_vm_area(area);
- return NULL;
- }
- return (void __iomem *) ((unsigned long) area->addr + offset);
-}
-
-void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
-{
- return __ioremap(addr, size, __pgprot(prot));
+ return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
}
EXPORT_SYMBOL(ioremap_prot);
-void __iomem *ioremap(phys_addr_t addr, size_t size)
-{
- return __ioremap(addr, size, PAGE_KERNEL);
-}
-EXPORT_SYMBOL(ioremap);
-
-void __iomem *ioremap_wc(phys_addr_t addr, size_t size)
-{
- return __ioremap(addr, size, pgprot_writecombine(PAGE_KERNEL));
-}
-EXPORT_SYMBOL(ioremap_wc);
-
-void __iomem *ioremap_wt(phys_addr_t addr, size_t size)
-{
- return __ioremap(addr, size, pgprot_writethrough(PAGE_KERNEL));
-}
-EXPORT_SYMBOL(ioremap_wt);
-
void iounmap(volatile void __iomem *addr)
{
if (static_branch_likely(&have_mio))
- vunmap((__force void *) ((unsigned long) addr & PAGE_MASK));
+ generic_iounmap(addr);
}
EXPORT_SYMBOL(iounmap);
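[Annotation, not part of the patch] With GENERIC_IOREMAP selected and _PAGE_IOREMAP defined in asm/io.h, the arch keeps only the thin ioremap_prot()/iounmap() wrappers shown above while generic_ioremap_prot()/generic_iounmap() do the vmap work; driver code is unchanged. A minimal driver-side sketch under that assumption (the mapped address and length are made up).
/* Hypothetical driver-side usage; the physical address and length are
 * illustrative.  On s390, ioremap_wc() expands to ioremap_prot() with a
 * write-combining pgprot (see the asm/io.h hunk above), which in turn
 * wraps generic_ioremap_prot(). */
#include <linux/io.h>
static void __iomem *example_map_regs(phys_addr_t phys, size_t len)
{
	return ioremap_wc(phys, len);
}
static void example_unmap_regs(void __iomem *regs)
{
	iounmap(regs);		/* generic_iounmap() behind the arch wrapper */
}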
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 2b3ce4fd3956..6be32254211c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -29,6 +29,7 @@ config SUPERH
select GENERIC_SMP_IDLE_THREAD
select GUP_GET_PXX_LOW_HIGH if X2TLB
select HAS_IOPORT if HAS_IOPORT_MAP
+ select GENERIC_IOREMAP if MMU
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_KGDB
select HAVE_ARCH_SECCOMP_FILTER
diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h
index d8f3537ef57f..f2f38e9d489a 100644
--- a/arch/sh/include/asm/io.h
+++ b/arch/sh/include/asm/io.h
@@ -119,6 +119,30 @@ void __raw_readsl(const void __iomem *addr, void *data, int longlen);
__BUILD_MEMORY_STRING(__raw_, q, u64)
+#define ioport_map ioport_map
+#define ioport_unmap ioport_unmap
+#define pci_iounmap pci_iounmap
+
+#define ioread8 ioread8
+#define ioread16 ioread16
+#define ioread16be ioread16be
+#define ioread32 ioread32
+#define ioread32be ioread32be
+
+#define iowrite8 iowrite8
+#define iowrite16 iowrite16
+#define iowrite16be iowrite16be
+#define iowrite32 iowrite32
+#define iowrite32be iowrite32be
+
+#define ioread8_rep ioread8_rep
+#define ioread16_rep ioread16_rep
+#define ioread32_rep ioread32_rep
+
+#define iowrite8_rep iowrite8_rep
+#define iowrite16_rep iowrite16_rep
+#define iowrite32_rep iowrite32_rep
+
#ifdef CONFIG_HAS_IOPORT_MAP
/*
@@ -221,10 +245,33 @@ __BUILD_IOPORT_STRING(q, u64)
#endif
+#define inb(addr) inb(addr)
+#define inw(addr) inw(addr)
+#define inl(addr) inl(addr)
+#define outb(x, addr) outb((x), (addr))
+#define outw(x, addr) outw((x), (addr))
+#define outl(x, addr) outl((x), (addr))
+
+#define inb_p(addr) inb(addr)
+#define inw_p(addr) inw(addr)
+#define inl_p(addr) inl(addr)
+#define outb_p(x, addr) outb((x), (addr))
+#define outw_p(x, addr) outw((x), (addr))
+#define outl_p(x, addr) outl((x), (addr))
+
+#define insb insb
+#define insw insw
+#define insl insl
+#define outsb outsb
+#define outsw outsw
+#define outsl outsl
#define IO_SPACE_LIMIT 0xffffffff
/* We really want to try and get these to memcpy etc */
+#define memset_io memset_io
+#define memcpy_fromio memcpy_fromio
+#define memcpy_toio memcpy_toio
void memcpy_fromio(void *, const volatile void __iomem *, unsigned long);
void memcpy_toio(volatile void __iomem *, const void *, unsigned long);
void memset_io(volatile void __iomem *, int, unsigned long);
@@ -243,40 +290,16 @@ unsigned long long poke_real_address_q(unsigned long long addr,
#endif
#ifdef CONFIG_MMU
-void iounmap(void __iomem *addr);
-void __iomem *__ioremap_caller(phys_addr_t offset, unsigned long size,
- pgprot_t prot, void *caller);
-
-static inline void __iomem *ioremap(phys_addr_t offset, unsigned long size)
-{
- return __ioremap_caller(offset, size, PAGE_KERNEL_NOCACHE,
- __builtin_return_address(0));
-}
-
-static inline void __iomem *
-ioremap_cache(phys_addr_t offset, unsigned long size)
-{
- return __ioremap_caller(offset, size, PAGE_KERNEL,
- __builtin_return_address(0));
-}
-#define ioremap_cache ioremap_cache
-
-#ifdef CONFIG_HAVE_IOREMAP_PROT
-static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size,
- unsigned long flags)
-{
- return __ioremap_caller(offset, size, __pgprot(flags),
- __builtin_return_address(0));
-}
-#endif /* CONFIG_HAVE_IOREMAP_PROT */
+/*
+ * I/O memory mapping functions.
+ */
+#define ioremap_prot ioremap_prot
+#define iounmap iounmap
-#else /* CONFIG_MMU */
-static inline void __iomem *ioremap(phys_addr_t offset, size_t size)
-{
- return (void __iomem *)(unsigned long)offset;
-}
+#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL_NOCACHE)
-static inline void iounmap(volatile void __iomem *addr) { }
+#define ioremap_cache(addr, size) \
+ ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL))
#endif /* CONFIG_MMU */
#define ioremap_uc ioremap
@@ -288,6 +311,8 @@ static inline void iounmap(volatile void __iomem *addr) { }
#define xlate_dev_mem_ptr(p) __va(p)
#define unxlate_dev_mem_ptr(p, v) do { } while (0)
+#include <asm-generic/io.h>
+
#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
int valid_phys_addr_range(phys_addr_t addr, size_t size);
int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);
diff --git a/arch/sh/include/asm/io_noioport.h b/arch/sh/include/asm/io_noioport.h
index f7938fe0f911..12dad91f41c1 100644
--- a/arch/sh/include/asm/io_noioport.h
+++ b/arch/sh/include/asm/io_noioport.h
@@ -46,13 +46,6 @@ static inline void ioport_unmap(void __iomem *addr)
BUG();
}
-#define inb_p(addr) inb(addr)
-#define inw_p(addr) inw(addr)
-#define inl_p(addr) inl(addr)
-#define outb_p(x, addr) outb((x), (addr))
-#define outw_p(x, addr) outw((x), (addr))
-#define outl_p(x, addr) outl((x), (addr))
-
static inline void insb(unsigned long port, void *dst, unsigned long count)
{
BUG();
diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h
index a9e98233c4d4..5d8577ab1591 100644
--- a/arch/sh/include/asm/pgalloc.h
+++ b/arch/sh/include/asm/pgalloc.h
@@ -2,6 +2,7 @@
#ifndef __ASM_SH_PGALLOC_H
#define __ASM_SH_PGALLOC_H
+#include <linux/mm.h>
#include <asm/page.h>
#define __HAVE_ARCH_PMD_ALLOC_ONE
@@ -31,10 +32,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
}
-#define __pte_free_tlb(tlb,pte,addr) \
-do { \
- pgtable_pte_page_dtor(pte); \
- tlb_remove_page((tlb), (pte)); \
+#define __pte_free_tlb(tlb, pte, addr) \
+do { \
+ pagetable_pte_dtor(page_ptdesc(pte)); \
+ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
} while (0)
#endif /* __ASM_SH_PGALLOC_H */
diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c
index 21342581144d..c33b3daa4ad1 100644
--- a/arch/sh/mm/ioremap.c
+++ b/arch/sh/mm/ioremap.c
@@ -72,22 +72,11 @@ __ioremap_29bit(phys_addr_t offset, unsigned long size, pgprot_t prot)
#define __ioremap_29bit(offset, size, prot) NULL
#endif /* CONFIG_29BIT */
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem * __ref
-__ioremap_caller(phys_addr_t phys_addr, unsigned long size,
- pgprot_t pgprot, void *caller)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
{
- struct vm_struct *area;
- unsigned long offset, last_addr, addr, orig_addr;
void __iomem *mapped;
+ pgprot_t pgprot = __pgprot(prot);
mapped = __ioremap_trapped(phys_addr, size);
if (mapped)
@@ -97,11 +86,6 @@ __ioremap_caller(phys_addr_t phys_addr, unsigned long size,
if (mapped)
return mapped;
- /* Don't allow wraparound or zero size */
- last_addr = phys_addr + size - 1;
- if (!size || last_addr < phys_addr)
- return NULL;
-
/*
* If we can't yet use the regular approach, go the fixmap route.
*/
@@ -112,34 +96,14 @@ __ioremap_caller(phys_addr_t phys_addr, unsigned long size,
* First try to remap through the PMB.
* PMB entries are all pre-faulted.
*/
- mapped = pmb_remap_caller(phys_addr, size, pgprot, caller);
+ mapped = pmb_remap_caller(phys_addr, size, pgprot,
+ __builtin_return_address(0));
if (mapped && !IS_ERR(mapped))
return mapped;
- /*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
- /*
- * Ok, go for it..
- */
- area = get_vm_area_caller(size, VM_IOREMAP, caller);
- if (!area)
- return NULL;
- area->phys_addr = phys_addr;
- orig_addr = addr = (unsigned long)area->addr;
-
- if (ioremap_page_range(addr, addr + size, phys_addr, pgprot)) {
- vunmap((void *)orig_addr);
- return NULL;
- }
-
- return (void __iomem *)(offset + (char *)orig_addr);
+ return generic_ioremap_prot(phys_addr, size, pgprot);
}
-EXPORT_SYMBOL(__ioremap_caller);
+EXPORT_SYMBOL(ioremap_prot);
/*
* Simple checks for non-translatable mappings.
@@ -158,10 +122,9 @@ static inline int iomapping_nontranslatable(unsigned long offset)
return 0;
}
-void iounmap(void __iomem *addr)
+void iounmap(volatile void __iomem *addr)
{
unsigned long vaddr = (unsigned long __force)addr;
- struct vm_struct *p;
/*
* Nothing to do if there is no translatable mapping.
@@ -172,21 +135,15 @@ void iounmap(void __iomem *addr)
/*
* There's no VMA if it's from an early fixed mapping.
*/
- if (iounmap_fixed(addr) == 0)
+ if (iounmap_fixed((void __iomem *)addr) == 0)
return;
/*
* If the PMB handled it, there's nothing else to do.
*/
- if (pmb_unmap(addr) == 0)
+ if (pmb_unmap((void __iomem *)addr) == 0)
return;
- p = remove_vm_area((void *)(vaddr & PAGE_MASK));
- if (!p) {
- printk(KERN_ERR "%s: bad address %p\n", __func__, addr);
- return;
- }
-
- kfree(p);
+ generic_iounmap(addr);
}
EXPORT_SYMBOL(iounmap);
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h
index 7b5561d17ab1..caa7632be4c2 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -65,6 +65,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm);
void pte_free_kernel(struct mm_struct *mm, pte_t *pte);
void pte_free(struct mm_struct *mm, pgtable_t ptepage);
+/* arch use pte_free_defer() implementation in arch/sparc/mm/init_64.c */
+#define pte_free_defer pte_free_defer
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
#define pmd_populate_kernel(MM, PMD, PTE) pmd_set(MM, PMD, PTE)
#define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE)
diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c
index 1adf5c1c16b8..34ef7febf0d5 100644
--- a/arch/sparc/kernel/setup_32.c
+++ b/arch/sparc/kernel/setup_32.c
@@ -83,7 +83,7 @@ static void prom_sync_me(void)
"nop\n\t" : : "r" (&trapbase));
prom_printf("PROM SYNC COMMAND...\n");
- show_free_areas(0, NULL);
+ show_mem();
if (!is_idle_task(current)) {
local_irq_enable();
ksys_sync();
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 04f9db0c3111..9a63a3e08e40 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2893,14 +2893,15 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
- struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page)
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
+
+ if (!ptdesc)
return NULL;
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
+ if (!pagetable_pte_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- return (pte_t *) page_address(page);
+ return ptdesc_address(ptdesc);
}
void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
@@ -2910,10 +2911,10 @@ void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
static void __pte_free(pgtable_t pte)
{
- struct page *page = virt_to_page(pte);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pte);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
void pte_free(struct mm_struct *mm, pgtable_t pte)
@@ -2930,6 +2931,22 @@ void pgtable_free(void *table, bool is_page)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pte_free_now(struct rcu_head *head)
+{
+ struct page *page;
+
+ page = container_of(head, struct page, rcu_head);
+ __pte_free((pgtable_t)page_address(page));
+}
+
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+ struct page *page;
+
+ page = virt_to_page(pgtable);
+ call_rcu(&page->rcu_head, pte_free_now);
+}
+
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd)
{
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index 13f027afc875..8393faa3e596 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -355,7 +355,8 @@ pgtable_t pte_alloc_one(struct mm_struct *mm)
return NULL;
page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT);
spin_lock(&mm->page_table_lock);
- if (page_ref_inc_return(page) == 2 && !pgtable_pte_page_ctor(page)) {
+ if (page_ref_inc_return(page) == 2 &&
+ !pagetable_pte_ctor(page_ptdesc(page))) {
page_ref_dec(page);
ptep = NULL;
}
@@ -371,7 +372,7 @@ void pte_free(struct mm_struct *mm, pgtable_t ptep)
page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT);
spin_lock(&mm->page_table_lock);
if (page_ref_dec_return(page) == 1)
- pgtable_pte_page_dtor(page);
+ pagetable_pte_dtor(page_ptdesc(page));
spin_unlock(&mm->page_table_lock);
srmmu_free_nocache(ptep, SRMMU_PTE_TABLE_SIZE);
diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index 8ec7cd46dd96..de5e31c64793 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -25,19 +25,19 @@
*/
extern pgd_t *pgd_alloc(struct mm_struct *);
-#define __pte_free_tlb(tlb,pte, address) \
-do { \
- pgtable_pte_page_dtor(pte); \
- tlb_remove_page((tlb),(pte)); \
+#define __pte_free_tlb(tlb, pte, address) \
+do { \
+ pagetable_pte_dtor(page_ptdesc(pte)); \
+ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \
} while (0)
#ifdef CONFIG_3_LEVEL_PGTABLES
-#define __pmd_free_tlb(tlb, pmd, address) \
-do { \
- pgtable_pmd_page_dtor(virt_to_page(pmd)); \
- tlb_remove_page((tlb),virt_to_page(pmd)); \
-} while (0) \
+#define __pmd_free_tlb(tlb, pmd, address) \
+do { \
+ pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \
+ tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \
+} while (0)
#endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7422db409770..d0258e92a8af 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -102,6 +102,7 @@ config X86
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
@@ -128,7 +129,8 @@ config X86
select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_HUGE_PMD_SHARE
select ARCH_WANT_LD_ORPHAN_WARN
- select ARCH_WANT_OPTIMIZE_VMEMMAP if X86_64
+ select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64
+ select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
select ARCH_WANTS_THP_SWAP if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select BUILDTIME_TABLE_SORT
@@ -2609,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES
def_bool y
depends on ARCH_ENABLE_MEMORY_HOTPLUG
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
- def_bool y
-
menu "Power management and ACPI options"
config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index e9025640f634..76238842406a 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -35,9 +35,6 @@
* - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*/
-#define ARCH_HAS_IOREMAP_WC
-#define ARCH_HAS_IOREMAP_WT
-
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/cc_platform.h>
@@ -212,8 +209,6 @@ void memset_io(volatile void __iomem *, int, size_t);
#define memcpy_toio memcpy_toio
#define memset_io memset_io
-#include <asm-generic/iomap.h>
-
/*
* ISA space is 'always mapped' on a typical x86 system, no need to
* explicitly ioremap() it. The fact that the ISA IO space is mapped
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5700bb337987..ada1bbf12961 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1023,21 +1023,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
- page_table_check_pte_set(mm, addr, ptep, pte);
+ page_table_check_pte_set(mm, ptep, pte);
set_pte(ptep, pte);
}
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
- page_table_check_pmd_set(mm, addr, pmdp, pmd);
+ page_table_check_pmd_set(mm, pmdp, pmd);
set_pmd(pmdp, pmd);
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
- page_table_check_pud_set(mm, addr, pudp, pud);
+ page_table_check_pud_set(mm, pudp, pud);
native_set_pud(pudp, pud);
}
@@ -1068,7 +1068,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep);
- page_table_check_pte_clear(mm, addr, pte);
+ page_table_check_pte_clear(mm, pte);
return pte;
}
@@ -1084,7 +1084,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
* care about updates and native needs no locking
*/
pte = native_local_ptep_get_and_clear(ptep);
- page_table_check_pte_clear(mm, addr, pte);
+ page_table_check_pte_clear(mm, pte);
} else {
pte = ptep_get_and_clear(mm, addr, ptep);
}
@@ -1133,7 +1133,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long
{
pmd_t pmd = native_pmdp_get_and_clear(pmdp);
- page_table_check_pmd_clear(mm, addr, pmd);
+ page_table_check_pmd_clear(mm, pmd);
return pmd;
}
@@ -1144,7 +1144,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
{
pud_t pud = native_pudp_get_and_clear(pudp);
- page_table_check_pud_clear(mm, addr, pud);
+ page_table_check_pud_clear(mm, pud);
return pud;
}
@@ -1167,7 +1167,7 @@ static inline int pud_write(pud_t pud)
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
- page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
+ page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);
if (IS_ENABLED(CONFIG_SMP)) {
return xchg(pmdp, pmd);
} else {
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 80450e1d5385..6ab42caaa67a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -3,6 +3,7 @@
#define _ASM_X86_TLBFLUSH_H
#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <asm/processor.h>
@@ -253,6 +254,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
}
+static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
+{
+ bool should_defer = false;
+
+ /* If remote CPUs need to be flushed then defer batch the flush */
+ if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+ should_defer = true;
+ put_cpu();
+
+ return should_defer;
+}
+
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
/*
@@ -264,11 +277,18 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
return atomic64_inc_return(&mm->context.tlb_gen);
}
-static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm)
+static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
{
inc_mm_tlb_gen(mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
+static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
+{
+ flush_tlb_mm(mm);
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
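[Annotation, not part of the patch] The hunk above renames arch_tlbbatch_add_mm() to arch_tlbbatch_add_pending() (now taking the unmapped address) and adds arch_tlbbatch_should_defer() plus arch_flush_tlb_batched_pending(). A simplified, hypothetical caller showing how the hooks fit together on the unmap side; the real batching logic lives in core mm code and is more involved than this.
/* Hypothetical, simplified caller of the hooks declared above: decide
 * whether batching is worthwhile, record each unmapped address, then
 * issue one flush for the whole batch. */
#include <asm/tlbflush.h>
static void example_unmap_batch(struct mm_struct *mm,
				struct arch_tlbflush_unmap_batch *batch,
				unsigned long *uaddrs, int nr)
{
	int i;
	if (!arch_tlbbatch_should_defer(mm)) {
		flush_tlb_mm(mm);	/* no remote CPUs: flush right away */
		return;
	}
	for (i = 0; i < nr; i++)
		arch_tlbbatch_add_pending(batch, mm, uaddrs[i]);
	arch_tlbbatch_flush(batch);	/* single IPI round for the batch */
}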
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e8711b2cafaf..787da09d24f3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1328,7 +1328,6 @@ void do_user_addr_fault(struct pt_regs *regs,
}
#endif
-#ifdef CONFIG_PER_VMA_LOCK
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
@@ -1358,7 +1357,6 @@ void do_user_addr_fault(struct pt_regs *regs,
return;
}
lock_mmap:
-#endif /* CONFIG_PER_VMA_LOCK */
retry:
vma = lock_mm_and_find_vma(mm, address, regs);
@@ -1418,9 +1416,7 @@ retry:
}
mmap_read_unlock(mm);
-#ifdef CONFIG_PER_VMA_LOCK
done:
-#endif
if (likely(!(fault & VM_FAULT_ERROR)))
return;
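[Annotation, not part of the patch] The #ifdef CONFIG_PER_VMA_LOCK removals in the riscv, s390 and x86 fault handlers above leave the same two-stage flow compiled in unconditionally. A condensed sketch of that flow; handle_mm_fault(), vma_end_read(), FAULT_FLAG_VMA_LOCK and VM_FAULT_COMPLETED are assumed from the core mm API and do not appear in these hunks.
/* Condensed sketch of the now-unconditional fault flow; error paths and
 * retry accounting are omitted.  handle_mm_fault(), vma_end_read(),
 * FAULT_FLAG_VMA_LOCK and VM_FAULT_COMPLETED are assumptions here. */
static vm_fault_t example_fault(struct pt_regs *regs, struct mm_struct *mm,
				unsigned long address, unsigned int flags)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;
	if (!(flags & FAULT_FLAG_USER))
		goto lock_mmap;
	/* Fast path: per-VMA read lock, mmap_lock not taken. */
	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;
	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);
	if (!(fault & VM_FAULT_RETRY))
		return fault;
	/* The VMA lock was already dropped; retry under mmap_lock. */
lock_mmap:
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (!vma)
		return VM_FAULT_SIGSEGV;	/* mmap_lock not held on failure */
	fault = handle_mm_fault(vma, address, flags, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		mmap_read_unlock(mm);
	return fault;
}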
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 15a8009a4480..d3a93e8766ee 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -52,7 +52,7 @@ early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
- pgtable_pte_page_dtor(pte);
+ pagetable_pte_dtor(page_ptdesc(pte));
paravirt_release_pte(page_to_pfn(pte));
paravirt_tlb_remove_table(tlb, pte);
}
@@ -60,7 +60,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
- struct page *page = virt_to_page(pmd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
/*
* NOTE! For PAE, any changes to the top page-directory-pointer-table
@@ -69,8 +69,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
- pgtable_pmd_page_dtor(page);
- paravirt_tlb_remove_table(tlb, page);
+ pagetable_pmd_dtor(ptdesc);
+ paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
}
#if CONFIG_PGTABLE_LEVELS > 3
@@ -92,16 +92,16 @@ void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
static inline void pgd_list_add(pgd_t *pgd)
{
- struct page *page = virt_to_page(pgd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
- list_add(&page->lru, &pgd_list);
+ list_add(&ptdesc->pt_list, &pgd_list);
}
static inline void pgd_list_del(pgd_t *pgd)
{
- struct page *page = virt_to_page(pgd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
- list_del(&page->lru);
+ list_del(&ptdesc->pt_list);
}
#define UNSHARED_PTRS_PER_PGD \
@@ -112,12 +112,12 @@ static inline void pgd_list_del(pgd_t *pgd)
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
- virt_to_page(pgd)->pt_mm = mm;
+ virt_to_ptdesc(pgd)->pt_mm = mm;
}
struct mm_struct *pgd_page_get_mm(struct page *page)
{
- return page->pt_mm;
+ return page_ptdesc(page)->pt_mm;
}
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
@@ -213,11 +213,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
+ struct ptdesc *ptdesc;
for (i = 0; i < count; i++)
if (pmds[i]) {
- pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
- free_page((unsigned long)pmds[i]);
+ ptdesc = virt_to_ptdesc(pmds[i]);
+
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
mm_dec_nr_pmds(mm);
}
}
@@ -230,18 +233,24 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
+ gfp &= ~__GFP_HIGHMEM;
for (i = 0; i < count; i++) {
- pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
- if (!pmd)
+ pmd_t *pmd = NULL;
+ struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);
+
+ if (!ptdesc)
failed = true;
- if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
- free_page((unsigned long)pmd);
- pmd = NULL;
+ if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
+ ptdesc = NULL;
failed = true;
}
- if (pmd)
+ if (ptdesc) {
mm_inc_nr_pmds(mm);
+ pmd = ptdesc_address(ptdesc);
+ }
+
pmds[i] = pmd;
}
@@ -830,7 +839,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
free_page((unsigned long)pmd_sv);
- pgtable_pmd_page_dtor(virt_to_page(pmd));
+ pagetable_pmd_dtor(virt_to_ptdesc(pmd));
free_page((unsigned long)pmd);
return 1;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 267acf27480a..2d253919b3e8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -10,6 +10,7 @@
#include <linux/debugfs.h>
#include <linux/sched/smt.h>
#include <linux/task_work.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -1036,6 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
put_flush_tlb_info();
put_cpu();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index e0a975165de7..8796ec310483 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -667,7 +667,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
spinlock_t *ptl = NULL;
#if USE_SPLIT_PTE_PTLOCKS
- ptl = ptlock_ptr(page);
+ ptl = ptlock_ptr(page_ptdesc(page));
spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 2a51a466779f..a5488cc40f58 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -28,6 +28,7 @@ config XTENSA
select GENERIC_LIB_UCMPDI2
select GENERIC_PCI_IOMAP
select GENERIC_SCHED_CLOCK
+ select GENERIC_IOREMAP if MMU
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL
diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h
index a5b707e1c0f4..934e58399c8c 100644
--- a/arch/xtensa/include/asm/io.h
+++ b/arch/xtensa/include/asm/io.h
@@ -16,6 +16,7 @@
#include <asm/vectors.h>
#include <linux/bug.h>
#include <linux/kernel.h>
+#include <linux/pgtable.h>
#include <linux/types.h>
@@ -24,22 +25,24 @@
#define PCI_IOBASE ((void __iomem *)XCHAL_KIO_BYPASS_VADDR)
#ifdef CONFIG_MMU
-
-void __iomem *xtensa_ioremap_nocache(unsigned long addr, unsigned long size);
-void __iomem *xtensa_ioremap_cache(unsigned long addr, unsigned long size);
-void xtensa_iounmap(volatile void __iomem *addr);
-
/*
- * Return the virtual address for the specified bus memory.
+ * I/O memory mapping functions.
*/
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot);
+#define ioremap_prot ioremap_prot
+#define iounmap iounmap
+
static inline void __iomem *ioremap(unsigned long offset, unsigned long size)
{
if (offset >= XCHAL_KIO_PADDR
&& offset - XCHAL_KIO_PADDR < XCHAL_KIO_SIZE)
return (void*)(offset-XCHAL_KIO_PADDR+XCHAL_KIO_BYPASS_VADDR);
else
- return xtensa_ioremap_nocache(offset, size);
+ return ioremap_prot(offset, size,
+ pgprot_val(pgprot_noncached(PAGE_KERNEL)));
}
+#define ioremap ioremap
static inline void __iomem *ioremap_cache(unsigned long offset,
unsigned long size)
@@ -48,21 +51,10 @@ static inline void __iomem *ioremap_cache(unsigned long offset,
&& offset - XCHAL_KIO_PADDR < XCHAL_KIO_SIZE)
return (void*)(offset-XCHAL_KIO_PADDR+XCHAL_KIO_CACHED_VADDR);
else
- return xtensa_ioremap_cache(offset, size);
-}
-#define ioremap_cache ioremap_cache
+ return ioremap_prot(offset, size, pgprot_val(PAGE_KERNEL));
-static inline void iounmap(volatile void __iomem *addr)
-{
- unsigned long va = (unsigned long) addr;
-
- if (!(va >= XCHAL_KIO_CACHED_VADDR &&
- va - XCHAL_KIO_CACHED_VADDR < XCHAL_KIO_SIZE) &&
- !(va >= XCHAL_KIO_BYPASS_VADDR &&
- va - XCHAL_KIO_BYPASS_VADDR < XCHAL_KIO_SIZE))
- xtensa_iounmap(addr);
}
-
+#define ioremap_cache ioremap_cache
#endif /* CONFIG_MMU */
#include <asm-generic/io.h>
diff --git a/arch/xtensa/mm/ioremap.c b/arch/xtensa/mm/ioremap.c
index a400188c16b9..8ca660b7ab49 100644
--- a/arch/xtensa/mm/ioremap.c
+++ b/arch/xtensa/mm/ioremap.c
@@ -6,60 +6,30 @@
*/
#include <linux/io.h>
-#include <linux/vmalloc.h>
#include <linux/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
-static void __iomem *xtensa_ioremap(unsigned long paddr, unsigned long size,
- pgprot_t prot)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
{
- unsigned long offset = paddr & ~PAGE_MASK;
- unsigned long pfn = __phys_to_pfn(paddr);
- struct vm_struct *area;
- unsigned long vaddr;
- int err;
-
- paddr &= PAGE_MASK;
-
+ unsigned long pfn = __phys_to_pfn((phys_addr));
WARN_ON(pfn_valid(pfn));
- size = PAGE_ALIGN(offset + size);
-
- area = get_vm_area(size, VM_IOREMAP);
- if (!area)
- return NULL;
-
- vaddr = (unsigned long)area->addr;
- area->phys_addr = paddr;
-
- err = ioremap_page_range(vaddr, vaddr + size, paddr, prot);
-
- if (err) {
- vunmap((void *)vaddr);
- return NULL;
- }
-
- flush_cache_vmap(vaddr, vaddr + size);
- return (void __iomem *)(offset + vaddr);
-}
-
-void __iomem *xtensa_ioremap_nocache(unsigned long addr, unsigned long size)
-{
- return xtensa_ioremap(addr, size, pgprot_noncached(PAGE_KERNEL));
+ return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
}
-EXPORT_SYMBOL(xtensa_ioremap_nocache);
+EXPORT_SYMBOL(ioremap_prot);
-void __iomem *xtensa_ioremap_cache(unsigned long addr, unsigned long size)
+void iounmap(volatile void __iomem *addr)
{
- return xtensa_ioremap(addr, size, PAGE_KERNEL);
-}
-EXPORT_SYMBOL(xtensa_ioremap_cache);
+ unsigned long va = (unsigned long) addr;
-void xtensa_iounmap(volatile void __iomem *io_addr)
-{
- void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr);
+ if ((va >= XCHAL_KIO_CACHED_VADDR &&
+ va - XCHAL_KIO_CACHED_VADDR < XCHAL_KIO_SIZE) ||
+ (va >= XCHAL_KIO_BYPASS_VADDR &&
+ va - XCHAL_KIO_BYPASS_VADDR < XCHAL_KIO_SIZE))
+ return;
- vunmap(addr);
+ generic_iounmap(addr);
}
-EXPORT_SYMBOL(xtensa_iounmap);
+EXPORT_SYMBOL(iounmap);
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24f662d8bd39..d0c1a71007d0 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -211,8 +211,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
if (!info->length)
continue;
- if (mhp_supports_memmap_on_memory(info->length))
- mhp_flags |= MHP_MEMMAP_ON_MEMORY;
+ mhp_flags |= MHP_MEMMAP_ON_MEMORY;
result = __add_memory(mgid, info->start_addr, info->length,
mhp_flags);
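[Annotation, not part of the patch] The hunk above drops the caller-side mhp_supports_memmap_on_memory() gate, so ACPI now always requests MHP_MEMMAP_ON_MEMORY and, by implication, the hotplug core decides whether the request can be honoured. A minimal caller sketch under that assumption; example_add_range() is hypothetical.
/* Hypothetical caller: always request vmemmap-on-memory; whether it is
 * actually used is assumed to be decided inside the hotplug core. */
#include <linux/memory_hotplug.h>
static int example_add_range(int nid, u64 start, u64 size)
{
	return __add_memory(nid, start, size, MHP_MEMMAP_ON_MEMORY);
}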
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index b456ac213610..8191709c9ad2 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -105,7 +105,8 @@ EXPORT_SYMBOL(unregister_memory_notifier);
static void memory_block_release(struct device *dev)
{
struct memory_block *mem = to_memory_block(dev);
-
+ /* Verify that the altmap is freed */
+ WARN_ON(mem->altmap);
kfree(mem);
}
@@ -183,7 +184,7 @@ static int memory_block_online(struct memory_block *mem)
{
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
- unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+ unsigned long nr_vmemmap_pages = 0;
struct zone *zone;
int ret;
@@ -200,6 +201,9 @@ static int memory_block_online(struct memory_block *mem)
* stage helps to keep accounting easier to follow - e.g vmemmaps
* belong to the same zone as the memory they backed.
*/
+ if (mem->altmap)
+ nr_vmemmap_pages = mem->altmap->free;
+
if (nr_vmemmap_pages) {
ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
if (ret)
@@ -230,7 +234,7 @@ static int memory_block_offline(struct memory_block *mem)
{
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
- unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+ unsigned long nr_vmemmap_pages = 0;
int ret;
if (!mem->zone)
@@ -240,6 +244,9 @@ static int memory_block_offline(struct memory_block *mem)
* Unaccount before offlining, such that unpopulated zone and kthreads
* can properly be torn down in offline_pages().
*/
+ if (mem->altmap)
+ nr_vmemmap_pages = mem->altmap->free;
+
if (nr_vmemmap_pages)
adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
-nr_vmemmap_pages);
@@ -726,7 +733,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid,
#endif
static int add_memory_block(unsigned long block_id, unsigned long state,
- unsigned long nr_vmemmap_pages,
+ struct vmem_altmap *altmap,
struct memory_group *group)
{
struct memory_block *mem;
@@ -744,7 +751,7 @@ static int add_memory_block(unsigned long block_id, unsigned long state,
mem->start_section_nr = block_id * sections_per_block;
mem->state = state;
mem->nid = NUMA_NO_NODE;
- mem->nr_vmemmap_pages = nr_vmemmap_pages;
+ mem->altmap = altmap;
INIT_LIST_HEAD(&mem->group_next);
#ifndef CONFIG_NUMA
@@ -783,14 +790,14 @@ static int __init add_boot_memory_block(unsigned long base_section_nr)
if (section_count == 0)
return 0;
return add_memory_block(memory_block_id(base_section_nr),
- MEM_ONLINE, 0, NULL);
+ MEM_ONLINE, NULL, NULL);
}
static int add_hotplug_memory_block(unsigned long block_id,
- unsigned long nr_vmemmap_pages,
+ struct vmem_altmap *altmap,
struct memory_group *group)
{
- return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
+ return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
}
static void remove_memory_block(struct memory_block *memory)
@@ -818,7 +825,7 @@ static void remove_memory_block(struct memory_block *memory)
* Called under device_hotplug_lock.
*/
int create_memory_block_devices(unsigned long start, unsigned long size,
- unsigned long vmemmap_pages,
+ struct vmem_altmap *altmap,
struct memory_group *group)
{
const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
@@ -832,7 +839,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
return -EINVAL;
for (block_id = start_block_id; block_id != end_block_id; block_id++) {
- ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
+ ret = add_hotplug_memory_block(block_id, altmap, group);
if (ret)
break;
}
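The net effect of the memory.c changes above is that a memory block no longer caches a pre-computed vmemmap page count; it keeps the struct vmem_altmap pointer and derives the count on demand. A condensed sketch of the resulting accounting, assuming the mem->altmap field introduced by this series; the helper name is illustrative only:

    /* Sketch of the altmap-derived accounting used at online/offline time. */
    static unsigned long memory_block_vmemmap_pages(struct memory_block *mem)
    {
	    /*
	     * mem->altmap is only set for blocks added with
	     * MHP_MEMMAP_ON_MEMORY; altmap->free is the number of vmemmap
	     * pages carved out of the block itself.
	     */
	    return mem->altmap ? mem->altmap->free : 0;
    }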
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 9de524e56307..8e871ba9162f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -446,8 +446,8 @@ static ssize_t node_read_meminfo(struct device *dev,
"Node %d AnonHugePages: %8lu kB\n"
"Node %d ShmemHugePages: %8lu kB\n"
"Node %d ShmemPmdMapped: %8lu kB\n"
- "Node %d FileHugePages: %8lu kB\n"
- "Node %d FilePmdMapped: %8lu kB\n"
+ "Node %d FileHugePages: %8lu kB\n"
+ "Node %d FilePmdMapped: %8lu kB\n"
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
"Node %d Unaccepted: %8lu kB\n"
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 898ca9505754..c57acb73e3db 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -264,7 +264,7 @@ static int __init dax_kmem_init(void)
return rc;
error_dax_driver:
- destroy_memory_type(dax_slowmem_type);
+ put_memory_type(dax_slowmem_type);
err_dax_slowmem_type:
kfree_const(kmem_name);
return rc;
@@ -275,7 +275,7 @@ static void __exit dax_kmem_exit(void)
dax_driver_unregister(&device_dax_kmem_driver);
if (!any_hotremove_failed)
kfree_const(kmem_name);
- destroy_memory_type(dax_slowmem_type);
+ put_memory_type(dax_slowmem_type);
}
MODULE_AUTHOR("Intel Corporation");
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 5ff1a5a89d96..0b7bfbd0cb66 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2621,10 +2621,7 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
return -EFAULT;
}
- *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk &&
- vma->vm_end >= vma->vm_mm->start_brk) ||
- (vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack);
+ *is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma);
start_limit = max(vma->vm_start >> PAGE_SHIFT,
(unsigned long)ALIGN_DOWN(addr, 2UL << 8));
diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c
index 261352a23271..2596466cd5a6 100644
--- a/drivers/iommu/amd/iommu_v2.c
+++ b/drivers/iommu/amd/iommu_v2.c
@@ -355,9 +355,9 @@ static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
return container_of(mn, struct pasid_state, mn);
}
-static void mn_invalidate_range(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
+static void mn_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
struct pasid_state *pasid_state;
struct device_state *dev_state;
@@ -391,8 +391,8 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static const struct mmu_notifier_ops iommu_mn = {
- .release = mn_release,
- .invalidate_range = mn_invalidate_range,
+ .release = mn_release,
+ .arch_invalidate_secondary_tlbs = mn_arch_invalidate_secondary_tlbs,
};
static void set_pri_tag_status(struct pasid_state *pasid_state,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index a5a63b1c947e..dbc812a0e57e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -186,9 +186,10 @@ static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd)
}
}
-static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
+static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
{
struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn);
struct arm_smmu_domain *smmu_domain = smmu_mn->domain;
@@ -200,10 +201,20 @@ static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn,
* range. So do a simple translation here by calculating size correctly.
*/
size = end - start;
+ if (size == ULONG_MAX)
+ size = 0;
+
+ if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)) {
+ if (!size)
+ arm_smmu_tlb_inv_asid(smmu_domain->smmu,
+ smmu_mn->cd->asid);
+ else
+ arm_smmu_tlb_inv_range_asid(start, size,
+ smmu_mn->cd->asid,
+ PAGE_SIZE, false,
+ smmu_domain);
+ }
- if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM))
- arm_smmu_tlb_inv_range_asid(start, size, smmu_mn->cd->asid,
- PAGE_SIZE, false, smmu_domain);
arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size);
}
@@ -237,9 +248,9 @@ static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn)
}
static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = {
- .invalidate_range = arm_smmu_mm_invalidate_range,
- .release = arm_smmu_mm_release,
- .free_notifier = arm_smmu_mmu_notifier_free,
+ .arch_invalidate_secondary_tlbs = arm_smmu_mm_arch_invalidate_secondary_tlbs,
+ .release = arm_smmu_mm_release,
+ .free_notifier = arm_smmu_mmu_notifier_free,
};
/* Allocate or get existing MMU notifier for this {domain, mm} pair */
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index e95b339e9cdc..8f6d68006ab6 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -219,9 +219,9 @@ static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
}
/* Pages have been freed at this point */
-static void intel_invalidate_range(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
+static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
@@ -256,7 +256,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
static const struct mmu_notifier_ops intel_mmuops = {
.release = intel_mm_release,
- .invalidate_range = intel_invalidate_range,
+ .arch_invalidate_secondary_tlbs = intel_arch_invalidate_secondary_tlbs,
};
static DEFINE_MUTEX(pasid_mutex);
diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
index 4cf4c55a5f00..c06c699c0e7b 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -491,9 +491,9 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle)
}
EXPORT_SYMBOL_GPL(ocxl_link_release);
-static void invalidate_range(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
+static void arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
struct ocxl_link *link = pe_data->link;
@@ -509,7 +509,7 @@ static void invalidate_range(struct mmu_notifier *mn,
}
static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
- .invalidate_range = invalidate_range,
+ .arch_invalidate_secondary_tlbs = arch_invalidate_secondary_tlbs,
};
static u64 calculate_cfg_state(bool kernel)
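All four drivers above make the same mechanical change: the mmu_notifier callback formerly named invalidate_range is now arch_invalidate_secondary_tlbs, with an unchanged signature. A minimal registration sketch under that assumption; my_*, and dev_tlb_flush_range() in particular, are hypothetical driver-side names, not kernel APIs:

    #include <linux/mmu_notifier.h>

    static void my_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
    {
	    /* Flush the device/IOMMU TLB for [start, end) of this mm. */
	    dev_tlb_flush_range(mn, start, end);	/* hypothetical helper */
    }

    static const struct mmu_notifier_ops my_notifier_ops = {
	    /* formerly .invalidate_range */
	    .arch_invalidate_secondary_tlbs = my_arch_invalidate_secondary_tlbs,
    };

    static struct mmu_notifier my_notifier = { .ops = &my_notifier_ops };

    static int my_bind_mm(struct mm_struct *mm)
    {
	    return mmu_notifier_register(&my_notifier, mm);
    }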
diff --git a/drivers/net/ethernet/sfc/io.h b/drivers/net/ethernet/sfc/io.h
index 30439cc83a89..07f99ad14bf3 100644
--- a/drivers/net/ethernet/sfc/io.h
+++ b/drivers/net/ethernet/sfc/io.h
@@ -70,7 +70,7 @@
*/
#ifdef CONFIG_X86_64
/* PIO is a win only if write-combining is possible */
-#ifdef ARCH_HAS_IOREMAP_WC
+#ifdef ioremap_wc
#define EFX_USE_PIO 1
#endif
#endif
diff --git a/drivers/net/ethernet/sfc/siena/io.h b/drivers/net/ethernet/sfc/siena/io.h
index 30439cc83a89..07f99ad14bf3 100644
--- a/drivers/net/ethernet/sfc/siena/io.h
+++ b/drivers/net/ethernet/sfc/siena/io.h
@@ -70,7 +70,7 @@
*/
#ifdef CONFIG_X86_64
/* PIO is a win only if write-combining is possible */
-#ifdef ARCH_HAS_IOREMAP_WC
+#ifdef ioremap_wc
#define EFX_USE_PIO 1
#endif
#endif
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index af7d9301520c..18ad315581ca 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -100,7 +100,7 @@ static unsigned long *nd_pfn_supported_alignments(unsigned long *alignments)
if (has_transparent_hugepage()) {
alignments[1] = HPAGE_PMD_SIZE;
- if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+ if (has_transparent_pud_hugepage())
alignments[2] = HPAGE_PUD_SIZE;
}
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index b6e70c5cfa17..e1df63a88aac 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -342,7 +342,7 @@ static const struct sysrq_key_op sysrq_ftrace_dump_op = {
static void sysrq_handle_showmem(int key)
{
- show_mem(0, NULL);
+ show_mem();
}
static const struct sysrq_key_op sysrq_showmem_op = {
.handler = sysrq_handle_showmem,
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index be8313cdbac3..358f216c6cd6 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -606,7 +606,7 @@ static void fn_scroll_back(struct vc_data *vc)
static void fn_show_mem(struct vc_data *vc)
{
- show_mem(0, NULL);
+ show_mem();
}
static void fn_show_state(struct vc_data *vc)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index cebba4eaa0b5..12c0ae29f185 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
&path, sizeof(path),
&version, sizeof(version),
i_size_read(&v9inode->netfs.inode));
+ if (v9inode->netfs.cache)
+ mapping_set_release_always(inode->i_mapping);
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
inode, v9fs_inode_cookie(v9inode));
diff --git a/fs/Kconfig b/fs/Kconfig
index 18d034ec7953..f3be721bab6d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -169,6 +169,7 @@ source "fs/sysfs/Kconfig"
config TMPFS
bool "Tmpfs virtual memory file system support (former shm fs)"
depends on SHMEM
+ select MEMFD_CREATE
help
Tmpfs is a file system which keeps all files in virtual memory.
@@ -240,6 +241,7 @@ config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
depends on (SYSFS || SYSCTL)
+ select MEMFD_CREATE
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
@@ -252,7 +254,7 @@ config HUGETLB_PAGE
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
- depends on ARCH_WANT_OPTIMIZE_VMEMMAP
+ depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
@@ -264,9 +266,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
(boot command line) or hugetlb_optimize_vmemmap (sysctl).
-config MEMFD_CREATE
- def_bool TMPFS || HUGETLBFS
-
config ARCH_HAS_GIGANTIC_PAGE
bool
diff --git a/fs/affs/file.c b/fs/affs/file.c
index e43f2f007ac1..705e227ff63d 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -520,21 +520,20 @@ affs_getemptyblk_ino(struct inode *inode, int block)
return ERR_PTR(err);
}
-static int
-affs_do_readpage_ofs(struct page *page, unsigned to, int create)
+static int affs_do_read_folio_ofs(struct folio *folio, size_t to, int create)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh;
- unsigned pos = 0;
- u32 bidx, boff, bsize;
+ size_t pos = 0;
+ size_t bidx, boff, bsize;
u32 tmp;
- pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
- page->index, to);
- BUG_ON(to > PAGE_SIZE);
+ pr_debug("%s(%lu, %ld, 0, %zu)\n", __func__, inode->i_ino,
+ folio->index, to);
+ BUG_ON(to > folio_size(folio));
bsize = AFFS_SB(sb)->s_data_blksize;
- tmp = page->index << PAGE_SHIFT;
+ tmp = folio_pos(folio);
bidx = tmp / bsize;
boff = tmp % bsize;
@@ -544,7 +543,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create)
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
BUG_ON(pos + tmp > to || tmp > bsize);
- memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp);
+ memcpy_to_folio(folio, pos, AFFS_DATA(bh) + boff, tmp);
affs_brelse(bh);
bidx++;
pos += tmp;
@@ -624,25 +623,23 @@ out:
return PTR_ERR(bh);
}
-static int
-affs_read_folio_ofs(struct file *file, struct folio *folio)
+static int affs_read_folio_ofs(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
- u32 to;
+ struct inode *inode = folio->mapping->host;
+ size_t to;
int err;
- pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
- to = PAGE_SIZE;
- if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) {
- to = inode->i_size & ~PAGE_MASK;
- memset(page_address(page) + to, 0, PAGE_SIZE - to);
+ pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, folio->index);
+ to = folio_size(folio);
+ if (folio_pos(folio) + to > inode->i_size) {
+ to = inode->i_size - folio_pos(folio);
+ folio_zero_segment(folio, to, folio_size(folio));
}
- err = affs_do_readpage_ofs(page, to, 0);
+ err = affs_do_read_folio_ofs(folio, to, 0);
if (!err)
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return err;
}
@@ -651,7 +648,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
- struct page *page;
+ struct folio *folio;
pgoff_t index;
int err = 0;
@@ -667,19 +664,20 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
}
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ *pagep = &folio->page;
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(page, PAGE_SIZE, 1);
+ err = affs_do_read_folio_ofs(folio, folio_size(folio), 1);
if (err) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
return err;
}
@@ -688,6 +686,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh, *prev_bh;
@@ -701,18 +700,18 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
to = from + len;
/*
* XXX: not sure if this can handle short copies (len < copied), but
- * we don't have to, because the page should always be uptodate here,
+ * we don't have to, because the folio should always be uptodate here,
* due to write_begin.
*/
pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
pos + len);
bsize = AFFS_SB(sb)->s_data_blksize;
- data = page_address(page);
+ data = folio_address(folio);
bh = NULL;
written = 0;
- tmp = (page->index << PAGE_SHIFT) + from;
+ tmp = (folio->index << PAGE_SHIFT) + from;
bidx = tmp / bsize;
boff = tmp % bsize;
if (boff) {
@@ -804,11 +803,11 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
from += tmp;
bidx++;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
done:
affs_brelse(bh);
- tmp = (page->index << PAGE_SHIFT) + from;
+ tmp = (folio->index << PAGE_SHIFT) + from;
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
@@ -819,8 +818,8 @@ done:
}
err_first_bh:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return written;
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 31d6446dc166..094aec8d17b8 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -13,10 +13,9 @@
static int affs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
struct buffer_head *bh;
- struct inode *inode = page->mapping->host;
- char *link = page_address(page);
+ struct inode *inode = folio->mapping->host;
+ char *link = folio_address(folio);
struct slink_front *lf;
int i, j;
char c;
@@ -58,12 +57,11 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio)
}
link[i] = '\0';
affs_brelse(bh);
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
fail:
- SetPageError(page);
- unlock_page(page);
+ folio_unlock(folio);
return -EIO;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 9d3d64921106..da73b97e19a9 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -681,6 +681,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
{
#ifdef CONFIG_AFS_FSCACHE
vnode->netfs.cache = cookie;
+ if (cookie)
+ mapping_set_release_always(vnode->netfs.inode.i_mapping);
#endif
}
diff --git a/fs/buffer.c b/fs/buffer.c
index bd091329026c..f0563ebae75f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1539,21 +1539,6 @@ void invalidate_bh_lrus_cpu(void)
bh_lru_unlock();
}
-void set_bh_page(struct buffer_head *bh,
- struct page *page, unsigned long offset)
-{
- bh->b_page = page;
- BUG_ON(offset >= PAGE_SIZE);
- if (PageHighMem(page))
- /*
- * This catches illegal uses and preserves the offset:
- */
- bh->b_data = (char *)(0 + offset);
- else
- bh->b_data = page_address(page) + offset;
-}
-EXPORT_SYMBOL(set_bh_page);
-
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
unsigned long offset)
{
@@ -2180,8 +2165,7 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
}
EXPORT_SYMBOL(__block_write_begin);
-static int __block_commit_write(struct inode *inode, struct folio *folio,
- size_t from, size_t to)
+static void __block_commit_write(struct folio *folio, size_t from, size_t to)
{
size_t block_start, block_end;
bool partial = false;
@@ -2216,7 +2200,6 @@ static int __block_commit_write(struct inode *inode, struct folio *folio,
*/
if (!partial)
folio_mark_uptodate(folio);
- return 0;
}
/*
@@ -2253,7 +2236,6 @@ int block_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct folio *folio = page_folio(page);
- struct inode *inode = mapping->host;
size_t start = pos - folio_pos(folio);
if (unlikely(copied < len)) {
@@ -2277,7 +2259,7 @@ int block_write_end(struct file *file, struct address_space *mapping,
flush_dcache_folio(folio);
/* This could be a short (even 0-length) commit */
- __block_commit_write(inode, folio, start, start + copied);
+ __block_commit_write(folio, start, start + copied);
return copied;
}
@@ -2598,12 +2580,10 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(cont_write_begin);
-int block_commit_write(struct page *page, unsigned from, unsigned to)
+void block_commit_write(struct page *page, unsigned from, unsigned to)
{
struct folio *folio = page_folio(page);
- struct inode *inode = folio->mapping->host;
- __block_commit_write(inode, folio, from, to);
- return 0;
+ __block_commit_write(folio, from, to);
}
EXPORT_SYMBOL(block_commit_write);
@@ -2649,11 +2629,11 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
end = size - folio_pos(folio);
ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
- if (!ret)
- ret = __block_commit_write(inode, folio, 0, end);
-
- if (unlikely(ret < 0))
+ if (unlikely(ret))
goto out_unlock;
+
+ __block_commit_write(folio, 0, end);
+
folio_mark_dirty(folio);
folio_wait_stable(folio);
return 0;
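Since __block_commit_write() cannot fail, block_commit_write() now returns void and its callers drop their error handling, as the ext4, ocfs2 and udf hunks further below show. A minimal sketch of the resulting caller pattern; my_get_block and the wrapper name are illustrative, not part of this patch:

    #include <linux/buffer_head.h>

    static int my_prepare_and_commit(struct page *page, unsigned int end,
				     get_block_t *my_get_block)
    {
	    int err = __block_write_begin(page, 0, end, my_get_block);

	    if (err)
		    return err;

	    block_commit_write(page, 0, end);	/* void now: cannot fail */
	    return 0;
    }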
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index d9d22d0ec38a..7bf7a5fcc045 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -585,6 +585,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
if (ret < 0)
goto check_failed;
+ clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags);
+
object->file = file;
/* Always update the atime on an object we've just looked up (this is
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 177d8e8d73fe..de1dee46d3df 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
&ci->i_vino, sizeof(ci->i_vino),
&ci->i_version, sizeof(ci->i_version),
i_size_read(inode));
+ if (ci->netfs.cache)
+ mapping_set_release_always(inode->i_mapping);
}
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index e619c31b6bd9..b9575957a7c2 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -10,6 +10,7 @@
#include <linux/writeback.h>
#include <linux/sysctl.h>
#include <linux/gfp.h>
+#include <linux/swap.h>
#include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */
@@ -59,6 +60,7 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
static int stfu;
if (sysctl_drop_caches & 1) {
+ lru_add_drain_all();
iterate_supers(drop_pagecache_sb, NULL);
count_vm_event(DROP_PAGECACHE);
}
diff --git a/fs/exec.c b/fs/exec.c
index 1a827d55ba94..0b9484358a49 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -701,6 +701,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
if (vma != vma_next(&vmi))
return -EFAULT;
+ vma_iter_prev_range(&vmi);
/*
* cover the whole range: [new_start, old_end)
*/
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 43775a6ca505..3d253e250871 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1569,7 +1569,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
if (folio->index < mpd->first_page)
continue;
- if (folio->index + folio_nr_pages(folio) - 1 > end)
+ if (folio_next_index(folio) - 1 > end)
continue;
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
@@ -2455,7 +2455,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (mpd->map.m_len == 0)
mpd->first_page = folio->index;
- mpd->next_page = folio->index + folio_nr_pages(folio);
+ mpd->next_page = folio_next_index(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b5af2fc03b2f..18a9e7c47975 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -340,10 +340,8 @@ again:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
- if ((folio_has_private(folio[0]) &&
- !filemap_release_folio(folio[0], 0)) ||
- (folio_has_private(folio[1]) &&
- !filemap_release_folio(folio[1], 0))) {
+ if (!filemap_release_folio(folio[0], 0) ||
+ !filemap_release_folio(folio[1], 0)) {
*err = -EBUSY;
goto drop_data_sem;
}
@@ -362,10 +360,8 @@ data_copy:
/* At this point all buffers in range are uptodate, old mapping layout
* is no longer required, try to drop it now. */
- if ((folio_has_private(folio[0]) &&
- !filemap_release_folio(folio[0], 0)) ||
- (folio_has_private(folio[1]) &&
- !filemap_release_folio(folio[1], 0))) {
+ if (!filemap_release_folio(folio[0], 0) ||
+ !filemap_release_folio(folio[1], 0)) {
*err = -EBUSY;
goto unlock_folios;
}
@@ -392,14 +388,11 @@ data_copy:
for (i = 0; i < block_len_in_page; i++) {
*err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
if (*err < 0)
- break;
+ goto repair_branches;
bh = bh->b_this_page;
}
- if (!*err)
- *err = block_commit_write(&folio[0]->page, from, from + replaced_size);
- if (unlikely(*err < 0))
- goto repair_branches;
+ block_commit_write(&folio[0]->page, from, from + replaced_size);
/* Even in case of data=writeback it is reasonable to pin
* inode to transaction, to prevent unexpected data loss */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7b17ccfa039d..e7611ae1e612 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -283,6 +283,41 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
#endif
/*
+ * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
+ * Returns the maximum number of bytes one can read without touching the 1st raw
+ * HWPOISON subpage.
+ *
+ * The implementation borrows the iteration logic from copy_page_to_iter*.
+ */
+static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes)
+{
+ size_t n = 0;
+ size_t res = 0;
+
+ /* First subpage to start the loop. */
+ page += offset / PAGE_SIZE;
+ offset %= PAGE_SIZE;
+ while (1) {
+ if (is_raw_hwpoison_page_in_hugepage(page))
+ break;
+
+ /* Safe to read n bytes without touching HWPOISON subpage. */
+ n = min(bytes, (size_t)PAGE_SIZE - offset);
+ res += n;
+ bytes -= n;
+ if (!bytes || !n)
+ break;
+ offset += n;
+ if (offset == PAGE_SIZE) {
+ page++;
+ offset = 0;
+ }
+ }
+
+ return res;
+}
+
+/*
* Support for read() - Find the page attached to f_mapping and copy out the
* data. This provides functionality similar to filemap_read().
*/
@@ -300,7 +335,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
while (iov_iter_count(to)) {
struct page *page;
- size_t nr, copied;
+ size_t nr, copied, want;
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
@@ -328,16 +363,26 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
- if (PageHWPoison(page)) {
- put_page(page);
- retval = -EIO;
- break;
+ if (!PageHWPoison(page))
+ want = nr;
+ else {
+ /*
+ * Adjust how many bytes are safe to read without
+ * touching the 1st raw HWPOISON subpage after
+ * offset.
+ */
+ want = adjust_range_hwpoison(page, offset, nr);
+ if (want == 0) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
}
/*
* We have the page, copy it to user space buffer.
*/
- copied = copy_page_to_iter(page, offset, nr, to);
+ copied = copy_page_to_iter(page, offset, want, to);
put_page(page);
}
offset += copied;
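A short worked example of the new helper, assuming 4 KiB subpages and the raw HWPOISON subpage at index 3 of the hugetlb page:

    /*
     * adjust_range_hwpoison(page, offset = 0x1800, bytes = 0x4000):
     *
     *   start at subpage 1 (0x1800 / 0x1000), offset 0x800 within it
     *   subpage 1: clean    -> n = 0x1000 - 0x800 = 0x800, res = 0x0800
     *   subpage 2: clean    -> n = 0x1000,                 res = 0x1800
     *   subpage 3: HWPOISON -> stop
     *
     * => 0x1800 bytes are copied to the reader; -EIO is returned only
     *    when nothing at all can be read (want == 0).
     */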
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fbce16fedaa4..1b5a45ab62b0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -341,7 +341,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
- struct page *new_page;
+ struct folio *new_folio;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
journal_t *journal = transaction->t_journal;
@@ -370,14 +370,14 @@ repeat:
*/
if (jh_in->b_frozen_data) {
done_copy_out = 1;
- new_page = virt_to_page(jh_in->b_frozen_data);
- new_offset = offset_in_page(jh_in->b_frozen_data);
+ new_folio = virt_to_folio(jh_in->b_frozen_data);
+ new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
} else {
- new_page = jh2bh(jh_in)->b_page;
- new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+ new_folio = jh2bh(jh_in)->b_folio;
+ new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data);
}
- mapped_data = kmap_atomic(new_page);
+ mapped_data = kmap_local_folio(new_folio, new_offset);
/*
* Fire data frozen trigger if data already wasn't frozen. Do this
* before checking for escaping, as the trigger may modify the magic
@@ -385,18 +385,17 @@ repeat:
* data in the buffer.
*/
if (!done_copy_out)
- jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+ jbd2_buffer_frozen_trigger(jh_in, mapped_data,
jh_in->b_triggers);
/*
* Check for escaping
*/
- if (*((__be32 *)(mapped_data + new_offset)) ==
- cpu_to_be32(JBD2_MAGIC_NUMBER)) {
+ if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) {
need_copy_out = 1;
do_escape = 1;
}
- kunmap_atomic(mapped_data);
+ kunmap_local(mapped_data);
/*
* Do we need to do a data copy?
@@ -417,12 +416,10 @@ repeat:
}
jh_in->b_frozen_data = tmp;
- mapped_data = kmap_atomic(new_page);
- memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
- kunmap_atomic(mapped_data);
+ memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
- new_page = virt_to_page(tmp);
- new_offset = offset_in_page(tmp);
+ new_folio = virt_to_folio(tmp);
+ new_offset = offset_in_folio(new_folio, tmp);
done_copy_out = 1;
/*
@@ -438,12 +435,12 @@ repeat:
* copying, we can finally do so.
*/
if (do_escape) {
- mapped_data = kmap_atomic(new_page);
- *((unsigned int *)(mapped_data + new_offset)) = 0;
- kunmap_atomic(mapped_data);
+ mapped_data = kmap_local_folio(new_folio, new_offset);
+ *((unsigned int *)mapped_data) = 0;
+ kunmap_local(mapped_data);
}
- set_bh_page(new_bh, new_page, new_offset);
+ folio_set_bh(new_bh, new_folio, new_offset);
new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 8c35d88a84b1..b05717fe0d4e 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -180,6 +180,9 @@ void nfs_fscache_init_inode(struct inode *inode)
&auxdata, /* aux_data */
sizeof(auxdata),
i_size_read(inode));
+
+ if (netfs_inode(inode)->cache)
+ mapping_set_release_always(inode->i_mapping);
}
/*
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index dc7e7ab701c6..8ae572aacc69 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -554,7 +554,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
struct super_block *sb = inode->i_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct ntfs_inode *ni = ntfs_i(inode);
- struct page *page = bh->b_page;
+ struct folio *folio = bh->b_folio;
u8 cluster_bits = sbi->cluster_bits;
u32 block_size = sb->s_blocksize;
u64 bytes, lbo, valid;
@@ -569,7 +569,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
if (is_resident(ni)) {
ni_lock(ni);
- err = attr_data_read_resident(ni, page);
+ err = attr_data_read_resident(ni, &folio->page);
ni_unlock(ni);
if (!err)
@@ -642,17 +642,17 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
*/
bytes = block_size;
- if (page) {
+ if (folio) {
u32 voff = valid - vbo;
bh->b_size = block_size;
off = vbo & (PAGE_SIZE - 1);
- set_bh_page(bh, page, off);
+ folio_set_bh(bh, folio, off);
err = bh_read(bh, 0);
if (err < 0)
goto out;
- zero_user_segment(page, off + voff, off + block_size);
+ folio_zero_segment(folio, off + voff, off + block_size);
}
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 91a194596552..9e417cd4fd16 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -808,12 +808,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
/* must not update i_size! */
- ret = block_commit_write(page, block_start + 1,
- block_start + 1);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ block_commit_write(page, block_start + 1, block_start + 1);
}
/*
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 05452c3b9872..eb2e498e3b8d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
mm = get_task_mm(task);
if (mm) {
seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages);
seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
mmput(mm);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8dca4d6d96c7..45af9a989d40 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -17,6 +17,7 @@
#ifdef CONFIG_CMA
#include <linux/cma.h>
#endif
+#include <linux/zswap.h>
#include <asm/page.h>
#include "internal.h"
@@ -132,17 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "VmallocChunk: ", 0ul);
show_val_kb(m, "Percpu: ", pcpu_nr_pages());
-#ifdef CONFIG_MEMTEST
- if (early_memtest_done) {
- unsigned long early_memtest_bad_size_kb;
-
- early_memtest_bad_size_kb = early_memtest_bad_size>>10;
- if (early_memtest_bad_size && !early_memtest_bad_size_kb)
- early_memtest_bad_size_kb = 1;
- /* When 0 is reported, it means there actually was a successful test */
- seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb);
- }
-#endif
+ memtest_report_meminfo(m);
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fafff1bd34cd..15ddf4653a19 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -236,21 +236,6 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
-/*
- * Indicate if the VMA is a stack for the given task; for
- * /proc/PID/maps that is the stack of the main task.
- */
-static int is_stack(struct vm_area_struct *vma)
-{
- /*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
- */
- return vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack;
-}
-
static void show_vma_header_prefix(struct seq_file *m,
unsigned long start, unsigned long end,
vm_flags_t flags, unsigned long long pgoff,
@@ -327,13 +312,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- if (vma->vm_start <= mm->brk &&
- vma->vm_end >= mm->start_brk) {
+ if (vma_is_initial_heap(vma)) {
name = "[heap]";
goto done;
}
- if (is_stack(vma)) {
+ if (vma_is_initial_stack(vma)) {
name = "[stack]";
goto done;
}
@@ -871,7 +855,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
- seq_printf(m, "THPeligible: %d\n",
+ seq_printf(m, "THPeligible: %8u\n",
hugepage_vma_check(vma, vma->vm_flags, true, false, true));
if (arch_pkeys_enabled())
@@ -1975,9 +1959,9 @@ static int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_puts(m, " file=");
seq_file_path(m, file, "\n\t= ");
- } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ } else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
- } else if (is_stack(vma)) {
+ } else if (vma_is_initial_stack(vma)) {
seq_puts(m, " stack");
}
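Both this file and task_nommu.c below replace the open-coded heap/stack tests with helpers introduced elsewhere in this series. Based on the conditions being removed here and in kfd_svm.c above, the helpers are expected to look roughly like the following sketch (the authoritative definitions live in include/linux/mm.h):

    static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
    {
	    return vma->vm_start <= vma->vm_mm->brk &&
		   vma->vm_end >= vma->vm_mm->start_brk;
    }

    static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
    {
	    /* Only the main thread's stack; per-thread stacks are not guessed. */
	    return vma->vm_start <= vma->vm_mm->start_stack &&
		   vma->vm_end >= vma->vm_mm->start_stack;
    }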
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 2c8b62265981..a8ac0dd8041e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -121,19 +121,6 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static int is_stack(struct vm_area_struct *vma)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- /*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
- */
- return vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack;
-}
-
/*
* display a single VMA to a sequenced file
*/
@@ -171,7 +158,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
seq_pad(m, ' ');
seq_file_path(m, file, "");
- } else if (mm && is_stack(vma)) {
+ } else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
seq_puts(m, "[stack]");
}
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index 8f6909d633da..3677525ee993 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode)
&cifsi->uniqueid, sizeof(cifsi->uniqueid),
&cd, sizeof(cd),
i_size_read(&cifsi->netfs.inode));
+ if (cifsi->netfs.cache)
+ mapping_set_release_always(inode->i_mapping);
}
void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update)
diff --git a/fs/splice.c b/fs/splice.c
index 3e2a31e1ce6a..0d3deeb3857e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -83,8 +83,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
*/
folio_wait_writeback(folio);
- if (folio_has_private(folio) &&
- !filemap_release_folio(folio, GFP_KERNEL))
+ if (!filemap_release_folio(folio, GFP_KERNEL))
goto out_unlock;
/*
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 243840dc83ad..0292d75e60cc 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -63,13 +63,13 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf)
else
end = PAGE_SIZE;
err = __block_write_begin(page, 0, end, udf_get_block);
- if (!err)
- err = block_commit_write(page, 0, end);
- if (err < 0) {
+ if (err) {
unlock_page(page);
ret = block_page_mkwrite_return(err);
goto out_unlock;
}
+
+ block_commit_write(page, 0, end);
out_dirty:
set_page_dirty(page);
wait_for_stable_page(page);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7cecd49e078b..70bd2951b68d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -427,7 +427,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
*
* We also don't do userfault handling during
* coredumping. hugetlbfs has the special
- * follow_hugetlb_page() to skip missing pages in the
+ * hugetlb_follow_page_mask() to skip missing pages in the
* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
* the no_page_table() helper in follow_page_mask(), but the
* shmem_vm_ops->fault method is invoked even during
@@ -667,6 +667,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma,
vma->vm_flags & ~__VM_UFFD_FLAGS);
@@ -702,6 +703,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
return 0;
@@ -783,6 +785,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
atomic_inc(&ctx->mmap_changing);
} else {
/* Drop uffd context if remap feature not enabled */
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
}
@@ -940,6 +943,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
prev = vma;
}
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
@@ -1289,13 +1293,11 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
__wake_userfault(ctx, range);
}
-static __always_inline int validate_range(struct mm_struct *mm,
- __u64 start, __u64 len)
+static __always_inline int validate_unaligned_range(
+ struct mm_struct *mm, __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- if (start & ~PAGE_MASK)
- return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
@@ -1306,9 +1308,20 @@ static __always_inline int validate_range(struct mm_struct *mm,
return -EINVAL;
if (len > task_size - start)
return -EINVAL;
+ if (start + len <= start)
+ return -EINVAL;
return 0;
}
+static __always_inline int validate_range(struct mm_struct *mm,
+ __u64 start, __u64 len)
+{
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ return validate_unaligned_range(mm, start, len);
+}
+
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
@@ -1502,6 +1515,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
@@ -1685,6 +1699,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -1757,17 +1772,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
+ ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
+ uffdio_copy.len);
+ if (ret)
+ goto out;
ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
- /*
- * double check for wraparound just in case. copy_from_user()
- * will later check uffdio_copy.src + uffdio_copy.len to fit
- * in the userland range.
- */
+
ret = -EINVAL;
- if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
- goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
@@ -1927,11 +1940,6 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;
ret = -EINVAL;
- /* double check for wraparound just in case. */
- if (uffdio_continue.range.start + uffdio_continue.range.len <=
- uffdio_continue.range.start) {
- goto out;
- }
if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
UFFDIO_CONTINUE_MODE_WP))
goto out;
@@ -1965,6 +1973,61 @@ out:
return ret;
}
+static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_poison uffdio_poison;
+ struct uffdio_poison __user *user_uffdio_poison;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_poison = (struct uffdio_poison __user *)arg;
+
+ ret = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_poison, user_uffdio_poison,
+ /* don't copy the output fields */
+ sizeof(uffdio_poison) - (sizeof(__s64))))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_poison.range.start,
+ uffdio_poison.range.len);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
+ goto out;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
+ uffdio_poison.range.len,
+ &ctx->mmap_changing, 0);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+
+ /* len == 0 would wake all */
+ BUG_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
+ range.start = uffdio_poison.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
+
+out:
+ return ret;
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -2066,6 +2129,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_CONTINUE:
ret = userfaultfd_continue(ctx, arg);
break;
+ case UFFDIO_POISON:
+ ret = userfaultfd_poison(ctx, arg);
+ break;
}
return ret;
}
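The new UFFDIO_POISON request is driven from userspace like the other uffdio ioctls. A minimal usage sketch, assuming the uapi additions that accompany this series (struct uffdio_poison with range, mode and updated fields, plus UFFDIO_POISON_MODE_DONTWAKE); error handling is trimmed and the wrapper name is illustrative:

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>

    /* Mark [addr, addr + len) so future faults get poison PTEs (SIGBUS). */
    static int uffd_poison_range(int uffd, unsigned long addr, unsigned long len)
    {
	    struct uffdio_poison args = {
		    .range = { .start = addr, .len = len },
		    .mode  = 0,			/* wake waiters when done */
	    };

	    if (ioctl(uffd, UFFDIO_POISON, &args))
		    return -1;
	    /* args.updated reports how many bytes were actually handled. */
	    return args.updated == (long long)len ? 0 : -1;
    }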
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 587e7e9b9a37..bac63e874c7b 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -1047,41 +1047,22 @@ static inline void iounmap(volatile void __iomem *addr)
#elif defined(CONFIG_GENERIC_IOREMAP)
#include <linux/pgtable.h>
-/*
- * Arch code can implement the following two hooks when using GENERIC_IOREMAP
- * ioremap_allowed() return a bool,
- * - true means continue to remap
- * - false means skip remap and return directly
- * iounmap_allowed() return a bool,
- * - true means continue to vunmap
- * - false means skip vunmap and return directly
- */
-#ifndef ioremap_allowed
-#define ioremap_allowed ioremap_allowed
-static inline bool ioremap_allowed(phys_addr_t phys_addr, size_t size,
- unsigned long prot)
-{
- return true;
-}
-#endif
-
-#ifndef iounmap_allowed
-#define iounmap_allowed iounmap_allowed
-static inline bool iounmap_allowed(void *addr)
-{
- return true;
-}
-#endif
+void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size,
+ pgprot_t prot);
void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
unsigned long prot);
void iounmap(volatile void __iomem *addr);
+void generic_iounmap(volatile void __iomem *addr);
+#ifndef ioremap
+#define ioremap ioremap
static inline void __iomem *ioremap(phys_addr_t addr, size_t size)
{
/* _PAGE_IOREMAP needs to be supplied by the architecture */
return ioremap_prot(addr, size, _PAGE_IOREMAP);
}
+#endif
#endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */
#ifndef ioremap_wc
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 08237ae8b840..196087a8126e 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -93,15 +93,15 @@ extern void __iomem *ioport_map(unsigned long port, unsigned int nr);
extern void ioport_unmap(void __iomem *);
#endif
-#ifndef ARCH_HAS_IOREMAP_WC
+#ifndef ioremap_wc
#define ioremap_wc ioremap
#endif
-#ifndef ARCH_HAS_IOREMAP_WT
+#ifndef ioremap_wt
#define ioremap_wt ioremap
#endif
-#ifndef ARCH_HAS_IOREMAP_NP
+#ifndef ioremap_np
/* See the comment in asm-generic/io.h about ioremap_np(). */
#define ioremap_np ioremap_np
static inline void __iomem *ioremap_np(phys_addr_t offset, size_t size)
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index a7cf825befae..c75d4a753849 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -8,7 +8,7 @@
#define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)
/**
- * __pte_alloc_one_kernel - allocate a page for PTE-level kernel page table
+ * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
* @mm: the mm_struct of the current context
*
* This function is intended for architectures that need
@@ -18,12 +18,17 @@
*/
static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
{
- return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL);
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL &
+ ~__GFP_HIGHMEM, 0);
+
+ if (!ptdesc)
+ return NULL;
+ return ptdesc_address(ptdesc);
}
#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
/**
- * pte_alloc_one_kernel - allocate a page for PTE-level kernel page table
+ * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
* @mm: the mm_struct of the current context
*
* Return: pointer to the allocated memory or %NULL on error
@@ -35,40 +40,40 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
#endif
/**
- * pte_free_kernel - free PTE-level kernel page table page
+ * pte_free_kernel - free PTE-level kernel page table memory
* @mm: the mm_struct of the current context
* @pte: pointer to the memory containing the page table
*/
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
- free_page((unsigned long)pte);
+ pagetable_free(virt_to_ptdesc(pte));
}
/**
- * __pte_alloc_one - allocate a page for PTE-level user page table
+ * __pte_alloc_one - allocate memory for a PTE-level user page table
* @mm: the mm_struct of the current context
* @gfp: GFP flags to use for the allocation
*
- * Allocates a page and runs the pgtable_pte_page_ctor().
+ * Allocate memory for a page table and ptdesc and run pagetable_pte_ctor().
*
* This function is intended for architectures that need
* anything beyond simple page allocation or must have custom GFP flags.
*
- * Return: `struct page` initialized as page table or %NULL on error
+ * Return: `struct page` referencing the ptdesc or %NULL on error
*/
static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
{
- struct page *pte;
+ struct ptdesc *ptdesc;
- pte = alloc_page(gfp);
- if (!pte)
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pte_page_ctor(pte)) {
- __free_page(pte);
+ if (!pagetable_pte_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- return pte;
+ return ptdesc_page(ptdesc);
}
#ifndef __HAVE_ARCH_PTE_ALLOC_ONE
@@ -76,9 +81,9 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
* pte_alloc_one - allocate a page for PTE-level user page table
* @mm: the mm_struct of the current context
*
- * Allocates a page and runs the pgtable_pte_page_ctor().
+ * Allocate memory for a page table and ptdesc and run pagetable_pte_ctor().
*
- * Return: `struct page` initialized as page table or %NULL on error
+ * Return: `struct page` referencing the ptdesc or %NULL on error
*/
static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
@@ -92,14 +97,16 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
*/
/**
- * pte_free - free PTE-level user page table page
+ * pte_free - free PTE-level user page table memory
* @mm: the mm_struct of the current context
- * @pte_page: the `struct page` representing the page table
+ * @pte_page: the `struct page` referencing the ptdesc
*/
static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
{
- pgtable_pte_page_dtor(pte_page);
- __free_page(pte_page);
+ struct ptdesc *ptdesc = page_ptdesc(pte_page);
+
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
@@ -107,10 +114,11 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
#ifndef __HAVE_ARCH_PMD_ALLOC_ONE
/**
- * pmd_alloc_one - allocate a page for PMD-level page table
+ * pmd_alloc_one - allocate memory for a PMD-level page table
* @mm: the mm_struct of the current context
*
- * Allocates a page and runs the pgtable_pmd_page_ctor().
+ * Allocate memory for a page table and ptdesc and run pagetable_pmd_ctor().
+ *
* Allocations use %GFP_PGTABLE_USER in user context and
* %GFP_PGTABLE_KERNEL in kernel context.
*
@@ -118,28 +126,30 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
*/
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
- struct page *page;
+ struct ptdesc *ptdesc;
gfp_t gfp = GFP_PGTABLE_USER;
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
- page = alloc_page(gfp);
- if (!page)
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_page(page);
+ if (!pagetable_pmd_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- return (pmd_t *)page_address(page);
+ return ptdesc_address(ptdesc);
}
#endif
#ifndef __HAVE_ARCH_PMD_FREE
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
+
BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- pgtable_pmd_page_dtor(virt_to_page(pmd));
- free_page((unsigned long)pmd);
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
#endif
@@ -150,19 +160,25 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
gfp_t gfp = GFP_PGTABLE_USER;
+ struct ptdesc *ptdesc;
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
- return (pud_t *)get_zeroed_page(gfp);
+ gfp &= ~__GFP_HIGHMEM;
+
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
+ return NULL;
+ return ptdesc_address(ptdesc);
}
#ifndef __HAVE_ARCH_PUD_ALLOC_ONE
/**
- * pud_alloc_one - allocate a page for PUD-level page table
+ * pud_alloc_one - allocate memory for a PUD-level page table
* @mm: the mm_struct of the current context
*
- * Allocates a page using %GFP_PGTABLE_USER for user context and
- * %GFP_PGTABLE_KERNEL for kernel context.
+ * Allocate memory for a page table using %GFP_PGTABLE_USER for user context
+ * and %GFP_PGTABLE_KERNEL for kernel context.
*
* Return: pointer to the allocated memory or %NULL on error
*/
@@ -175,7 +191,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
static inline void __pud_free(struct mm_struct *mm, pud_t *pud)
{
BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
- free_page((unsigned long)pud);
+ pagetable_free(virt_to_ptdesc(pud));
}
#ifndef __HAVE_ARCH_PUD_FREE
@@ -190,7 +206,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
#ifndef __HAVE_ARCH_PGD_FREE
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- free_page((unsigned long)pgd);
+ pagetable_free(virt_to_ptdesc(pgd));
}
#endif
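The pgalloc.h conversion above replaces bare page allocations with the ptdesc API. A condensed sketch of the allocation/free pairing for a user PTE table, mirroring __pte_alloc_one()/pte_free() as changed by this patch; the my_* names are illustrative only:

    static struct page *my_pte_alloc(gfp_t gfp)
    {
	    struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);	/* order-0 table */

	    if (!ptdesc)
		    return NULL;
	    if (!pagetable_pte_ctor(ptdesc)) {	/* split ptlock + accounting setup */
		    pagetable_free(ptdesc);
		    return NULL;
	    }
	    return ptdesc_page(ptdesc);		/* callers still see a struct page */
    }

    static void my_pte_free(struct page *pte_page)
    {
	    struct ptdesc *ptdesc = page_ptdesc(pte_page);

	    pagetable_pte_dtor(ptdesc);
	    pagetable_free(ptdesc);
    }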
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b46617207c93..129a3a759976 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -456,7 +456,6 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
return;
tlb_flush(tlb);
- mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
__tlb_reset_range(tlb);
}
@@ -481,6 +480,17 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
return tlb_remove_page_size(tlb, page, PAGE_SIZE);
}
+static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
+{
+ tlb_remove_table(tlb, pt);
+}
+
+/* Like tlb_remove_ptdesc, but for page-like page directories. */
+static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt)
+{
+ tlb_remove_page(tlb, ptdesc_page(pt));
+}
+
static inline void tlb_change_page_size(struct mmu_gather *tlb,
unsigned int page_size)
{
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index fbad4fcd408e..1a97277f99b1 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -46,7 +46,6 @@ extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
extern struct workqueue_struct *bdi_wq;
-extern struct workqueue_struct *bdi_async_bio_wq;
static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c4f5b5228105..027ff9ab5d12 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -253,6 +253,11 @@ static inline struct page *bio_first_page_all(struct bio *bio)
return bio_first_bvec_all(bio)->bv_page;
}
+static inline struct folio *bio_first_folio_all(struct bio *bio)
+{
+ return page_folio(bio_first_page_all(bio));
+}
+
static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
{
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 6cb3e9af78c9..06566aee94ca 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -194,8 +194,6 @@ void buffer_check_dirty_writeback(struct folio *folio,
void mark_buffer_dirty(struct buffer_head *bh);
void mark_buffer_write_io_error(struct buffer_head *bh);
void touch_buffer(struct buffer_head *bh);
-void set_bh_page(struct buffer_head *bh,
- struct page *page, unsigned long offset);
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
unsigned long offset);
bool try_to_free_buffers(struct folio *);
@@ -288,7 +286,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t,
unsigned, struct page **, void **,
get_block_t *, loff_t *);
int generic_cont_expand_simple(struct inode *inode, loff_t size);
-int block_commit_write(struct page *page, unsigned from, unsigned to);
+void block_commit_write(struct page *page, unsigned int from, unsigned int to);
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block);
/* Convert errno to return value from ->page_mkwrite() call */
diff --git a/include/linux/damon.h b/include/linux/damon.h
index d5d4d19928e0..ae2664d1d5f1 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -226,16 +226,26 @@ struct damos_stat {
* enum damos_filter_type - Type of memory for &struct damos_filter
* @DAMOS_FILTER_TYPE_ANON: Anonymous pages.
* @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages.
+ * @DAMOS_FILTER_TYPE_ADDR: Address range.
+ * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target.
* @NR_DAMOS_FILTER_TYPES: Number of filter types.
*
- * The support of each filter type is up to running &struct damon_operations.
- * &enum DAMON_OPS_PADDR is supporting all filter types, while
- * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR are not supporting any
- * filter types.
+ * The anon pages type and memcg type filters are handled by the underlying
+ * &struct damon_operations as part of applying the scheme action, and are
+ * therefore accounted as 'tried'. In contrast, the other types are handled
+ * by the core layer before the action is tried, and are therefore not
+ * accounted as 'tried'.
+ *
+ * Support for the filters that are handled by &struct damon_operations
+ * depends on the running &struct damon_operations.
+ * &enum DAMON_OPS_PADDR supports both anon pages type and memcg type filters,
+ * while &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR don't support any of
+ * the two types.
*/
enum damos_filter_type {
DAMOS_FILTER_TYPE_ANON,
DAMOS_FILTER_TYPE_MEMCG,
+ DAMOS_FILTER_TYPE_ADDR,
+ DAMOS_FILTER_TYPE_TARGET,
NR_DAMOS_FILTER_TYPES,
};
@@ -244,18 +254,24 @@ enum damos_filter_type {
* @type: Type of the page.
 * @matching: Whether the matching pages should be filtered out or in.
 * @memcg_id: Memcg id in question if @type is DAMOS_FILTER_TYPE_MEMCG.
+ * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR.
+ * @target_idx: Index of the &struct damon_target of
+ * &damon_ctx->adaptive_targets if @type is
+ * DAMOS_FILTER_TYPE_TARGET.
* @list: List head for siblings.
*
* Before applying the &damos->action to a memory region, DAMOS checks if each
 * page of the region matches this filter and avoids applying the action if so.
- * Note that the check support is up to &struct damon_operations
- * implementation.
+ * Support of each filter type depends on the running &struct damon_operations
+ * and the type. Refer to &enum damos_filter_type for more detail.
*/
struct damos_filter {
enum damos_filter_type type;
bool matching;
union {
unsigned short memcg_id;
+ struct damon_addr_range addr_range;
+ int target_idx;
};
struct list_head list;
};
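
For illustration (not part of the diff above): the new DAMOS_FILTER_TYPE_ADDR variant carries a &struct damon_addr_range in the union. A minimal sketch of filling one in by hand follows; real DAMON code goes through its own filter-construction helpers, and the raw kzalloc() plus the SZ_* example addresses are assumptions made for brevity.

	/* Sketch: a filter that matches (and filters out) a physical address range. */
	static struct damos_filter *example_addr_filter(void)
	{
		struct damos_filter *filter;

		filter = kzalloc(sizeof(*filter), GFP_KERNEL);
		if (!filter)
			return NULL;
		filter->type = DAMOS_FILTER_TYPE_ADDR;
		filter->matching = true;		/* filter matching pages out */
		filter->addr_range.start = SZ_1G;	/* example values only */
		filter->addr_range.end = SZ_2G;
		INIT_LIST_HEAD(&filter->list);
		return filter;
	}
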
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
deleted file mode 100644
index eaa0ac5f9003..000000000000
--- a/include/linux/frontswap.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_FRONTSWAP_H
-#define _LINUX_FRONTSWAP_H
-
-#include <linux/swap.h>
-#include <linux/mm.h>
-#include <linux/bitops.h>
-#include <linux/jump_label.h>
-
-struct frontswap_ops {
- void (*init)(unsigned); /* this swap type was just swapon'ed */
- int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
- int (*load)(unsigned, pgoff_t, struct page *, bool *); /* load a page */
- void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
- void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
-};
-
-int frontswap_register_ops(const struct frontswap_ops *ops);
-
-extern void frontswap_init(unsigned type, unsigned long *map);
-extern int __frontswap_store(struct page *page);
-extern int __frontswap_load(struct page *page);
-extern void __frontswap_invalidate_page(unsigned, pgoff_t);
-extern void __frontswap_invalidate_area(unsigned);
-
-#ifdef CONFIG_FRONTSWAP
-extern struct static_key_false frontswap_enabled_key;
-
-static inline bool frontswap_enabled(void)
-{
- return static_branch_unlikely(&frontswap_enabled_key);
-}
-
-static inline void frontswap_map_set(struct swap_info_struct *p,
- unsigned long *map)
-{
- p->frontswap_map = map;
-}
-
-static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
-{
- return p->frontswap_map;
-}
-#else
-/* all inline routines become no-ops and all externs are ignored */
-
-static inline bool frontswap_enabled(void)
-{
- return false;
-}
-
-static inline void frontswap_map_set(struct swap_info_struct *p,
- unsigned long *map)
-{
-}
-
-static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
-{
- return NULL;
-}
-#endif
-
-static inline int frontswap_store(struct page *page)
-{
- if (frontswap_enabled())
- return __frontswap_store(page);
-
- return -1;
-}
-
-static inline int frontswap_load(struct page *page)
-{
- if (frontswap_enabled())
- return __frontswap_load(page);
-
- return -1;
-}
-
-static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset)
-{
- if (frontswap_enabled())
- __frontswap_invalidate_page(type, offset);
-}
-
-static inline void frontswap_invalidate_area(unsigned type)
-{
- if (frontswap_enabled())
- __frontswap_invalidate_area(type);
-}
-
-#endif /* _LINUX_FRONTSWAP_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6867512907d6..bb00d37a6ca0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -447,11 +447,11 @@ struct address_space {
atomic_t nr_thps;
#endif
struct rb_root_cached i_mmap;
- struct rw_semaphore i_mmap_rwsem;
unsigned long nrpages;
pgoff_t writeback_index;
const struct address_space_operations *a_ops;
unsigned long flags;
+ struct rw_semaphore i_mmap_rwsem;
errseq_t wb_err;
spinlock_t private_lock;
struct list_head private_list;
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 68da30625a6c..99c474de800d 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -439,6 +439,50 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len)
kunmap_local(addr);
}
+static inline void memcpy_from_folio(char *to, struct folio *folio,
+ size_t offset, size_t len)
+{
+ VM_BUG_ON(offset + len > folio_size(folio));
+
+ do {
+ const char *from = kmap_local_folio(folio, offset);
+ size_t chunk = len;
+
+ if (folio_test_highmem(folio) &&
+ chunk > PAGE_SIZE - offset_in_page(offset))
+ chunk = PAGE_SIZE - offset_in_page(offset);
+ memcpy(to, from, chunk);
+ kunmap_local(from);
+
+ from += chunk;
+ offset += chunk;
+ len -= chunk;
+ } while (len > 0);
+}
+
+static inline void memcpy_to_folio(struct folio *folio, size_t offset,
+ const char *from, size_t len)
+{
+ VM_BUG_ON(offset + len > folio_size(folio));
+
+ do {
+ char *to = kmap_local_folio(folio, offset);
+ size_t chunk = len;
+
+ if (folio_test_highmem(folio) &&
+ chunk > PAGE_SIZE - offset_in_page(offset))
+ chunk = PAGE_SIZE - offset_in_page(offset);
+ memcpy(to, from, chunk);
+ kunmap_local(to);
+
+ from += chunk;
+ offset += chunk;
+ len -= chunk;
+ } while (len > 0);
+
+ flush_dcache_folio(folio);
+}
+
/**
* memcpy_from_file_folio - Copy some bytes from a file folio.
* @to: The destination buffer.
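
For illustration (not part of the diff): the chunked loops above exist because highmem folios can only be mapped one page at a time, so a copy whose range crosses a page boundary must be split. A hedged usage sketch, with the record layout invented for the example:

	struct disk_record {		/* made-up on-disk record */
		__le32 magic;
		__le32 len;
		u8 payload[56];
	};

	static void read_disk_record(struct folio *folio, size_t offset,
				     struct disk_record *rec)
	{
		/* Correct even if the record straddles pages of a highmem folio. */
		memcpy_from_folio((char *)rec, folio, offset, sizeof(*rec));
	}
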
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ca3c8e10f24a..0a393bc02f25 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -131,10 +131,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
struct vm_area_struct *, struct vm_area_struct *);
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags);
-long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
- struct page **, unsigned long *, unsigned long *,
- long, unsigned int, int *);
+ unsigned long address, unsigned int flags,
+ unsigned int *page_mask);
void unmap_hugepage_range(struct vm_area_struct *,
unsigned long, unsigned long, struct page *,
zap_flags_t);
@@ -297,21 +295,13 @@ static inline void adjust_range_if_pmd_sharing_possible(
{
}
-static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+static inline struct page *hugetlb_follow_page_mask(
+ struct vm_area_struct *vma, unsigned long address, unsigned int flags,
+ unsigned int *page_mask)
{
	BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE */
}
-static inline long follow_hugetlb_page(struct mm_struct *mm,
- struct vm_area_struct *vma, struct page **pages,
- unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *nonblocking)
-{
- BUG();
- return 0;
-}
-
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
struct mm_struct *src,
struct vm_area_struct *dst_vma,
@@ -851,11 +841,6 @@ static inline struct hstate *folio_hstate(struct folio *folio)
return size_to_hstate(folio_size(folio));
}
-static inline struct hstate *page_hstate(struct page *page)
-{
- return folio_hstate(page_folio(page));
-}
-
static inline unsigned hstate_index_to_shift(unsigned index)
{
return hstates[index].order + PAGE_SHIFT;
@@ -1007,6 +992,11 @@ void hugetlb_register_node(struct node *node);
void hugetlb_unregister_node(struct node *node);
#endif
+/*
+ * Check if a given raw @page in a hugepage is HWPOISON.
+ */
+bool is_raw_hwpoison_page_in_hugepage(struct page *page);
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
@@ -1067,11 +1057,6 @@ static inline struct hstate *folio_hstate(struct folio *folio)
return NULL;
}
-static inline struct hstate *page_hstate(struct page *page)
-{
- return NULL;
-}
-
static inline struct hstate *size_to_hstate(unsigned long size)
{
return NULL;
diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h
new file mode 100644
index 000000000000..f0e99fc7dd8b
--- /dev/null
+++ b/include/linux/ioremap.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IOREMAP_H
+#define _LINUX_IOREMAP_H
+
+#include <linux/kasan.h>
+#include <asm/pgtable.h>
+
+#if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP)
+/*
+ * ioremap() often, but not always, uses the generic vmalloc area. E.g. on
+ * powerpc, it can have a separate ioremap space.
+ */
+#ifndef IOREMAP_START
+#define IOREMAP_START VMALLOC_START
+#define IOREMAP_END VMALLOC_END
+#endif
+static inline bool is_ioremap_addr(const void *x)
+{
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
+
+ return addr >= IOREMAP_START && addr < IOREMAP_END;
+}
+#else
+static inline bool is_ioremap_addr(const void *x)
+{
+ return false;
+}
+#endif
+
+#endif /* _LINUX_IOREMAP_H */
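
For illustration (an assumption, not from the patch): with the fallback above, is_ioremap_addr() degenerates to a vmalloc-range check on architectures without a dedicated ioremap space, so a debugging helper could classify kernel virtual addresses along these lines:

	static const char *classify_kernel_vaddr(const void *p)
	{
		if (is_ioremap_addr(p))
			return "ioremap";
		if (is_vmalloc_addr(p))
			return "vmalloc";
		return "linear/other";
	}
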
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 726857a4b680..401af4757514 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -59,15 +59,16 @@ static __always_inline bool is_kfence_address(const void *addr)
}
/**
- * kfence_alloc_pool() - allocate the KFENCE pool via memblock
+ * kfence_alloc_pool_and_metadata() - allocate the KFENCE pool and KFENCE
+ * metadata via memblock
*/
-void __init kfence_alloc_pool(void);
+void __init kfence_alloc_pool_and_metadata(void);
/**
* kfence_init() - perform KFENCE initialization at boot time
*
- * Requires that kfence_alloc_pool() was called before. This sets up the
- * allocation gate timer, and requires that workqueues are available.
+ * Requires that kfence_alloc_pool_and_metadata() was called before. This sets
+ * up the allocation gate timer, and requires that workqueues are available.
*/
void __init kfence_init(void);
@@ -223,7 +224,7 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla
#else /* CONFIG_KFENCE */
static inline bool is_kfence_address(const void *addr) { return false; }
-static inline void kfence_alloc_pool(void) { }
+static inline void kfence_alloc_pool_and_metadata(void) { }
static inline void kfence_init(void) { }
static inline void kfence_shutdown_cache(struct kmem_cache *s) { }
static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; }
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 899a314bc487..c2dd786a30e1 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -26,6 +26,22 @@ int ksm_disable(struct mm_struct *mm);
int __ksm_enter(struct mm_struct *mm);
void __ksm_exit(struct mm_struct *mm);
+/*
+ * To identify zeropages that were mapped by KSM, we reuse the dirty bit
+ * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when
+ * deduplicating memory.
+ */
+#define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte))
+
+extern unsigned long ksm_zero_pages;
+
+static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
+{
+ if (is_ksm_zero_pte(pte)) {
+ ksm_zero_pages--;
+ mm->ksm_zero_pages--;
+ }
+}
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
@@ -95,6 +111,10 @@ static inline void ksm_exit(struct mm_struct *mm)
{
}
+static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
+{
+}
+
#ifdef CONFIG_MEMORY_FAILURE
static inline void collect_procs_ksm(struct page *page,
struct list_head *to_kill, int force_early)
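
For illustration (not part of the diff): ksm_might_unmap_zero_page() is meant to be called with the old PTE value when a mapping is torn down, so that zero pages placed by KSM are un-accounted. A hedged sketch of that call-site shape, with the surrounding function invented for the example:

	static void example_zap_pte(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep)
	{
		pte_t old = ptep_get_and_clear(mm, addr, ptep);

		if (pte_present(old))
			ksm_might_unmap_zero_page(mm, old);
	}
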
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 295548cca8b3..c962af188681 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -29,14 +29,12 @@
#define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */
-#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */
#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1)
#else
/* 32bit sizes */
#define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */
-#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */
#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2)
#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */
@@ -184,13 +182,23 @@ enum maple_type {
#ifdef CONFIG_LOCKDEP
typedef struct lockdep_map *lockdep_map_p;
-#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock)
+#define mt_lock_is_held(mt) \
+ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock))
+
+#define mt_write_lock_is_held(mt) \
+ (!(mt)->ma_external_lock || \
+ lock_is_held_type((mt)->ma_external_lock, 0))
+
#define mt_set_external_lock(mt, lock) \
(mt)->ma_external_lock = &(lock)->dep_map
+
+#define mt_on_stack(mt) (mt).ma_external_lock = NULL
#else
typedef struct { /* nothing */ } lockdep_map_p;
-#define mt_lock_is_held(mt) 1
+#define mt_lock_is_held(mt) 1
+#define mt_write_lock_is_held(mt) 1
#define mt_set_external_lock(mt, lock) do { } while (0)
+#define mt_on_stack(mt) do { } while (0)
#endif
/*
@@ -458,7 +466,7 @@ void *mas_find(struct ma_state *mas, unsigned long max);
void *mas_find_range(struct ma_state *mas, unsigned long max);
void *mas_find_rev(struct ma_state *mas, unsigned long min);
void *mas_find_range_rev(struct ma_state *mas, unsigned long max);
-int mas_preallocate(struct ma_state *mas, gfp_t gfp);
+int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
bool mas_is_err(struct ma_state *mas);
bool mas_nomem(struct ma_state *mas, gfp_t gfp);
@@ -531,6 +539,22 @@ static inline void mas_reset(struct ma_state *mas)
*/
#define mas_for_each(__mas, __entry, __max) \
while (((__entry) = mas_find((__mas), (__max))) != NULL)
+/**
+ * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the
+ * current location.
+ * @mas: Maple Tree operation state.
+ * @start: New start of range in the Maple Tree.
+ * @last: New end of range in the Maple Tree.
+ *
+ * Set the internal maple state values to a sub-range.
+ * Please use mas_set_range() if you do not know where you are in the tree.
+ */
+static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
+ unsigned long last)
+{
+ mas->index = start;
+ mas->last = last;
+}
/**
* mas_set_range() - Set up Maple Tree operation state for a different index.
@@ -545,9 +569,8 @@ static inline void mas_reset(struct ma_state *mas)
static inline
void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
{
- mas->index = start;
- mas->last = last;
- mas->node = MAS_START;
+ __mas_set_range(mas, start, last);
+ mas->node = MAS_START;
}
/**
@@ -662,10 +685,11 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);
* mt_for_each - Iterate over each entry starting at index until max.
* @__tree: The Maple Tree
* @__entry: The current entry
- * @__index: The index to update to track the location in the tree
+ * @__index: The index to start the search from. Subsequently used as iterator.
* @__max: The maximum limit for @index
*
- * Note: Will not return the zero entry.
+ * This iterator skips all entries that resolve to a NULL pointer,
+ * e.g. entries that have been reserved with XA_ZERO_ENTRY.
*/
#define mt_for_each(__tree, __entry, __index, __max) \
for (__entry = mt_find(__tree, &(__index), __max); \
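
For illustration (not from the patch): __mas_set_range() only narrows mas->index/mas->last and assumes the maple state already points at the right node, whereas mas_set_range() additionally rewinds the state to MAS_START. A hedged sketch of a read-side walk using the iteration helpers documented above:

	static void dump_range(struct maple_tree *mt, unsigned long start,
			       unsigned long max)
	{
		MA_STATE(mas, mt, start, start);
		void *entry;

		rcu_read_lock();
		mas_for_each(&mas, entry, max)
			pr_info("[%lu, %lu] -> %p\n", mas.index, mas.last, entry);
		rcu_read_unlock();
	}
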
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f71ff9f0ec81..1c1072e3ca06 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -581,9 +581,7 @@ extern void *alloc_large_system_hash(const char *tablename,
unsigned long high_limit);
#define HASH_EARLY 0x00000001 /* Allocating during early boot? */
-#define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min
- * shift passed via *_hash_shift */
-#define HASH_ZERO 0x00000004 /* Zero allocated hash table */
+#define HASH_ZERO 0x00000002 /* Zero allocated hash table */
/* Only NUMA needs hash distribution. 64bit NUMA architectures have
* sufficient vmalloc space.
@@ -596,13 +594,11 @@ extern int hashdist; /* Distribute hashes across NUMA nodes? */
#endif
#ifdef CONFIG_MEMTEST
-extern phys_addr_t early_memtest_bad_size; /* Size of faulty ram found by memtest */
-extern bool early_memtest_done; /* Was early memtest done? */
-extern void early_memtest(phys_addr_t start, phys_addr_t end);
+void early_memtest(phys_addr_t start, phys_addr_t end);
+void memtest_report_meminfo(struct seq_file *m);
#else
-static inline void early_memtest(phys_addr_t start, phys_addr_t end)
-{
-}
+static inline void early_memtest(phys_addr_t start, phys_addr_t end) { }
+static inline void memtest_report_meminfo(struct seq_file *m) { }
#endif
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5818af8eca5a..163004ae3349 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -61,7 +61,6 @@ struct mem_cgroup_reclaim_cookie {
#ifdef CONFIG_MEMCG
#define MEM_CGROUP_ID_SHIFT 16
-#define MEM_CGROUP_ID_MAX USHRT_MAX
struct mem_cgroup_id {
int id;
@@ -583,7 +582,7 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root,
/*
* There is no reclaim protection applied to a targeted reclaim.
* We are special casing this specific case here because
- * mem_cgroup_protected calculation is not robust enough to keep
+ * mem_cgroup_calculate_protection is not robust enough to keep
* the protection invariant for calculated effective values for
* parallel reclaimers with different reclaim target. This is
* especially a problem for tail memcgs (as they have pages on LRU)
@@ -861,8 +860,7 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
* parent_mem_cgroup - find the accounting parent of a memcg
* @memcg: memcg whose parent to find
*
- * Returns the parent memcg, or NULL if this is the root or the memory
- * controller is in legacy no-hierarchy mode.
+ * Returns the parent memcg, or NULL if this is the root.
*/
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
@@ -1158,7 +1156,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
-#define MEM_CGROUP_ID_MAX 0
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
@@ -1761,7 +1758,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);
struct obj_cgroup *get_obj_cgroup_from_current(void);
-struct obj_cgroup *get_obj_cgroup_from_page(struct page *page);
+struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
@@ -1845,7 +1842,7 @@ static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
{
}
-static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
+static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
return NULL;
}
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index fc9647b1b4f9..437441cdf78f 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -33,7 +33,7 @@ struct memory_dev_type {
#ifdef CONFIG_NUMA
extern bool numa_demotion_enabled;
struct memory_dev_type *alloc_memory_type(int adistance);
-void destroy_memory_type(struct memory_dev_type *memtype);
+void put_memory_type(struct memory_dev_type *memtype);
void init_node_memory_type(int node, struct memory_dev_type *default_type);
void clear_node_memory_type(int node, struct memory_dev_type *memtype);
#ifdef CONFIG_MIGRATION
@@ -68,7 +68,7 @@ static inline struct memory_dev_type *alloc_memory_type(int adistance)
return NULL;
}
-static inline void destroy_memory_type(struct memory_dev_type *memtype)
+static inline void put_memory_type(struct memory_dev_type *memtype)
{
}
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 31343566c221..f53cfdaaaa41 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -77,11 +77,7 @@ struct memory_block {
*/
struct zone *zone;
struct device dev;
- /*
- * Number of vmemmap pages. These pages
- * lay at the beginning of the memory block.
- */
- unsigned long nr_vmemmap_pages;
+ struct vmem_altmap *altmap;
struct memory_group *group; /* group (if any) for this block */
struct list_head group_next; /* next block inside memory group */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
@@ -147,7 +143,7 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
int create_memory_block_devices(unsigned long start, unsigned long size,
- unsigned long vmemmap_pages,
+ struct vmem_altmap *altmap,
struct memory_group *group);
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 013c69753c91..7d2076583494 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -97,6 +97,8 @@ typedef int __bitwise mhp_t;
* To do so, we will use the beginning of the hot-added range to build
* the page tables for the memmap array that describes the entire range.
* Only selected architectures support it with SPARSE_VMEMMAP.
+ * This is only a hint; the core kernel can decide not to do this based on
+ * various alignment checks.
*/
#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1))
/*
@@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid,
extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
struct mhp_params *params);
void arch_remove_linear_mapping(u64 start, u64 size);
-extern bool mhp_supports_memmap_on_memory(unsigned long size);
#endif /* CONFIG_MEMORY_HOTPLUG */
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 34f9dba17c1a..55eb2789794e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -691,6 +691,11 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
return (vma->vm_lock_seq == *mm_lock_seq);
}
+/*
+ * Begin writing to a VMA.
+ * Exclude concurrent readers under the per-VMA lock until the currently
+ * write-locked mmap_lock is dropped or downgraded.
+ */
static inline void vma_start_write(struct vm_area_struct *vma)
{
int mm_lock_seq;
@@ -709,21 +714,6 @@ static inline void vma_start_write(struct vm_area_struct *vma)
up_write(&vma->vm_lock->lock);
}
-static inline bool vma_try_start_write(struct vm_area_struct *vma)
-{
- int mm_lock_seq;
-
- if (__is_vma_write_locked(vma, &mm_lock_seq))
- return true;
-
- if (!down_write_trylock(&vma->vm_lock->lock))
- return false;
-
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
- up_write(&vma->vm_lock->lock);
- return true;
-}
-
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
int mm_lock_seq;
@@ -748,25 +738,30 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
{ return false; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
-static inline bool vma_try_start_write(struct vm_area_struct *vma)
- { return true; }
-static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+ { mmap_assert_write_locked(vma->vm_mm); }
static inline void vma_mark_detached(struct vm_area_struct *vma,
bool detached) {}
+static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ return NULL;
+}
+
#endif /* CONFIG_PER_VMA_LOCK */
+extern const struct vm_operations_struct vma_dummy_vm_ops;
+
/*
* WARNING: vma_init does not initialize vma->vm_lock.
* Use vm_area_alloc()/vm_area_free() if vma needs locking.
*/
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
- static const struct vm_operations_struct dummy_vm_ops = {};
-
memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
- vma->vm_ops = &dummy_vm_ops;
+ vma->vm_ops = &vma_dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
vma_mark_detached(vma, false);
vma_numab_state_init(vma);
@@ -779,18 +774,22 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
ACCESS_PRIVATE(vma, __vm_flags) = flags;
}
-/* Use when VMA is part of the VMA tree and modifications need coordination */
+/*
+ * Use when VMA is part of the VMA tree and modifications need coordination.
+ * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma; it must
+ * be write-locked explicitly beforehand.
+ */
static inline void vm_flags_reset(struct vm_area_struct *vma,
vm_flags_t flags)
{
- vma_start_write(vma);
+ vma_assert_write_locked(vma);
vm_flags_init(vma, flags);
}
static inline void vm_flags_reset_once(struct vm_area_struct *vma,
vm_flags_t flags)
{
- vma_start_write(vma);
+ vma_assert_write_locked(vma);
WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
}
@@ -839,6 +838,31 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
return !vma->vm_ops;
}
+/*
+ * Indicate if the VMA is a heap for the given task; for
+ * /proc/PID/maps that is the heap of the main task.
+ */
+static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
+{
+ return vma->vm_start <= vma->vm_mm->brk &&
+ vma->vm_end >= vma->vm_mm->start_brk;
+}
+
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
+{
+ /*
+ * We make no effort to guess what a given thread considers to be
+ * its "stack". It's not even well-defined for programs written
+ * languages like Go.
+ */
+ return vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack;
+}
+
static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1072,11 +1096,6 @@ unsigned long vmalloc_to_pfn(const void *addr);
* On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
* is no special casing required.
*/
-
-#ifndef is_ioremap_addr
-#define is_ioremap_addr(x) is_vmalloc_addr(x)
-#endif
-
#ifdef CONFIG_MMU
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
@@ -2170,7 +2189,6 @@ static inline void *folio_address(const struct folio *folio)
return page_address(&folio->page);
}
-extern void *page_rmapping(struct page *page);
extern pgoff_t __page_file_index(struct page *page);
/*
@@ -2238,18 +2256,6 @@ extern void pagefault_out_of_memory(void);
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
/*
- * Flags passed to show_mem() and show_free_areas() to suppress output in
- * various contexts.
- */
-#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
-
-extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
-static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask)
-{
- __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
-}
-
-/*
* Parameter block passed down to zap_pte_range in exceptional cases.
*/
struct zap_details {
@@ -2317,9 +2323,9 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
zap_page_range_single(vma, vma->vm_start,
vma->vm_end - vma->vm_start, NULL);
}
-void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long start,
- unsigned long end, bool mm_wr_locked);
+ unsigned long end, unsigned long tree_end, bool mm_wr_locked);
struct mmu_notifier_range;
@@ -2766,42 +2772,93 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
}
#endif /* CONFIG_MMU */
+static inline struct ptdesc *virt_to_ptdesc(const void *x)
+{
+ return page_ptdesc(virt_to_page(x));
+}
+
+static inline void *ptdesc_to_virt(const struct ptdesc *pt)
+{
+ return page_to_virt(ptdesc_page(pt));
+}
+
+static inline void *ptdesc_address(const struct ptdesc *pt)
+{
+ return folio_address(ptdesc_folio(pt));
+}
+
+static inline bool pagetable_is_reserved(struct ptdesc *pt)
+{
+ return folio_test_reserved(ptdesc_folio(pt));
+}
+
+/**
+ * pagetable_alloc - Allocate pagetables
+ * @gfp: GFP flags
+ * @order: desired pagetable order
+ *
+ * pagetable_alloc allocates memory for page tables as well as a page table
+ * descriptor to describe that memory.
+ *
+ * Return: The ptdesc describing the allocated page tables.
+ */
+static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+{
+ struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+
+ return page_ptdesc(page);
+}
+
+/**
+ * pagetable_free - Free pagetables
+ * @pt: The page table descriptor
+ *
+ * pagetable_free frees the memory of all page tables described by a page
+ * table descriptor and the memory for the descriptor itself.
+ */
+static inline void pagetable_free(struct ptdesc *pt)
+{
+ struct page *page = ptdesc_page(pt);
+
+ __free_pages(page, compound_order(page));
+}
+
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
-extern bool ptlock_alloc(struct page *page);
-extern void ptlock_free(struct page *page);
+bool ptlock_alloc(struct ptdesc *ptdesc);
+void ptlock_free(struct ptdesc *ptdesc);
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
- return page->ptl;
+ return ptdesc->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
static inline void ptlock_cache_init(void)
{
}
-static inline bool ptlock_alloc(struct page *page)
+static inline bool ptlock_alloc(struct ptdesc *ptdesc)
{
return true;
}
-static inline void ptlock_free(struct page *page)
+static inline void ptlock_free(struct ptdesc *ptdesc)
{
}
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
- return &page->ptl;
+ return &ptdesc->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
- return ptlock_ptr(pmd_page(*pmd));
+ return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
}
-static inline bool ptlock_init(struct page *page)
+static inline bool ptlock_init(struct ptdesc *ptdesc)
{
/*
* prep_new_page() initialize page->private (and therefore page->ptl)
@@ -2810,10 +2867,10 @@ static inline bool ptlock_init(struct page *page)
* It can happen if arch try to use slab for page table allocation:
* slab code uses page->slab_cache, which share storage with page->ptl.
*/
- VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
- if (!ptlock_alloc(page))
+ VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));
+ if (!ptlock_alloc(ptdesc))
return false;
- spin_lock_init(ptlock_ptr(page));
+ spin_lock_init(ptlock_ptr(ptdesc));
return true;
}
@@ -2826,24 +2883,28 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
return &mm->page_table_lock;
}
static inline void ptlock_cache_init(void) {}
-static inline bool ptlock_init(struct page *page) { return true; }
-static inline void ptlock_free(struct page *page) {}
+static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
+static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */
-static inline bool pgtable_pte_page_ctor(struct page *page)
+static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
{
- if (!ptlock_init(page))
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ if (!ptlock_init(ptdesc))
return false;
- __SetPageTable(page);
- inc_lruvec_page_state(page, NR_PAGETABLE);
+ __folio_set_pgtable(folio);
+ lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
}
-static inline void pgtable_pte_page_dtor(struct page *page)
+static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
{
- ptlock_free(page);
- __ClearPageTable(page);
- dec_lruvec_page_state(page, NR_PAGETABLE);
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ ptlock_free(ptdesc);
+ __folio_clear_pgtable(folio);
+ lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}
pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
@@ -2892,28 +2953,33 @@ static inline struct page *pmd_pgtable_page(pmd_t *pmd)
return virt_to_page((void *)((unsigned long) pmd & mask));
}
+static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
+{
+ return page_ptdesc(pmd_pgtable_page(pmd));
+}
+
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
- return ptlock_ptr(pmd_pgtable_page(pmd));
+ return ptlock_ptr(pmd_ptdesc(pmd));
}
-static inline bool pmd_ptlock_init(struct page *page)
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page->pmd_huge_pte = NULL;
+ ptdesc->pmd_huge_pte = NULL;
#endif
- return ptlock_init(page);
+ return ptlock_init(ptdesc);
}
-static inline void pmd_ptlock_free(struct page *page)
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
+ VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
#endif
- ptlock_free(page);
+ ptlock_free(ptdesc);
}
-#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte)
+#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
#else
@@ -2922,8 +2988,8 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
return &mm->page_table_lock;
}
-static inline bool pmd_ptlock_init(struct page *page) { return true; }
-static inline void pmd_ptlock_free(struct page *page) {}
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {}
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
@@ -2936,20 +3002,24 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
return ptl;
}
-static inline bool pgtable_pmd_page_ctor(struct page *page)
+static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
{
- if (!pmd_ptlock_init(page))
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ if (!pmd_ptlock_init(ptdesc))
return false;
- __SetPageTable(page);
- inc_lruvec_page_state(page, NR_PAGETABLE);
+ __folio_set_pgtable(folio);
+ lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
}
-static inline void pgtable_pmd_page_dtor(struct page *page)
+static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
{
- pmd_ptlock_free(page);
- __ClearPageTable(page);
- dec_lruvec_page_state(page, NR_PAGETABLE);
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ pmd_ptlock_free(ptdesc);
+ __folio_clear_pgtable(folio);
+ lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}
/*
@@ -3004,6 +3074,11 @@ static inline void mark_page_reserved(struct page *page)
adjust_managed_page_count(page, -1);
}
+static inline void free_reserved_ptdesc(struct ptdesc *pt)
+{
+ free_reserved_page(ptdesc_page(pt));
+}
+
/*
* Default method to free all the __init memory into the buddy system.
* The freed pages will be poisoned with pattern "poison" if it's within
@@ -3069,9 +3144,9 @@ extern void mem_init(void);
extern void __init mmap_init(void);
extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
-static inline void show_mem(unsigned int flags, nodemask_t *nodemask)
+static inline void show_mem(void)
{
- __show_mem(flags, nodemask, MAX_NR_ZONES - 1);
+ __show_mem(0, NULL, MAX_NR_ZONES - 1);
}
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
@@ -3509,8 +3584,8 @@ static inline bool debug_pagealloc_enabled(void)
}
/*
- * For use in fast paths after init_debug_pagealloc() has run, or when a
- * false negative result is not harmful when called too early.
+ * For use in fast paths after mem_debugging_and_hardening_init() has run,
+ * or when a false negative result is not harmful when called too early.
*/
static inline bool debug_pagealloc_enabled_static(void)
{
@@ -3665,13 +3740,32 @@ void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap);
#endif
-#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
-static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
- struct dev_pagemap *pgmap)
+#define VMEMMAP_RESERVE_NR 2
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
- return is_power_of_2(sizeof(struct page)) &&
- pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+ unsigned long nr_pages;
+ unsigned long nr_vmemmap_pages;
+
+ if (!pgmap || !is_power_of_2(sizeof(struct page)))
+ return false;
+
+ nr_pages = pgmap_vmemmap_nr(pgmap);
+ nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT);
+ /*
+ * For vmemmap optimization with DAX we need minimum 2 vmemmap
+ * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
+ */
+ return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR);
}
+/*
+ * If we don't have an architecture override, use the generic rule
+ */
+#ifndef vmemmap_can_optimize
+#define vmemmap_can_optimize __vmemmap_can_optimize
+#endif
+
#else
static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
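
For illustration (not part of the diff): a hedged sketch of how an architecture-level PTE allocator might sit on top of pagetable_alloc()/pagetable_free() and the ctor/dtor helpers above. The function names are hypothetical; GFP_PGTABLE_USER is the gfp used by the asm-generic pgalloc helpers.

	static pte_t *example_pte_alloc_one(struct mm_struct *mm)
	{
		struct ptdesc *ptdesc;

		ptdesc = pagetable_alloc(GFP_PGTABLE_USER, 0);
		if (!ptdesc)
			return NULL;
		if (!pagetable_pte_ctor(ptdesc)) {	/* split ptlock + stats */
			pagetable_free(ptdesc);
			return NULL;
		}
		return ptdesc_address(ptdesc);
	}

	static void example_pte_free(struct mm_struct *mm, pte_t *pte)
	{
		struct ptdesc *ptdesc = virt_to_ptdesc(pte);

		pagetable_pte_dtor(ptdesc);
		pagetable_free(ptdesc);
	}
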
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 21d6c72bcc71..8148b30a9df1 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -523,6 +523,27 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
return atomic_read(&mm->tlb_flush_pending) > 1;
}
+#ifdef CONFIG_MMU
+/*
+ * Computes the pte marker to copy from the given source entry into dst_vma.
+ * If no marker should be copied, returns 0.
+ * The caller should insert a new pte created with make_pte_marker().
+ */
+static inline pte_marker copy_pte_marker(
+ swp_entry_t entry, struct vm_area_struct *dst_vma)
+{
+ pte_marker srcm = pte_marker_get(entry);
+ /* Always copy error entries. */
+ pte_marker dstm = srcm & PTE_MARKER_POISONED;
+
+ /* Only copy PTE markers if UFFD register matches. */
+ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
+ dstm |= PTE_MARKER_UFFD_WP;
+
+ return dstm;
+}
+#endif
+
/*
* If this pte is wr-protected by uffd-wp in any form, arm the special pte to
* replace a none pte. NOTE! This should only be called when *pte is already
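
For illustration (an assumption modelled on fork-time PTE copying, not an excerpt from the patch): the marker returned by copy_pte_marker(), if any, is re-materialised with make_pte_marker() in the destination page table.

	static void example_copy_marker_pte(struct mm_struct *dst_mm,
					    struct vm_area_struct *dst_vma,
					    unsigned long addr, pte_t *dst_pte,
					    pte_t orig_pte)
	{
		/* Assumes orig_pte is already known to be a marker entry. */
		swp_entry_t entry = pte_to_swp_entry(orig_pte);
		pte_marker marker = copy_pte_marker(entry, dst_vma);

		if (marker)
			set_pte_at(dst_mm, addr, dst_pte, make_pte_marker(marker));
	}
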
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7d30dc4ff0ff..de5ac95572c8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -141,20 +141,6 @@ struct page {
struct { /* Tail pages of compound page */
unsigned long compound_head; /* Bit zero is set */
};
- struct { /* Page table pages */
- unsigned long _pt_pad_1; /* compound_head */
- pgtable_t pmd_huge_pte; /* protected by page->ptl */
- unsigned long _pt_pad_2; /* mapping */
- union {
- struct mm_struct *pt_mm; /* x86 pgds only */
- atomic_t pt_frag_refcount; /* powerpc */
- };
-#if ALLOC_SPLIT_PTLOCKS
- spinlock_t *ptl;
-#else
- spinlock_t ptl;
-#endif
- };
struct { /* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
struct dev_pagemap *pgmap;
@@ -393,6 +379,85 @@ FOLIO_MATCH(flags, _flags_2);
FOLIO_MATCH(compound_head, _head_2);
#undef FOLIO_MATCH
+/**
+ * struct ptdesc - Memory descriptor for page tables.
+ * @__page_flags: Same as page flags. Unused for page tables.
+ * @pt_rcu_head: For freeing page table pages.
+ * @pt_list: List of used page tables. Used for s390 and x86.
+ * @_pt_pad_1: Padding that aliases with page's compound head.
+ * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
+ * @__page_mapping: Aliases with page->mapping. Unused for page tables.
+ * @pt_mm: Used for x86 pgds.
+ * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only.
+ * @_pt_pad_2: Padding to ensure proper alignment.
+ * @ptl: Lock for the page table.
+ * @__page_type: Same as page->page_type. Unused for page tables.
+ * @_refcount: Same as page refcount. Used for s390 page tables.
+ * @pt_memcg_data: Memcg data. Tracked for page tables here.
+ *
+ * This struct overlays struct page for now. Do not modify without a good
+ * understanding of the issues.
+ */
+struct ptdesc {
+ unsigned long __page_flags;
+
+ union {
+ struct rcu_head pt_rcu_head;
+ struct list_head pt_list;
+ struct {
+ unsigned long _pt_pad_1;
+ pgtable_t pmd_huge_pte;
+ };
+ };
+ unsigned long __page_mapping;
+
+ union {
+ struct mm_struct *pt_mm;
+ atomic_t pt_frag_refcount;
+ };
+
+ union {
+ unsigned long _pt_pad_2;
+#if ALLOC_SPLIT_PTLOCKS
+ spinlock_t *ptl;
+#else
+ spinlock_t ptl;
+#endif
+ };
+ unsigned int __page_type;
+ atomic_t _refcount;
+#ifdef CONFIG_MEMCG
+ unsigned long pt_memcg_data;
+#endif
+};
+
+#define TABLE_MATCH(pg, pt) \
+ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))
+TABLE_MATCH(flags, __page_flags);
+TABLE_MATCH(compound_head, pt_list);
+TABLE_MATCH(compound_head, _pt_pad_1);
+TABLE_MATCH(mapping, __page_mapping);
+TABLE_MATCH(rcu_head, pt_rcu_head);
+TABLE_MATCH(page_type, __page_type);
+TABLE_MATCH(_refcount, _refcount);
+#ifdef CONFIG_MEMCG
+TABLE_MATCH(memcg_data, pt_memcg_data);
+#endif
+#undef TABLE_MATCH
+static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
+
+#define ptdesc_page(pt) (_Generic((pt), \
+ const struct ptdesc *: (const struct page *)(pt), \
+ struct ptdesc *: (struct page *)(pt)))
+
+#define ptdesc_folio(pt) (_Generic((pt), \
+ const struct ptdesc *: (const struct folio *)(pt), \
+ struct ptdesc *: (struct folio *)(pt)))
+
+#define page_ptdesc(p) (_Generic((p), \
+ const struct page *: (const struct ptdesc *)(p), \
+ struct page *: (struct ptdesc *)(p)))
+
/*
* Used for sizing the vmemmap region on some architectures
*/
@@ -812,7 +877,7 @@ struct mm_struct {
#ifdef CONFIG_KSM
/*
* Represent how many pages of this process are involved in KSM
- * merging.
+ * merging (not including ksm_zero_pages).
*/
unsigned long ksm_merging_pages;
/*
@@ -820,7 +885,12 @@ struct mm_struct {
* including merged and not merged.
*/
unsigned long ksm_rmap_items;
-#endif
+ /*
+ * Represents how many empty pages are merged with kernel zero
+ * pages when KSM use_zero_pages is enabled.
+ */
+ unsigned long ksm_zero_pages;
+#endif /* CONFIG_KSM */
#ifdef CONFIG_LRU_GEN
struct {
/* this mm_struct is on lru_gen_mm_list */
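
For illustration (not part of the diff): because struct ptdesc overlays struct page, the _Generic() conversion macros above are pure pointer casts and round-trip losslessly between the three views of the same memory.

	static void ptdesc_round_trip(struct page *page)
	{
		struct ptdesc *pt = page_ptdesc(page);
		struct folio *folio = ptdesc_folio(pt);

		/* All three pointers name the same underlying memory. */
		WARN_ON(ptdesc_page(pt) != page);
		WARN_ON(&folio->page != page);
	}
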
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 5414b5c6a103..aa44fff8bb9d 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -52,8 +52,8 @@ struct tlbflush_unmap_batch {
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
* The arch code makes the following promise: generic code can modify a
- * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
- * needed barriers), then call arch_tlbbatch_flush(), and the entries
+ * PTE, then call arch_tlbbatch_add_pending() (which internally provides
+ * all needed barriers), then call arch_tlbbatch_flush(), and the entries
* will be flushed on all CPUs by the time that arch_tlbbatch_flush()
* returns.
*/
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e05e167dbd16..8d38dcb6d044 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -73,6 +73,14 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
}
#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Drop all currently-held per-VMA locks.
+ * This is called from the mmap_lock implementation directly before releasing
+ * a write-locked mmap_lock (or downgrading it to read-locked).
+ * This should normally NOT be called manually from other places.
+ * If you want to call this manually anyway, keep in mind that this will release
+ * *all* VMA write locks, including ones from further up the stack.
+ */
static inline void vma_end_write_all(struct mm_struct *mm)
{
mmap_assert_write_locked(mm);
@@ -118,16 +126,6 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm)
return ret;
}
-static inline bool mmap_write_trylock(struct mm_struct *mm)
-{
- bool ret;
-
- __mmap_lock_trace_start_locking(mm, true);
- ret = down_write_trylock(&mm->mmap_lock) != 0;
- __mmap_lock_trace_acquire_returned(mm, true, ret);
- return ret;
-}
-
static inline void mmap_write_unlock(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, true);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 64a3e051c3c4..6e3c857606f1 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -187,27 +187,27 @@ struct mmu_notifier_ops {
const struct mmu_notifier_range *range);
/*
- * invalidate_range() is either called between
- * invalidate_range_start() and invalidate_range_end() when the
- * VM has to free pages that where unmapped, but before the
- * pages are actually freed, or outside of _start()/_end() when
- * a (remote) TLB is necessary.
+ * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB
+ * which shares page-tables with the CPU. The
+ * invalidate_range_start()/end() callbacks should not be implemented as
+ * arch_invalidate_secondary_tlbs() already catches the points in time when
+ * an external TLB needs to be flushed.
*
- * If invalidate_range() is used to manage a non-CPU TLB with
- * shared page-tables, it not necessary to implement the
- * invalidate_range_start()/end() notifiers, as
- * invalidate_range() already catches the points in time when an
- * external TLB range needs to be flushed. For more in depth
- * discussion on this see Documentation/mm/mmu_notifier.rst
+ * This requires arch_invalidate_secondary_tlbs() to be called while
+ * holding the ptl spin-lock and therefore this callback is not allowed
+ * to sleep.
*
- * Note that this function might be called with just a sub-range
- * of what was passed to invalidate_range_start()/end(), if
- * called between those functions.
+ * This is called by architecture code whenever invalidating a TLB
+ * entry. It is assumed that any secondary TLB has the same rules for
+ * when invalidations are required. If this is not the case, architecture
+ * code will need to call this explicitly when required for secondary
+ * TLB invalidation.
*/
- void (*invalidate_range)(struct mmu_notifier *subscription,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end);
+ void (*arch_invalidate_secondary_tlbs)(
+ struct mmu_notifier *subscription,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end);
/*
* These callbacks are used with the get/put interface to manage the
@@ -395,10 +395,9 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm,
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
-extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
- bool only_end);
-extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
- unsigned long start, unsigned long end);
+extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r);
+extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
+ unsigned long start, unsigned long end);
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);
@@ -481,21 +480,14 @@ mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
might_sleep();
if (mm_has_notifiers(range->mm))
- __mmu_notifier_invalidate_range_end(range, false);
-}
-
-static inline void
-mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
-{
- if (mm_has_notifiers(range->mm))
- __mmu_notifier_invalidate_range_end(range, true);
+ __mmu_notifier_invalidate_range_end(range);
}
-static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
- unsigned long start, unsigned long end)
+static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
if (mm_has_notifiers(mm))
- __mmu_notifier_invalidate_range(mm, start, end);
+ __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
@@ -582,45 +574,6 @@ static inline void mmu_notifier_range_init_owner(
__young; \
})
-#define ptep_clear_flush_notify(__vma, __address, __ptep) \
-({ \
- unsigned long ___addr = __address & PAGE_MASK; \
- struct mm_struct *___mm = (__vma)->vm_mm; \
- pte_t ___pte; \
- \
- ___pte = ptep_clear_flush(__vma, __address, __ptep); \
- mmu_notifier_invalidate_range(___mm, ___addr, \
- ___addr + PAGE_SIZE); \
- \
- ___pte; \
-})
-
-#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \
-({ \
- unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
- struct mm_struct *___mm = (__vma)->vm_mm; \
- pmd_t ___pmd; \
- \
- ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \
- mmu_notifier_invalidate_range(___mm, ___haddr, \
- ___haddr + HPAGE_PMD_SIZE); \
- \
- ___pmd; \
-})
-
-#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \
-({ \
- unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \
- struct mm_struct *___mm = (__vma)->vm_mm; \
- pud_t ___pud; \
- \
- ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \
- mmu_notifier_invalidate_range(___mm, ___haddr, \
- ___haddr + HPAGE_PUD_SIZE); \
- \
- ___pud; \
-})
-
/*
* set_pte_at_notify() sets the pte _after_ running the notifier.
* This is safe to start by updating the secondary MMUs, because the primary MMU
@@ -711,12 +664,7 @@ void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}
-static inline void
-mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
-{
-}
-
-static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
+static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
}
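
For illustration (driver and helper names are made up): a subscriber that manages a device TLB sharing the CPU page tables would now hook the renamed callback, keeping in mind that it runs under the ptl and must not sleep.

	/* Placeholder for a non-blocking device TLB flush primitive. */
	void example_device_flush_tlb_range(struct mmu_notifier *mn,
					    unsigned long start,
					    unsigned long end);

	static void example_invalidate_secondary_tlbs(struct mmu_notifier *mn,
						      struct mm_struct *mm,
						      unsigned long start,
						      unsigned long end)
	{
		/* Called with the ptl held: must not sleep. */
		example_device_flush_tlb_range(mn, start, end);
	}

	static const struct mmu_notifier_ops example_mn_ops = {
		.arch_invalidate_secondary_tlbs = example_invalidate_secondary_tlbs,
	};
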
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5e50b78d58ea..4106fbc5b4b3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -676,7 +676,6 @@ enum zone_watermarks {
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
-/* Fields and list protected by pagesets local_lock in page_alloc.c */
struct per_cpu_pages {
spinlock_t lock; /* Protects lists field */
int count; /* number of pages in the list */
diff --git a/include/linux/net_mm.h b/include/linux/net_mm.h
deleted file mode 100644
index b298998bd5a0..000000000000
--- a/include/linux/net_mm.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifdef CONFIG_MMU
-
-#ifdef CONFIG_INET
-extern const struct vm_operations_struct tcp_vm_ops;
-static inline bool vma_is_tcp(const struct vm_area_struct *vma)
-{
- return vma->vm_ops == &tcp_vm_ops;
-}
-#else
-static inline bool vma_is_tcp(const struct vm_area_struct *vma)
-{
- return false;
-}
-#endif /* CONFIG_INET*/
-
-#endif /* CONFIG_MMU */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 92a2063a0a23..9218028caf33 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -908,6 +908,8 @@ static inline bool is_page_hwpoison(struct page *page)
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
+#define folio_test_type(folio, flag) \
+ ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
static inline int page_type_has_type(unsigned int page_type)
{
@@ -919,27 +921,41 @@ static inline int page_has_type(struct page *page)
return page_type_has_type(page->page_type);
}
-#define PAGE_TYPE_OPS(uname, lname) \
-static __always_inline int Page##uname(struct page *page) \
+#define PAGE_TYPE_OPS(uname, lname, fname) \
+static __always_inline int Page##uname(const struct page *page) \
{ \
return PageType(page, PG_##lname); \
} \
+static __always_inline int folio_test_##fname(const struct folio *folio)\
+{ \
+ return folio_test_type(folio, PG_##lname); \
+} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!PageType(page, 0), page); \
page->page_type &= ~PG_##lname; \
} \
+static __always_inline void __folio_set_##fname(struct folio *folio) \
+{ \
+ VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \
+ folio->page.page_type &= ~PG_##lname; \
+} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type |= PG_##lname; \
-}
+} \
+static __always_inline void __folio_clear_##fname(struct folio *folio) \
+{ \
+ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
+ folio->page.page_type |= PG_##lname; \
+} \
/*
* PageBuddy() indicates that the page is free and in the buddy system
* (see mm/page_alloc.c).
*/
-PAGE_TYPE_OPS(Buddy, buddy)
+PAGE_TYPE_OPS(Buddy, buddy, buddy)
/*
* PageOffline() indicates that the page is logically offline although the
@@ -963,7 +979,7 @@ PAGE_TYPE_OPS(Buddy, buddy)
* pages should check PageOffline() and synchronize with such drivers using
* page_offline_freeze()/page_offline_thaw().
*/
-PAGE_TYPE_OPS(Offline, offline)
+PAGE_TYPE_OPS(Offline, offline, offline)
extern void page_offline_freeze(void);
extern void page_offline_thaw(void);
@@ -973,12 +989,12 @@ extern void page_offline_end(void);
/*
* Marks pages in use as page tables.
*/
-PAGE_TYPE_OPS(Table, table)
+PAGE_TYPE_OPS(Table, table, pgtable)
/*
* Marks guardpages used with debug_pagealloc.
*/
-PAGE_TYPE_OPS(Guard, guard)
+PAGE_TYPE_OPS(Guard, guard, guard)
extern bool is_free_buddy_page(struct page *page);
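
For reference, a sketch of the declarations the extended macro generates (static __always_inline qualifiers dropped for brevity): the third argument only controls the folio-level names, so PAGE_TYPE_OPS(Table, table, pgtable) now produces both page and folio accessors.

	int PageTable(const struct page *page);
	int folio_test_pgtable(const struct folio *folio);
	void __SetPageTable(struct page *page);
	void __folio_set_pgtable(struct folio *folio);
	void __ClearPageTable(struct page *page);
	void __folio_clear_pgtable(struct folio *folio);
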
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 67314f648aeb..be98564191e6 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -8,6 +8,7 @@
struct pglist_data;
+#ifdef CONFIG_PAGE_EXTENSION
/**
* struct page_ext_operations - per page_ext client operations
* @offset: Offset to the client's data within page_ext. Offset is returned to
@@ -29,8 +30,6 @@ struct page_ext_operations {
bool need_shared_flags;
};
-#ifdef CONFIG_PAGE_EXTENSION
-
/*
* The page_ext_flags users must set need_shared_flags to true.
*/
@@ -82,6 +81,12 @@ static inline void page_ext_init(void)
extern struct page_ext *page_ext_get(struct page *page);
extern void page_ext_put(struct page_ext *page_ext);
+static inline void *page_ext_data(struct page_ext *page_ext,
+ struct page_ext_operations *ops)
+{
+ return (void *)(page_ext) + ops->offset;
+}
+
static inline struct page_ext *page_ext_next(struct page_ext *curr)
{
void *next = curr;
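
For illustration (the client struct and ops are placeholders): page_ext_data() replaces open-coded "page_ext + ops->offset" arithmetic in page_ext clients; the caller still pairs page_ext_get() with page_ext_put() once it is done with the data.

	struct example_ext_data {
		unsigned long stamp;
	};

	/* A real client registers this; ->offset is filled in by page_ext. */
	static struct page_ext_operations example_ext_ops;

	static unsigned long example_read_stamp(struct page *page)
	{
		struct page_ext *page_ext = page_ext_get(page);
		unsigned long stamp = 0;

		if (page_ext) {
			struct example_ext_data *data =
				page_ext_data(page_ext, &example_ext_ops);

			stamp = data->stamp;
			page_ext_put(page_ext);
		}
		return stamp;
	}
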
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 5cb7bd2078ec..d8f344840643 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -144,9 +144,4 @@ static inline void set_page_idle(struct page *page)
{
folio_set_idle(page_folio(page));
}
-
-static inline void clear_page_idle(struct page *page)
-{
- folio_clear_idle(page_folio(page));
-}
#endif /* _LINUX_MM_PAGE_IDLE_H */
diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h
index 01e16c7696ec..7f6b9bf926c5 100644
--- a/include/linux/page_table_check.h
+++ b/include/linux/page_table_check.h
@@ -14,18 +14,12 @@ extern struct static_key_true page_table_check_disabled;
extern struct page_ext_operations page_table_check_ops;
void __page_table_check_zero(struct page *page, unsigned int order);
-void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t pte);
-void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
- pmd_t pmd);
-void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
- pud_t pud);
-void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte);
-void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd);
-void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
- pud_t *pudp, pud_t pud);
+void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte);
+void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd);
+void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud);
+void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte);
+void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd);
+void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
unsigned long addr,
pmd_t pmd);
@@ -46,61 +40,55 @@ static inline void page_table_check_free(struct page *page, unsigned int order)
__page_table_check_zero(page, order);
}
-static inline void page_table_check_pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t pte)
+static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pte_clear(mm, addr, pte);
+ __page_table_check_pte_clear(mm, pte);
}
-static inline void page_table_check_pmd_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t pmd)
+static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pmd_clear(mm, addr, pmd);
+ __page_table_check_pmd_clear(mm, pmd);
}
-static inline void page_table_check_pud_clear(struct mm_struct *mm,
- unsigned long addr, pud_t pud)
+static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pud_clear(mm, addr, pud);
+ __page_table_check_pud_clear(mm, pud);
}
-static inline void page_table_check_pte_set(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep,
+static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep,
pte_t pte)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pte_set(mm, addr, ptep, pte);
+ __page_table_check_pte_set(mm, ptep, pte);
}
-static inline void page_table_check_pmd_set(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp,
+static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp,
pmd_t pmd)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pmd_set(mm, addr, pmdp, pmd);
+ __page_table_check_pmd_set(mm, pmdp, pmd);
}
-static inline void page_table_check_pud_set(struct mm_struct *mm,
- unsigned long addr, pud_t *pudp,
+static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp,
pud_t pud)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pud_set(mm, addr, pudp, pud);
+ __page_table_check_pud_set(mm, pudp, pud);
}
static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
@@ -123,35 +111,29 @@ static inline void page_table_check_free(struct page *page, unsigned int order)
{
}
-static inline void page_table_check_pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t pte)
+static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
}
-static inline void page_table_check_pmd_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t pmd)
+static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
}
-static inline void page_table_check_pud_clear(struct mm_struct *mm,
- unsigned long addr, pud_t pud)
+static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
}
-static inline void page_table_check_pte_set(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep,
+static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep,
pte_t pte)
{
}
-static inline void page_table_check_pmd_set(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp,
+static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp,
pmd_t pmd)
{
}
-static inline void page_table_check_pud_set(struct mm_struct *mm,
- unsigned long addr, pud_t *pudp,
+static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp,
pud_t pud)
{
}
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 716953ee1ebd..f4f24b594cd7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -203,6 +203,7 @@ enum mapping_flags {
/* writeback related tags are not used */
AS_NO_WRITEBACK_TAGS = 5,
AS_LARGE_FOLIO_SUPPORT = 6,
+ AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */
};
/**
@@ -273,6 +274,21 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping)
return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}
+static inline bool mapping_release_always(const struct address_space *mapping)
+{
+ return test_bit(AS_RELEASE_ALWAYS, &mapping->flags);
+}
+
+static inline void mapping_set_release_always(struct address_space *mapping)
+{
+ set_bit(AS_RELEASE_ALWAYS, &mapping->flags);
+}
+
+static inline void mapping_clear_release_always(struct address_space *mapping)
+{
+ clear_bit(AS_RELEASE_ALWAYS, &mapping->flags);
+}
+
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
return mapping->gfp_mask;
@@ -1044,11 +1060,6 @@ static inline void wait_on_page_locked(struct page *page)
folio_wait_locked(page_folio(page));
}
-static inline int wait_on_page_locked_killable(struct page *page)
-{
- return folio_wait_locked_killable(page_folio(page));
-}
-
void wait_on_page_writeback(struct page *page);
void folio_wait_writeback(struct folio *folio);
int folio_wait_writeback_killable(struct folio *folio);
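
For illustration (not part of this patch): a filesystem that wants ->release_folio() invoked even when a folio carries no private data opts in once, when it sets up the inode's mapping. The function below is a hypothetical sketch.

	#include <linux/fs.h>
	#include <linux/pagemap.h>

	static void example_fs_init_mapping(struct inode *inode)
	{
		/* Have ->release_folio() called even without folio private data. */
		mapping_set_release_always(inode->i_mapping);
	}
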
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5063b482e34f..f34e0f2cb4d8 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -99,7 +99,7 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address)))
#define pte_unmap(pte) do { \
kunmap_local((pte)); \
- /* rcu_read_unlock() to be added later */ \
+ rcu_read_unlock(); \
} while (0)
#else
static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
@@ -108,10 +108,12 @@ static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
}
static inline void pte_unmap(pte_t *pte)
{
- /* rcu_read_unlock() to be added later */
+ rcu_read_unlock();
}
#endif
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
/* Find an entry in the second-level page table.. */
#ifndef pmd_offset
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@ -320,7 +322,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
{
pte_t pte = ptep_get(ptep);
pte_clear(mm, address, ptep);
- page_table_check_pte_clear(mm, address, pte);
+ page_table_check_pte_clear(mm, pte);
return pte;
}
#endif
@@ -390,6 +392,7 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
return pmd;
}
#define pmdp_get_lockless pmdp_get_lockless
+#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */
@@ -408,6 +411,9 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
return pmdp_get(pmdp);
}
+static inline void pmdp_get_lockless_sync(void)
+{
+}
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -419,7 +425,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
pmd_t pmd = *pmdp;
pmd_clear(pmdp);
- page_table_check_pmd_clear(mm, address, pmd);
+ page_table_check_pmd_clear(mm, pmd);
return pmd;
}
@@ -432,7 +438,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
pud_t pud = *pudp;
pud_clear(pudp);
- page_table_check_pud_clear(mm, address, pud);
+ page_table_check_pud_clear(mm, pud);
return pud;
}
@@ -450,11 +456,11 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
#endif
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
-static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
+static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
int full)
{
- return pudp_huge_get_and_clear(mm, address, pudp);
+ return pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -558,6 +564,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
#endif
#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void pudp_set_wrprotect(struct mm_struct *mm,
unsigned long address, pud_t *pudp)
{
@@ -571,6 +578,7 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm,
{
BUILD_BUG();
}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif
@@ -693,11 +701,14 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
return pmd_val(pmd_a) == pmd_val(pmd_b);
}
+#endif
+#ifndef pud_same
static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
return pud_val(pud_a) == pud_val(pud_b);
}
+#define pud_same pud_same
#endif
#ifndef __HAVE_ARCH_P4D_SAME
@@ -1499,6 +1510,9 @@ typedef unsigned int pgtbl_mod_mask;
#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
#endif
+#ifndef has_transparent_pud_hugepage
+#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+#endif
/*
* On some architectures it depends on the mm if the p4d/pud or pmd
* layer of the page table hierarchy is folded or not.
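
For illustration (not part of this patch): with pte_unmap() now dropping the RCU read lock taken when the PTE was mapped, a page-table walker must stop using the pte pointer once it unmaps, and must cope with pte_offset_map_lock() failing. A minimal sketch with a hypothetical function name:

	#include <linux/mm.h>
	#include <linux/pgtable.h>

	static int example_read_pte(struct mm_struct *mm, pmd_t *pmd,
				    unsigned long addr, pte_t *value)
	{
		spinlock_t *ptl;
		pte_t *pte;

		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
		if (!pte)
			return -EAGAIN;		/* page table vanished under us */

		*value = ptep_get(pte);
		pte_unmap_unlock(pte, ptl);	/* also ends the RCU read-side section */
		return 0;
	}
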
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index c758809d5bcf..f9f9931e02d6 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -17,18 +17,10 @@
struct fs_pin;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
-/*
- * sysctl for vm.memfd_noexec
- * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
- * acts like MFD_EXEC was set.
- * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
- * acts like MFD_NOEXEC_SEAL was set.
- * 2: memfd_create() without MFD_NOEXEC_SEAL will be
- * rejected.
- */
-#define MEMFD_NOEXEC_SCOPE_EXEC 0
-#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1
-#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2
+/* modes for vm.memfd_noexec sysctl */
+#define MEMFD_NOEXEC_SCOPE_EXEC 0 /* MFD_EXEC implied if unset */
+#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 /* MFD_NOEXEC_SEAL implied if unset */
+#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 /* same as 1, except MFD_EXEC rejected */
#endif
struct pid_namespace {
@@ -47,7 +39,6 @@ struct pid_namespace {
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
- /* sysctl for vm.memfd_noexec */
int memfd_noexec_scope;
#endif
} __randomize_layout;
@@ -64,6 +55,23 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
return ns;
}
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
+{
+ int scope = MEMFD_NOEXEC_SCOPE_EXEC;
+
+ for (; ns; ns = ns->parent)
+ scope = max(scope, READ_ONCE(ns->memfd_noexec_scope));
+
+ return scope;
+}
+#else
+static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
+{
+ return 0;
+}
+#endif
+
extern struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *ns);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
@@ -78,6 +86,11 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
return ns;
}
+static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
+{
+ return 0;
+}
+
static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *ns)
{
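
For illustration (not part of this patch): pidns_memfd_noexec_scope() walks up the namespace hierarchy and takes the maximum scope, so a child namespace can never be less strict than its ancestors. Assuming CONFIG_SYSCTL and CONFIG_MEMFD_CREATE, a caller checking for the strictest mode might look like this (the helper below is hypothetical):

	#include <linux/pid.h>
	#include <linux/pid_namespace.h>
	#include <linux/sched.h>

	static bool example_memfd_noexec_enforced(void)
	{
		int scope = pidns_memfd_noexec_scope(task_active_pid_ns(current));

		/* Strictest mode: memfd_create() without MFD_NOEXEC_SEAL is rejected. */
		return scope >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED;
	}
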
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 456546443f1f..bb5adc604144 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -302,10 +302,6 @@ struct swap_info_struct {
struct file *swap_file; /* seldom referenced */
unsigned int old_block_size; /* seldom referenced */
struct completion comp; /* seldom referenced */
-#ifdef CONFIG_FRONTSWAP
- unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
- atomic_t frontswap_pages; /* frontswap pages in-use counter */
-#endif
spinlock_t lock; /*
* protect map scan related fields like
* swap_map, lowest_bit, highest_bit,
@@ -630,11 +626,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
}
#endif
-#ifdef CONFIG_ZSWAP
-extern u64 zswap_pool_total_size;
-extern atomic_t zswap_stored_pages;
-#endif
-
#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 7ed529a77c5b..99e3ed469e88 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -2,11 +2,6 @@
#ifndef _LINUX_SWAPFILE_H
#define _LINUX_SWAPFILE_H
-/*
- * these were static in swapfile.c but frontswap.c needs them and we don't
- * want to expose them to the dozens of source files that include swap.h
- */
-extern struct swap_info_struct *swap_info[];
extern unsigned long generic_max_swapfile_size(void);
unsigned long arch_max_swapfile_size(void);
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 4c932cb45e0b..bff1e8d97de0 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -393,7 +393,12 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry)
typedef unsigned long pte_marker;
#define PTE_MARKER_UFFD_WP BIT(0)
-#define PTE_MARKER_SWAPIN_ERROR BIT(1)
+/*
+ * "Poisoned" here is meant in the very general sense of "future accesses are
+ * invalid", instead of referring very specifically to hardware memory errors.
+ * This marker is meant to represent any of various different causes of this.
+ */
+#define PTE_MARKER_POISONED BIT(1)
#define PTE_MARKER_MASK (BIT(2) - 1)
static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
@@ -421,15 +426,15 @@ static inline pte_t make_pte_marker(pte_marker marker)
return swp_entry_to_pte(make_pte_marker_entry(marker));
}
-static inline swp_entry_t make_swapin_error_entry(void)
+static inline swp_entry_t make_poisoned_swp_entry(void)
{
- return make_pte_marker_entry(PTE_MARKER_SWAPIN_ERROR);
+ return make_pte_marker_entry(PTE_MARKER_POISONED);
}
-static inline int is_swapin_error_entry(swp_entry_t entry)
+static inline int is_poisoned_swp_entry(swp_entry_t entry)
{
return is_pte_marker_entry(entry) &&
- (pte_marker_get(entry) & PTE_MARKER_SWAPIN_ERROR);
+ (pte_marker_get(entry) & PTE_MARKER_POISONED);
}
/*
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ac7b0c96d351..ac8c6854097c 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -46,6 +46,7 @@ enum mfill_atomic_mode {
MFILL_ATOMIC_COPY,
MFILL_ATOMIC_ZEROPAGE,
MFILL_ATOMIC_CONTINUE,
+ MFILL_ATOMIC_POISON,
NR_MFILL_ATOMIC_MODES,
};
@@ -83,6 +84,9 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long len, atomic_t *mmap_changing,
uffd_flags_t flags);
+extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
new file mode 100644
index 000000000000..2a60ce39cfde
--- /dev/null
+++ b/include/linux/zswap.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ZSWAP_H
+#define _LINUX_ZSWAP_H
+
+#include <linux/types.h>
+#include <linux/mm_types.h>
+
+extern u64 zswap_pool_total_size;
+extern atomic_t zswap_stored_pages;
+
+#ifdef CONFIG_ZSWAP
+
+bool zswap_store(struct folio *folio);
+bool zswap_load(struct folio *folio);
+void zswap_invalidate(int type, pgoff_t offset);
+void zswap_swapon(int type);
+void zswap_swapoff(int type);
+
+#else
+
+static inline bool zswap_store(struct folio *folio)
+{
+ return false;
+}
+
+static inline bool zswap_load(struct folio *folio)
+{
+ return false;
+}
+
+static inline void zswap_invalidate(int type, pgoff_t offset) {}
+static inline void zswap_swapon(int type) {}
+static inline void zswap_swapoff(int type) {}
+
+#endif
+
+#endif /* _LINUX_ZSWAP_H */
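
For illustration (not part of this patch): the new header lets a swap-out path try zswap first and fall back to the backing device only when the store is refused. A rough sketch; everything except zswap_store() is hypothetical:

	#include <linux/pagemap.h>
	#include <linux/zswap.h>

	/* Hypothetical stand-in for submitting the folio to the swap device. */
	static void example_submit_to_swap_device(struct folio *folio)
	{
	}

	static void example_swap_writepage(struct folio *folio)
	{
		if (zswap_store(folio)) {
			/* Compressed copy kept in memory; no device I/O needed. */
			folio_unlock(folio);
			return;
		}

		example_submit_to_swap_device(folio);
	}
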
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0ca972ebd3dd..3a818fe1a8a5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -45,7 +45,6 @@
#include <linux/memcontrol.h>
#include <linux/bpf-cgroup.h>
#include <linux/siphash.h>
-#include <linux/net_mm.h>
extern struct inet_hashinfo tcp_hashinfo;
diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
index 202b3e3e67ff..f50048af5fcc 100644
--- a/include/trace/events/thp.h
+++ b/include/trace/events/thp.h
@@ -8,25 +8,34 @@
#include <linux/types.h>
#include <linux/tracepoint.h>
-TRACE_EVENT(hugepage_set_pmd,
+DECLARE_EVENT_CLASS(hugepage_set,
- TP_PROTO(unsigned long addr, unsigned long pmd),
- TP_ARGS(addr, pmd),
+ TP_PROTO(unsigned long addr, unsigned long pte),
+ TP_ARGS(addr, pte),
TP_STRUCT__entry(
__field(unsigned long, addr)
- __field(unsigned long, pmd)
+ __field(unsigned long, pte)
),
TP_fast_assign(
__entry->addr = addr;
- __entry->pmd = pmd;
+ __entry->pte = pte;
),
- TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd)
+ TP_printk("Set page table entry with 0x%lx with 0x%lx", __entry->addr, __entry->pte)
);
+DEFINE_EVENT(hugepage_set, hugepage_set_pmd,
+ TP_PROTO(unsigned long addr, unsigned long pmd),
+ TP_ARGS(addr, pmd)
+);
-TRACE_EVENT(hugepage_update,
+DEFINE_EVENT(hugepage_set, hugepage_set_pud,
+ TP_PROTO(unsigned long addr, unsigned long pud),
+ TP_ARGS(addr, pud)
+);
+
+DECLARE_EVENT_CLASS(hugepage_update,
TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, unsigned long set),
TP_ARGS(addr, pte, clr, set),
@@ -48,6 +57,16 @@ TRACE_EVENT(hugepage_update,
TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set)
);
+DEFINE_EVENT(hugepage_update, hugepage_update_pmd,
+ TP_PROTO(unsigned long addr, unsigned long pmd, unsigned long clr, unsigned long set),
+ TP_ARGS(addr, pmd, clr, set)
+);
+
+DEFINE_EVENT(hugepage_update, hugepage_update_pud,
+ TP_PROTO(unsigned long addr, unsigned long pud, unsigned long clr, unsigned long set),
+ TP_ARGS(addr, pud, clr, set)
+);
+
DECLARE_EVENT_CLASS(migration_pmd,
TP_PROTO(unsigned long addr, unsigned long pmd),
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 66dd4cd277bd..62151706c5a3 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -39,7 +39,8 @@
UFFD_FEATURE_MINOR_SHMEM | \
UFFD_FEATURE_EXACT_ADDRESS | \
UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \
- UFFD_FEATURE_WP_UNPOPULATED)
+ UFFD_FEATURE_WP_UNPOPULATED | \
+ UFFD_FEATURE_POISON)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -49,12 +50,14 @@
(__u64)1 << _UFFDIO_COPY | \
(__u64)1 << _UFFDIO_ZEROPAGE | \
(__u64)1 << _UFFDIO_WRITEPROTECT | \
- (__u64)1 << _UFFDIO_CONTINUE)
+ (__u64)1 << _UFFDIO_CONTINUE | \
+ (__u64)1 << _UFFDIO_POISON)
#define UFFD_API_RANGE_IOCTLS_BASIC \
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_WRITEPROTECT | \
(__u64)1 << _UFFDIO_CONTINUE | \
- (__u64)1 << _UFFDIO_WRITEPROTECT)
+ (__u64)1 << _UFFDIO_POISON)
/*
* Valid ioctl command number range with this API is from 0x00 to
@@ -71,6 +74,7 @@
#define _UFFDIO_ZEROPAGE (0x04)
#define _UFFDIO_WRITEPROTECT (0x06)
#define _UFFDIO_CONTINUE (0x07)
+#define _UFFDIO_POISON (0x08)
#define _UFFDIO_API (0x3F)
/* userfaultfd ioctl ids */
@@ -91,6 +95,8 @@
struct uffdio_writeprotect)
#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
struct uffdio_continue)
+#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \
+ struct uffdio_poison)
/* read() structure */
struct uffd_msg {
@@ -225,6 +231,7 @@ struct uffdio_api {
#define UFFD_FEATURE_EXACT_ADDRESS (1<<11)
#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
#define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
+#define UFFD_FEATURE_POISON (1<<14)
__u64 features;
__u64 ioctls;
@@ -321,6 +328,18 @@ struct uffdio_continue {
__s64 mapped;
};
+struct uffdio_poison {
+ struct uffdio_range range;
+#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * Fields below here are written by the ioctl and must be at the end:
+ * the copy_from_user will not read past here.
+ */
+ __s64 updated;
+};
+
/*
* Flags for the userfaultfd(2) system call itself.
*/
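
For illustration (not part of this patch): a userspace fault handler can resolve a fault by poisoning the range, so later accesses raise SIGBUS instead of being served data. A minimal sketch; error handling is abbreviated:

	#include <errno.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/userfaultfd.h>

	static int64_t poison_range(int uffd, uint64_t addr, uint64_t len)
	{
		struct uffdio_poison poison = {
			.range = { .start = addr, .len = len },
			.mode  = 0,		/* or UFFDIO_POISON_MODE_DONTWAKE */
		};

		if (ioctl(uffd, UFFDIO_POISON, &poison) == -1)
			return -errno;

		return poison.updated;	/* how much of the range was handled */
	}
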
diff --git a/init/initramfs.c b/init/initramfs.c
index e7a01c2ccd1b..8d0fd946cdd2 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -61,7 +61,7 @@ static void __init error(char *x)
}
#define panic_show_mem(fmt, ...) \
- ({ show_mem(0, NULL); panic(fmt, ##__VA_ARGS__); })
+ ({ show_mem(); panic(fmt, ##__VA_ARGS__); })
/* link hash */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 78ae7b6f90fd..e78751fee7fe 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8631,7 +8631,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
unsigned int size;
char tmp[16];
char *buf = NULL;
- char *name;
+ char *name = NULL;
if (vma->vm_flags & VM_READ)
prot |= PROT_READ;
@@ -8678,29 +8678,18 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
goto got_name;
} else {
- if (vma->vm_ops && vma->vm_ops->name) {
+ if (vma->vm_ops && vma->vm_ops->name)
name = (char *) vma->vm_ops->name(vma);
- if (name)
- goto cpy_name;
+ if (!name)
+ name = (char *)arch_vma_name(vma);
+ if (!name) {
+ if (vma_is_initial_heap(vma))
+ name = "[heap]";
+ else if (vma_is_initial_stack(vma))
+ name = "[stack]";
+ else
+ name = "//anon";
}
-
- name = (char *)arch_vma_name(vma);
- if (name)
- goto cpy_name;
-
- if (vma->vm_start <= vma->vm_mm->start_brk &&
- vma->vm_end >= vma->vm_mm->brk) {
- name = "[heap]";
- goto cpy_name;
- }
- if (vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack) {
- name = "[stack]";
- goto cpy_name;
- }
-
- name = "//anon";
- goto cpy_name;
}
cpy_name:
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f0ac5b874919..3048589e2e85 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
}
flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
- ptep_clear_flush_notify(vma, addr, pvmw.pte);
+ ptep_clear_flush(vma, addr, pvmw.pte);
if (new_page)
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 514e4582b863..f10587d1d481 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1132,8 +1132,7 @@ static int __init futex_init(void)
#endif
futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0,
- futex_hashsize < 256 ? HASH_SMALL : 0,
+ futex_hashsize, 0, 0,
&futex_shift, NULL,
futex_hashsize, futex_hashsize);
futex_hashsize = 1UL << futex_shift;
diff --git a/kernel/iomem.c b/kernel/iomem.c
index 62c92e43aa0d..dc2120776e1c 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -3,19 +3,16 @@
#include <linux/types.h>
#include <linux/io.h>
#include <linux/mm.h>
-
-#ifndef ioremap_cache
-/* temporary while we convert existing ioremap_cache users to memremap */
-__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
-{
- return ioremap(offset, size);
-}
-#endif
+#include <linux/ioremap.h>
#ifndef arch_memremap_wb
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
{
+#ifdef ioremap_cache
return (__force void *)ioremap_cache(offset, size);
+#else
+ return (__force void *)ioremap(offset, size);
+#endif
}
#endif
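
For illustration (not part of this patch): write-back requests reach arch_memremap_wb() via memremap(..., MEMREMAP_WB), which now uses ioremap_cache() only where the architecture provides it. A hypothetical caller, with made-up resource values:

	#include <linux/io.h>
	#include <linux/sizes.h>

	static void *example_map_carveout(void)
	{
		resource_size_t base = 0x80000000;	/* hypothetical RAM carveout */

		return memremap(base, SZ_1M, MEMREMAP_WB);
	}
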
diff --git a/kernel/panic.c b/kernel/panic.c
index 10effe40a3fa..07239d4ad81e 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -216,7 +216,7 @@ static void panic_print_sys_info(bool console_flush)
show_state();
if (panic_print & PANIC_PRINT_MEM_INFO)
- show_mem(0, NULL);
+ show_mem();
if (panic_print & PANIC_PRINT_TIMER_INFO)
sysrq_timer_list_show();
diff --git a/kernel/pid.c b/kernel/pid.c
index 6a1d23a11026..fee14a4486a3 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -83,6 +83,9 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
+#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0bf44afe04dd..619972c78774 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -110,9 +110,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
-
- initialize_memfd_noexec_scope(ns);
-
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
+#endif
return ns;
out_free_idr:
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
index b26e027fc9cd..2ee41a3a1dfd 100644
--- a/kernel/pid_sysctl.h
+++ b/kernel/pid_sysctl.h
@@ -5,33 +5,30 @@
#include <linux/pid_namespace.h>
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
-static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns)
-{
- ns->memfd_noexec_scope =
- task_active_pid_ns(current)->memfd_noexec_scope;
-}
-
static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table,
int write, void *buf, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *ns = task_active_pid_ns(current);
struct ctl_table table_copy;
+ int err, scope, parent_scope;
if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
table_copy = *table;
- if (ns != &init_pid_ns)
- table_copy.data = &ns->memfd_noexec_scope;
- /*
- * set minimum to current value, the effect is only bigger
- * value is accepted.
- */
- if (*(int *)table_copy.data > *(int *)table_copy.extra1)
- table_copy.extra1 = table_copy.data;
+ /* You cannot set a lower enforcement value than your parent. */
+ parent_scope = pidns_memfd_noexec_scope(ns->parent);
+ /* Equivalent to pidns_memfd_noexec_scope(ns). */
+ scope = max(READ_ONCE(ns->memfd_noexec_scope), parent_scope);
+
+ table_copy.data = &scope;
+ table_copy.extra1 = &parent_scope;
- return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+ err = proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+ if (!err && write)
+ WRITE_ONCE(ns->memfd_noexec_scope, scope);
+ return err;
}
static struct ctl_table pid_ns_ctl_table_vm[] = {
@@ -51,7 +48,6 @@ static inline void register_pid_ns_sysctl_table_vm(void)
register_sysctl("vm", pid_ns_ctl_table_vm);
}
#else
-static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {}
static inline void register_pid_ns_sysctl_table_vm(void) {}
#endif
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 4dd73cf936a6..ffb9d15bd815 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -75,6 +75,7 @@
#define MA_STATE_PREALLOC 4
#define ma_parent_ptr(x) ((struct maple_pnode *)(x))
+#define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT)
#define ma_mnode_ptr(x) ((struct maple_node *)(x))
#define ma_enode_ptr(x) ((struct maple_enode *)(x))
static struct kmem_cache *maple_node_cache;
@@ -729,33 +730,6 @@ mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset)
}
/*
- * mas_logical_pivot() - Get the logical pivot of a given offset.
- * @mas: The maple state
- * @pivots: The pointer to the maple node pivots
- * @offset: The offset into the pivot array
- * @type: The maple node type
- *
- * When there is no value at a pivot (beyond the end of the data), then the
- * pivot is actually @mas->max.
- *
- * Return: the logical pivot of a given @offset.
- */
-static inline unsigned long
-mas_logical_pivot(struct ma_state *mas, unsigned long *pivots,
- unsigned char offset, enum maple_type type)
-{
- unsigned long lpiv = mas_safe_pivot(mas, pivots, offset, type);
-
- if (likely(lpiv))
- return lpiv;
-
- if (likely(offset))
- return mas->max;
-
- return lpiv;
-}
-
-/*
* mte_set_pivot() - Set a pivot to a value in an encoded maple node.
* @mn: The encoded maple node
* @piv: The pivot offset
@@ -804,6 +778,12 @@ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt)
}
}
+static inline bool mt_write_locked(const struct maple_tree *mt)
+{
+ return mt_external_lock(mt) ? mt_write_lock_is_held(mt) :
+ lockdep_is_held(&mt->ma_lock);
+}
+
static inline bool mt_locked(const struct maple_tree *mt)
{
return mt_external_lock(mt) ? mt_lock_is_held(mt) :
@@ -819,7 +799,7 @@ static inline void *mt_slot(const struct maple_tree *mt,
static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots,
unsigned char offset)
{
- return rcu_dereference_protected(slots[offset], mt_locked(mt));
+ return rcu_dereference_protected(slots[offset], mt_write_locked(mt));
}
/*
* mas_slot_locked() - Get the slot value when holding the maple tree lock.
@@ -862,7 +842,7 @@ static inline void *mas_root(struct ma_state *mas)
static inline void *mt_root_locked(struct maple_tree *mt)
{
- return rcu_dereference_protected(mt->ma_root, mt_locked(mt));
+ return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt));
}
/*
@@ -1002,27 +982,9 @@ static inline void mat_add(struct ma_topiary *mat,
mat->tail = dead_enode;
}
-static void mte_destroy_walk(struct maple_enode *, struct maple_tree *);
-static inline void mas_free(struct ma_state *mas, struct maple_enode *used);
-
-/*
- * mas_mat_free() - Free all nodes in a dead list.
- * @mas - the maple state
- * @mat - the ma_topiary linked list of dead nodes to free.
- *
- * Free walk a dead list.
- */
-static void mas_mat_free(struct ma_state *mas, struct ma_topiary *mat)
-{
- struct maple_enode *next;
-
- while (mat->head) {
- next = mte_to_mat(mat->head)->next;
- mas_free(mas, mat->head);
- mat->head = next;
- }
-}
-
+static void mt_free_walk(struct rcu_head *head);
+static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
+ bool free);
/*
* mas_mat_destroy() - Free all nodes and subtrees in a dead list.
* @mas - the maple state
@@ -1033,10 +995,15 @@ static void mas_mat_free(struct ma_state *mas, struct ma_topiary *mat)
static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat)
{
struct maple_enode *next;
+ struct maple_node *node;
+ bool in_rcu = mt_in_rcu(mas->tree);
while (mat->head) {
next = mte_to_mat(mat->head)->next;
- mte_destroy_walk(mat->head, mat->mtree);
+ node = mte_to_node(mat->head);
+ mt_destroy_walk(mat->head, mas->tree, !in_rcu);
+ if (in_rcu)
+ call_rcu(&node->rcu, mt_free_walk);
mat->head = next;
}
}
@@ -1610,8 +1577,6 @@ ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt,
* mas_max_gap() - find the largest gap in a non-leaf node and set the slot.
* @mas: The maple state.
*
- * If the metadata gap is set to MAPLE_ARANGE64_META_MAX, there is no gap.
- *
* Return: The gap value.
*/
static inline unsigned long mas_max_gap(struct ma_state *mas)
@@ -1628,9 +1593,6 @@ static inline unsigned long mas_max_gap(struct ma_state *mas)
node = mas_mn(mas);
MAS_BUG_ON(mas, mt != maple_arange_64);
offset = ma_meta_gap(node, mt);
- if (offset == MAPLE_ARANGE64_META_MAX)
- return 0;
-
gaps = ma_gaps(node, mt);
return gaps[offset];
}
@@ -1662,10 +1624,7 @@ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset,
ascend:
MAS_BUG_ON(mas, pmt != maple_arange_64);
meta_offset = ma_meta_gap(pnode, pmt);
- if (meta_offset == MAPLE_ARANGE64_META_MAX)
- meta_gap = 0;
- else
- meta_gap = pgaps[meta_offset];
+ meta_gap = pgaps[meta_offset];
pgaps[offset] = new;
@@ -1678,7 +1637,6 @@ ascend:
ma_set_meta_gap(pnode, pmt, offset);
} else if (new < meta_gap) {
- meta_offset = 15;
new = ma_max_gap(pnode, pgaps, pmt, &meta_offset);
ma_set_meta_gap(pnode, pmt, meta_offset);
}
@@ -1731,7 +1689,7 @@ static inline void mas_adopt_children(struct ma_state *mas,
struct maple_enode *parent)
{
enum maple_type type = mte_node_type(parent);
- struct maple_node *node = mas_mn(mas);
+ struct maple_node *node = mte_to_node(parent);
void __rcu **slots = ma_slots(node, type);
unsigned long *pivots = ma_pivots(node, type);
struct maple_enode *child;
@@ -1745,53 +1703,54 @@ static inline void mas_adopt_children(struct ma_state *mas,
}
/*
- * mas_replace() - Replace a maple node in the tree with mas->node. Uses the
- * parent encoding to locate the maple node in the tree.
- * @mas - the ma_state to use for operations.
- * @advanced - boolean to adopt the child nodes and free the old node (false) or
- * leave the node (true) and handle the adoption and free elsewhere.
+ * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old
+ * node as dead.
+ * @mas - the maple state with the new node
+ * @old_enode - The old maple encoded node to replace.
*/
-static inline void mas_replace(struct ma_state *mas, bool advanced)
+static inline void mas_put_in_tree(struct ma_state *mas,
+ struct maple_enode *old_enode)
__must_hold(mas->tree->ma_lock)
{
- struct maple_node *mn = mas_mn(mas);
- struct maple_enode *old_enode;
- unsigned char offset = 0;
- void __rcu **slots = NULL;
-
- if (ma_is_root(mn)) {
- old_enode = mas_root_locked(mas);
- } else {
- offset = mte_parent_slot(mas->node);
- slots = ma_slots(mte_parent(mas->node),
- mas_parent_type(mas, mas->node));
- old_enode = mas_slot_locked(mas, slots, offset);
- }
-
- if (!advanced && !mte_is_leaf(mas->node))
- mas_adopt_children(mas, mas->node);
+ unsigned char offset;
+ void __rcu **slots;
if (mte_is_root(mas->node)) {
- mn->parent = ma_parent_ptr(
- ((unsigned long)mas->tree | MA_ROOT_PARENT));
+ mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas));
rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
mas_set_height(mas);
} else {
+
+ offset = mte_parent_slot(mas->node);
+ slots = ma_slots(mte_parent(mas->node),
+ mas_parent_type(mas, mas->node));
rcu_assign_pointer(slots[offset], mas->node);
}
- if (!advanced) {
- mte_set_node_dead(old_enode);
- mas_free(mas, old_enode);
- }
+ mte_set_node_dead(old_enode);
}
/*
- * mas_new_child() - Find the new child of a node.
- * @mas: the maple state
+ * mas_replace_node() - Replace a node by putting it in the tree, marking it
+ * dead, and freeing it. Uses the parent encoding to locate the maple node
+ * in the tree.
+ * @mas - the ma_state with @mas->node pointing to the new node.
+ * @old_enode - The old maple encoded node.
+ */
+static inline void mas_replace_node(struct ma_state *mas,
+ struct maple_enode *old_enode)
+ __must_hold(mas->tree->ma_lock)
+{
+ mas_put_in_tree(mas, old_enode);
+ mas_free(mas, old_enode);
+}
+
+/*
+ * mas_find_child() - Find a child who has the parent @mas->node.
+ * @mas: the maple state with the parent.
* @child: the maple state to store the child.
*/
-static inline bool mas_new_child(struct ma_state *mas, struct ma_state *child)
+static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child)
__must_hold(mas->tree->ma_lock)
{
enum maple_type mt;
@@ -2076,7 +2035,7 @@ static inline void mab_mas_cp(struct maple_big_node *b_node,
end = j - 1;
if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) {
unsigned long max_gap = 0;
- unsigned char offset = 15;
+ unsigned char offset = 0;
gaps = ma_gaps(node, mt);
do {
@@ -2094,56 +2053,6 @@ static inline void mab_mas_cp(struct maple_big_node *b_node,
}
/*
- * mas_descend_adopt() - Descend through a sub-tree and adopt children.
- * @mas: the maple state with the maple encoded node of the sub-tree.
- *
- * Descend through a sub-tree and adopt children who do not have the correct
- * parents set. Follow the parents which have the correct parents as they are
- * the new entries which need to be followed to find other incorrectly set
- * parents.
- */
-static inline void mas_descend_adopt(struct ma_state *mas)
-{
- struct ma_state list[3], next[3];
- int i, n;
-
- /*
- * At each level there may be up to 3 correct parent pointers which indicates
- * the new nodes which need to be walked to find any new nodes at a lower level.
- */
-
- for (i = 0; i < 3; i++) {
- list[i] = *mas;
- list[i].offset = 0;
- next[i].offset = 0;
- }
- next[0] = *mas;
-
- while (!mte_is_leaf(list[0].node)) {
- n = 0;
- for (i = 0; i < 3; i++) {
- if (mas_is_none(&list[i]))
- continue;
-
- if (i && list[i-1].node == list[i].node)
- continue;
-
- while ((n < 3) && (mas_new_child(&list[i], &next[n])))
- n++;
-
- mas_adopt_children(&list[i], list[i].node);
- }
-
- while (n < 3)
- next[n++].node = MAS_NONE;
-
- /* descend by setting the list to the children */
- for (i = 0; i < 3; i++)
- list[i] = next[i];
- }
-}
-
-/*
* mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert.
* @mas: The maple state
* @end: The maple node end
@@ -2211,7 +2120,7 @@ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas,
goto b_end;
/* Handle new range ending before old range ends */
- piv = mas_logical_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type);
+ piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type);
if (piv > mas->last) {
if (piv == ULONG_MAX)
mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type);
@@ -2333,98 +2242,6 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas)
}
/*
- * mas_topiary_range() - Add a range of slots to the topiary.
- * @mas: The maple state
- * @destroy: The topiary to add the slots (usually destroy)
- * @start: The starting slot inclusively
- * @end: The end slot inclusively
- */
-static inline void mas_topiary_range(struct ma_state *mas,
- struct ma_topiary *destroy, unsigned char start, unsigned char end)
-{
- void __rcu **slots;
- unsigned char offset;
-
- MAS_BUG_ON(mas, mte_is_leaf(mas->node));
-
- slots = ma_slots(mas_mn(mas), mte_node_type(mas->node));
- for (offset = start; offset <= end; offset++) {
- struct maple_enode *enode = mas_slot_locked(mas, slots, offset);
-
- if (mte_dead_node(enode))
- continue;
-
- mat_add(destroy, enode);
- }
-}
-
-/*
- * mast_topiary() - Add the portions of the tree to the removal list; either to
- * be freed or discarded (destroy walk).
- * @mast: The maple_subtree_state.
- */
-static inline void mast_topiary(struct maple_subtree_state *mast)
-{
- MA_WR_STATE(wr_mas, mast->orig_l, NULL);
- unsigned char r_start, r_end;
- unsigned char l_start, l_end;
- void __rcu **l_slots, **r_slots;
-
- wr_mas.type = mte_node_type(mast->orig_l->node);
- mast->orig_l->index = mast->orig_l->last;
- mas_wr_node_walk(&wr_mas);
- l_start = mast->orig_l->offset + 1;
- l_end = mas_data_end(mast->orig_l);
- r_start = 0;
- r_end = mast->orig_r->offset;
-
- if (r_end)
- r_end--;
-
- l_slots = ma_slots(mas_mn(mast->orig_l),
- mte_node_type(mast->orig_l->node));
-
- r_slots = ma_slots(mas_mn(mast->orig_r),
- mte_node_type(mast->orig_r->node));
-
- if ((l_start < l_end) &&
- mte_dead_node(mas_slot_locked(mast->orig_l, l_slots, l_start))) {
- l_start++;
- }
-
- if (mte_dead_node(mas_slot_locked(mast->orig_r, r_slots, r_end))) {
- if (r_end)
- r_end--;
- }
-
- if ((l_start > r_end) && (mast->orig_l->node == mast->orig_r->node))
- return;
-
- /* At the node where left and right sides meet, add the parts between */
- if (mast->orig_l->node == mast->orig_r->node) {
- return mas_topiary_range(mast->orig_l, mast->destroy,
- l_start, r_end);
- }
-
- /* mast->orig_r is different and consumed. */
- if (mte_is_leaf(mast->orig_r->node))
- return;
-
- if (mte_dead_node(mas_slot_locked(mast->orig_l, l_slots, l_end)))
- l_end--;
-
-
- if (l_start <= l_end)
- mas_topiary_range(mast->orig_l, mast->destroy, l_start, l_end);
-
- if (mte_dead_node(mas_slot_locked(mast->orig_r, r_slots, r_start)))
- r_start++;
-
- if (r_start <= r_end)
- mas_topiary_range(mast->orig_r, mast->destroy, 0, r_end);
-}
-
-/*
* mast_rebalance_next() - Rebalance against the next node
* @mast: The maple subtree state
* @old_r: The encoded maple node to the right (next node).
@@ -2459,7 +2276,7 @@ static inline void mast_rebalance_prev(struct maple_subtree_state *mast)
/*
* mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring
* the node to the right. Checking the nodes to the right then the left at each
- * level upwards until root is reached. Free and destroy as needed.
+ * level upwards until root is reached.
* Data is copied into the @mast->bn.
* @mast: The maple_subtree_state.
*/
@@ -2468,8 +2285,6 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast)
{
struct ma_state r_tmp = *mast->orig_r;
struct ma_state l_tmp = *mast->orig_l;
- struct maple_enode *ancestor = NULL;
- unsigned char start, end;
unsigned char depth = 0;
r_tmp = *mast->orig_r;
@@ -2478,87 +2293,25 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast)
mas_ascend(mast->orig_r);
mas_ascend(mast->orig_l);
depth++;
- if (!ancestor &&
- (mast->orig_r->node == mast->orig_l->node)) {
- ancestor = mast->orig_r->node;
- end = mast->orig_r->offset - 1;
- start = mast->orig_l->offset + 1;
- }
-
if (mast->orig_r->offset < mas_data_end(mast->orig_r)) {
- if (!ancestor) {
- ancestor = mast->orig_r->node;
- start = 0;
- }
-
mast->orig_r->offset++;
do {
mas_descend(mast->orig_r);
mast->orig_r->offset = 0;
- depth--;
- } while (depth);
+ } while (--depth);
mast_rebalance_next(mast);
- do {
- unsigned char l_off = 0;
- struct maple_enode *child = r_tmp.node;
-
- mas_ascend(&r_tmp);
- if (ancestor == r_tmp.node)
- l_off = start;
-
- if (r_tmp.offset)
- r_tmp.offset--;
-
- if (l_off < r_tmp.offset)
- mas_topiary_range(&r_tmp, mast->destroy,
- l_off, r_tmp.offset);
-
- if (l_tmp.node != child)
- mat_add(mast->free, child);
-
- } while (r_tmp.node != ancestor);
-
*mast->orig_l = l_tmp;
return true;
-
} else if (mast->orig_l->offset != 0) {
- if (!ancestor) {
- ancestor = mast->orig_l->node;
- end = mas_data_end(mast->orig_l);
- }
-
mast->orig_l->offset--;
do {
mas_descend(mast->orig_l);
mast->orig_l->offset =
mas_data_end(mast->orig_l);
- depth--;
- } while (depth);
+ } while (--depth);
mast_rebalance_prev(mast);
- do {
- unsigned char r_off;
- struct maple_enode *child = l_tmp.node;
-
- mas_ascend(&l_tmp);
- if (ancestor == l_tmp.node)
- r_off = end;
- else
- r_off = mas_data_end(&l_tmp);
-
- if (l_tmp.offset < r_off)
- l_tmp.offset++;
-
- if (l_tmp.offset < r_off)
- mas_topiary_range(&l_tmp, mast->destroy,
- l_tmp.offset, r_off);
-
- if (r_tmp.node != child)
- mat_add(mast->free, child);
-
- } while (l_tmp.node != ancestor);
-
*mast->orig_r = r_tmp;
return true;
}
@@ -2570,36 +2323,24 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast)
}
/*
- * mast_ascend_free() - Add current original maple state nodes to the free list
- * and ascend.
+ * mast_ascend() - Ascend the original left and right maple states.
* @mast: the maple subtree state.
*
- * Ascend the original left and right sides and add the previous nodes to the
- * free list. Set the slots to point to the correct location in the new nodes.
+ * Ascend the original left and right sides. Set the offsets to point to the
+ * data already in the new tree (@mast->l and @mast->r).
*/
-static inline void
-mast_ascend_free(struct maple_subtree_state *mast)
+static inline void mast_ascend(struct maple_subtree_state *mast)
{
MA_WR_STATE(wr_mas, mast->orig_r, NULL);
- struct maple_enode *left = mast->orig_l->node;
- struct maple_enode *right = mast->orig_r->node;
-
mas_ascend(mast->orig_l);
mas_ascend(mast->orig_r);
- mat_add(mast->free, left);
-
- if (left != right)
- mat_add(mast->free, right);
mast->orig_r->offset = 0;
mast->orig_r->index = mast->r->max;
/* last should be larger than or equal to index */
if (mast->orig_r->last < mast->orig_r->index)
mast->orig_r->last = mast->orig_r->index;
- /*
- * The node may not contain the value so set slot to ensure all
- * of the nodes contents are freed or destroyed.
- */
+
wr_mas.type = mte_node_type(mast->orig_r->node);
mas_wr_node_walk(&wr_mas);
/* Set up the left side of things */
@@ -2778,58 +2519,152 @@ static inline void mast_set_split_parents(struct maple_subtree_state *mast,
}
/*
- * mas_wmb_replace() - Write memory barrier and replace
- * @mas: The maple state
- * @free: the maple topiary list of nodes to free
- * @destroy: The maple topiary list of nodes to destroy (walk and free)
+ * mas_topiary_node() - Dispose of a single node
+ * @mas: The maple state for pushing nodes
+ * @enode: The encoded maple node
+ * @in_rcu: If the tree is in rcu mode
*
- * Updates gap as necessary.
+ * The node will either be RCU freed or pushed back on the maple state.
*/
-static inline void mas_wmb_replace(struct ma_state *mas,
- struct ma_topiary *free,
- struct ma_topiary *destroy)
+static inline void mas_topiary_node(struct ma_state *mas,
+ struct maple_enode *enode, bool in_rcu)
{
- /* All nodes must see old data as dead prior to replacing that data */
- smp_wmb(); /* Needed for RCU */
+ struct maple_node *tmp;
- /* Insert the new data in the tree */
- mas_replace(mas, true);
+ if (enode == MAS_NONE)
+ return;
- if (!mte_is_leaf(mas->node))
- mas_descend_adopt(mas);
+ tmp = mte_to_node(enode);
+ mte_set_node_dead(enode);
+ if (in_rcu)
+ ma_free_rcu(tmp);
+ else
+ mas_push_node(mas, tmp);
+}
- mas_mat_free(mas, free);
+/*
+ * mas_topiary_replace() - Replace the data with new data, then repair the
+ * parent links within the new tree. Iterate over the dead sub-tree and collect
+ * the dead subtrees and topiary the nodes that are no longer of use.
+ *
+ * The new tree will have up to three children with the correct parent. Keep
+ * track of the new entries as they need to be followed to find the next level
+ * of new entries.
+ *
+ * The old tree will have up to three children with the old parent. Keep track
+ * of the old entries as they may have more nodes below replaced. Nodes within
+ * [index, last] are dead subtrees, others need to be freed and followed.
+ *
+ * @mas: The maple state pointing at the new data
+ * @old_enode: The maple encoded node being replaced
+ *
+ */
+static inline void mas_topiary_replace(struct ma_state *mas,
+ struct maple_enode *old_enode)
+{
+ struct ma_state tmp[3], tmp_next[3];
+ MA_TOPIARY(subtrees, mas->tree);
+ bool in_rcu;
+ int i, n;
- if (destroy)
- mas_mat_destroy(mas, destroy);
+ /* Place data in tree & then mark node as old */
+ mas_put_in_tree(mas, old_enode);
- if (mte_is_leaf(mas->node))
- return;
+ /* Update the parent pointers in the tree */
+ tmp[0] = *mas;
+ tmp[0].offset = 0;
+ tmp[1].node = MAS_NONE;
+ tmp[2].node = MAS_NONE;
+ while (!mte_is_leaf(tmp[0].node)) {
+ n = 0;
+ for (i = 0; i < 3; i++) {
+ if (mas_is_none(&tmp[i]))
+ continue;
- mas_update_gap(mas);
+ while (n < 3) {
+ if (!mas_find_child(&tmp[i], &tmp_next[n]))
+ break;
+ n++;
+ }
+
+ mas_adopt_children(&tmp[i], tmp[i].node);
+ }
+
+ if (MAS_WARN_ON(mas, n == 0))
+ break;
+
+ while (n < 3)
+ tmp_next[n++].node = MAS_NONE;
+
+ for (i = 0; i < 3; i++)
+ tmp[i] = tmp_next[i];
+ }
+
+ /* Collect the old nodes that need to be discarded */
+ if (mte_is_leaf(old_enode))
+ return mas_free(mas, old_enode);
+
+ tmp[0] = *mas;
+ tmp[0].offset = 0;
+ tmp[0].node = old_enode;
+ tmp[1].node = MAS_NONE;
+ tmp[2].node = MAS_NONE;
+ in_rcu = mt_in_rcu(mas->tree);
+ do {
+ n = 0;
+ for (i = 0; i < 3; i++) {
+ if (mas_is_none(&tmp[i]))
+ continue;
+
+ while (n < 3) {
+ if (!mas_find_child(&tmp[i], &tmp_next[n]))
+ break;
+
+ if ((tmp_next[n].min >= tmp_next->index) &&
+ (tmp_next[n].max <= tmp_next->last)) {
+ mat_add(&subtrees, tmp_next[n].node);
+ tmp_next[n].node = MAS_NONE;
+ } else {
+ n++;
+ }
+ }
+ }
+
+ if (MAS_WARN_ON(mas, n == 0))
+ break;
+
+ while (n < 3)
+ tmp_next[n++].node = MAS_NONE;
+
+ for (i = 0; i < 3; i++) {
+ mas_topiary_node(mas, tmp[i].node, in_rcu);
+ tmp[i] = tmp_next[i];
+ }
+ } while (!mte_is_leaf(tmp[0].node));
+
+ for (i = 0; i < 3; i++)
+ mas_topiary_node(mas, tmp[i].node, in_rcu);
+
+ mas_mat_destroy(mas, &subtrees);
}
/*
- * mast_new_root() - Set a new tree root during subtree creation
- * @mast: The maple subtree state
+ * mas_wmb_replace() - Write memory barrier and replace
* @mas: The maple state
+ * @old: The old maple encoded node that is being replaced.
+ *
+ * Updates gap as necessary.
*/
-static inline void mast_new_root(struct maple_subtree_state *mast,
- struct ma_state *mas)
+static inline void mas_wmb_replace(struct ma_state *mas,
+ struct maple_enode *old_enode)
{
- mas_mn(mast->l)->parent =
- ma_parent_ptr(((unsigned long)mas->tree | MA_ROOT_PARENT));
- if (!mte_dead_node(mast->orig_l->node) &&
- !mte_is_root(mast->orig_l->node)) {
- do {
- mast_ascend_free(mast);
- mast_topiary(mast);
- } while (!mte_is_root(mast->orig_l->node));
- }
- if ((mast->orig_l->node != mas->node) &&
- (mast->l->depth > mas_mt_height(mas))) {
- mat_add(mast->free, mas->node);
- }
+ /* Insert the new data in the tree */
+ mas_topiary_replace(mas, old_enode);
+
+ if (mte_is_leaf(mas->node))
+ return;
+
+ mas_update_gap(mas);
}
/*
@@ -3015,12 +2850,11 @@ static int mas_spanning_rebalance(struct ma_state *mas,
unsigned char split, mid_split;
unsigned char slot = 0;
struct maple_enode *left = NULL, *middle = NULL, *right = NULL;
+ struct maple_enode *old_enode;
MA_STATE(l_mas, mas->tree, mas->index, mas->index);
MA_STATE(r_mas, mas->tree, mas->index, mas->last);
MA_STATE(m_mas, mas->tree, mas->index, mas->index);
- MA_TOPIARY(free, mas->tree);
- MA_TOPIARY(destroy, mas->tree);
/*
* The tree needs to be rebalanced and leaves need to be kept at the same level.
@@ -3029,8 +2863,6 @@ static int mas_spanning_rebalance(struct ma_state *mas,
mast->l = &l_mas;
mast->m = &m_mas;
mast->r = &r_mas;
- mast->free = &free;
- mast->destroy = &destroy;
l_mas.node = r_mas.node = m_mas.node = MAS_NONE;
/* Check if this is not root and has sufficient data. */
@@ -3038,7 +2870,7 @@ static int mas_spanning_rebalance(struct ma_state *mas,
unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type]))
mast_spanning_rebalance(mast);
- mast->orig_l->depth = 0;
+ l_mas.depth = 0;
/*
* Each level of the tree is examined and balanced, pushing data to the left or
@@ -3049,7 +2881,7 @@ static int mas_spanning_rebalance(struct ma_state *mas,
* original tree and the partially new tree. To remedy the parent pointers in
* the old tree, the new data is swapped into the active tree and a walk down
* the tree is performed and the parent pointers are updated.
- * See mas_descend_adopt() for more information..
+ * See mas_topiary_replace() for more information.
*/
while (count--) {
mast->bn->b_end--;
@@ -3066,13 +2898,13 @@ static int mas_spanning_rebalance(struct ma_state *mas,
*/
memset(mast->bn, 0, sizeof(struct maple_big_node));
mast->bn->type = mte_node_type(left);
- mast->orig_l->depth++;
+ l_mas.depth++;
/* Root already stored in l->node. */
if (mas_is_root_limits(mast->l))
goto new_root;
- mast_ascend_free(mast);
+ mast_ascend(mast);
mast_combine_cp_left(mast);
l_mas.offset = mast->bn->b_end;
mab_set_b_end(mast->bn, &l_mas, left);
@@ -3081,7 +2913,6 @@ static int mas_spanning_rebalance(struct ma_state *mas,
/* Copy anything necessary out of the right node. */
mast_combine_cp_right(mast);
- mast_topiary(mast);
mast->orig_l->last = mast->orig_l->max;
if (mast_sufficient(mast))
@@ -3103,7 +2934,7 @@ static int mas_spanning_rebalance(struct ma_state *mas,
l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)),
mte_node_type(mast->orig_l->node));
- mast->orig_l->depth++;
+ l_mas.depth++;
mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true);
mas_set_parent(mas, left, l_mas.node, slot);
if (middle)
@@ -3114,23 +2945,20 @@ static int mas_spanning_rebalance(struct ma_state *mas,
if (mas_is_root_limits(mast->l)) {
new_root:
- mast_new_root(mast, mas);
+ mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas));
+ while (!mte_is_root(mast->orig_l->node))
+ mast_ascend(mast);
} else {
mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent;
}
- if (!mte_dead_node(mast->orig_l->node))
- mat_add(&free, mast->orig_l->node);
-
- mas->depth = mast->orig_l->depth;
- *mast->orig_l = l_mas;
- mte_set_node_dead(mas->node);
-
- /* Set up mas for insertion. */
- mast->orig_l->depth = mas->depth;
- mast->orig_l->alloc = mas->alloc;
- *mas = *mast->orig_l;
- mas_wmb_replace(mas, &free, &destroy);
+ old_enode = mast->orig_l->node;
+ mas->depth = l_mas.depth;
+ mas->node = l_mas.node;
+ mas->min = l_mas.min;
+ mas->max = l_mas.max;
+ mas->offset = l_mas.offset;
+ mas_wmb_replace(mas, old_enode);
mtree_range_walk(mas);
return mast->bn->b_end;
}
@@ -3166,7 +2994,7 @@ static inline int mas_rebalance(struct ma_state *mas,
* tries to combine the data in the same way. If one node contains the
* entire range of the tree, then that node is used as a new root node.
*/
- mas_node_count(mas, 1 + empty_count * 3);
+ mas_node_count(mas, empty_count * 2 - 1);
if (mas_is_err(mas))
return 0;
@@ -3206,7 +3034,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end
{
enum maple_type mt = mte_node_type(mas->node);
struct maple_node reuse, *newnode, *parent, *new_left, *left, *node;
- struct maple_enode *eparent;
+ struct maple_enode *eparent, *old_eparent;
unsigned char offset, tmp, split = mt_slots[mt] / 2;
void __rcu **l_slots, **slots;
unsigned long *l_pivs, *pivs, gap;
@@ -3248,7 +3076,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end
l_mas.max = l_pivs[split];
mas->min = l_mas.max + 1;
- eparent = mt_mk_node(mte_parent(l_mas.node),
+ old_eparent = mt_mk_node(mte_parent(l_mas.node),
mas_parent_type(&l_mas, l_mas.node));
tmp += end;
if (!in_rcu) {
@@ -3264,7 +3092,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end
memcpy(node, newnode, sizeof(struct maple_node));
ma_set_meta(node, mt, 0, tmp - 1);
- mte_set_pivot(eparent, mte_parent_slot(l_mas.node),
+ mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node),
l_pivs[split]);
/* Remove data from l_pivs. */
@@ -3272,6 +3100,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end
memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp));
memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp));
ma_set_meta(left, mt, 0, split);
+ eparent = old_eparent;
goto done;
}
@@ -3296,7 +3125,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end
parent = mas_pop_node(mas);
slots = ma_slots(parent, mt);
pivs = ma_pivots(parent, mt);
- memcpy(parent, mte_to_node(eparent), sizeof(struct maple_node));
+ memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node));
rcu_assign_pointer(slots[offset], mas->node);
rcu_assign_pointer(slots[offset - 1], l_mas.node);
pivs[offset - 1] = l_mas.max;
@@ -3308,8 +3137,10 @@ done:
mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap);
mas_ascend(mas);
- if (in_rcu)
- mas_replace(mas, false);
+ if (in_rcu) {
+ mas_replace_node(mas, old_eparent);
+ mas_adopt_children(mas, mas->node);
+ }
mas_update_gap(mas);
}
@@ -3358,7 +3189,6 @@ static inline void mast_fill_bnode(struct maple_subtree_state *mast,
unsigned char skip)
{
bool cp = true;
- struct maple_enode *old = mas->node;
unsigned char split;
memset(mast->bn->gap, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->gap));
@@ -3370,7 +3200,6 @@ static inline void mast_fill_bnode(struct maple_subtree_state *mast,
cp = false;
} else {
mas_ascend(mas);
- mat_add(mast->free, old);
mas->offset = mte_parent_slot(mas->node);
}
@@ -3474,13 +3303,11 @@ static inline bool mas_push_data(struct ma_state *mas, int height,
split = mt_slots[mast->bn->type] - 2;
if (left) {
/* Switch mas to prev node */
- mat_add(mast->free, mas->node);
*mas = tmp_mas;
/* Start using mast->l for the left side. */
tmp_mas.node = mast->l->node;
*mast->l = tmp_mas;
} else {
- mat_add(mast->free, tmp_mas.node);
tmp_mas.node = mast->r->node;
*mast->r = tmp_mas;
split = slot_total - split;
@@ -3507,6 +3334,7 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node)
struct maple_subtree_state mast;
int height = 0;
unsigned char mid_split, split = 0;
+ struct maple_enode *old;
/*
* Splitting is handled differently from any other B-tree; the Maple
@@ -3529,7 +3357,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node)
MA_STATE(r_mas, mas->tree, mas->index, mas->last);
MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last);
MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last);
- MA_TOPIARY(mat, mas->tree);
trace_ma_op(__func__, mas);
mas->depth = mas_mt_height(mas);
@@ -3542,7 +3369,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node)
mast.r = &r_mas;
mast.orig_l = &prev_l_mas;
mast.orig_r = &prev_r_mas;
- mast.free = &mat;
mast.bn = b_node;
while (height++ <= mas->depth) {
@@ -3582,9 +3408,9 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node)
}
/* Set the original node as dead */
- mat_add(mast.free, mas->node);
+ old = mas->node;
mas->node = l_mas.node;
- mas_wmb_replace(mas, mast.free, NULL);
+ mas_wmb_replace(mas, old);
mtree_range_walk(mas);
return 1;
}
@@ -3626,11 +3452,13 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas,
struct maple_big_node *b_node, unsigned char end)
{
struct maple_node *node;
+ struct maple_enode *old_enode;
unsigned char b_end = b_node->b_end;
enum maple_type b_type = b_node->type;
+ old_enode = wr_mas->mas->node;
if ((b_end < mt_min_slots[b_type]) &&
- (!mte_is_root(wr_mas->mas->node)) &&
+ (!mte_is_root(old_enode)) &&
(mas_mt_height(wr_mas->mas) > 1))
return mas_rebalance(wr_mas->mas, b_node);
@@ -3648,7 +3476,7 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas,
node->parent = mas_mn(wr_mas->mas)->parent;
wr_mas->mas->node = mt_mk_node(node, b_type);
mab_mas_cp(b_node, 0, b_end, wr_mas->mas, false);
- mas_replace(wr_mas->mas, false);
+ mas_replace_node(wr_mas->mas, old_enode);
reuse_node:
mas_update_gap(wr_mas->mas);
return 1;
@@ -3675,8 +3503,7 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry)
node = mas_pop_node(mas);
pivots = ma_pivots(node, type);
slots = ma_slots(node, type);
- node->parent = ma_parent_ptr(
- ((unsigned long)mas->tree | MA_ROOT_PARENT));
+ node->parent = ma_parent_ptr(mas_tree_parent(mas));
mas->node = mt_mk_node(node, type);
if (mas->index) {
@@ -3919,6 +3746,7 @@ dead_node:
return NULL;
}
+static void mte_destroy_walk(struct maple_enode *, struct maple_tree *);
/*
* mas_new_root() - Create a new root node that only contains the entry passed
* in.
@@ -3952,8 +3780,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry)
node = mas_pop_node(mas);
pivots = ma_pivots(node, type);
slots = ma_slots(node, type);
- node->parent = ma_parent_ptr(
- ((unsigned long)mas->tree | MA_ROOT_PARENT));
+ node->parent = ma_parent_ptr(mas_tree_parent(mas));
mas->node = mt_mk_node(node, type);
rcu_assign_pointer(slots[0], entry);
pivots[0] = mas->last;
@@ -3986,7 +3813,6 @@ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas)
/* Left and Right side of spanning store */
MA_STATE(l_mas, NULL, 0, 0);
MA_STATE(r_mas, NULL, 0, 0);
-
MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry);
MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry);
@@ -4147,9 +3973,10 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas,
done:
mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end);
if (in_rcu) {
- mte_set_node_dead(mas->node);
+ struct maple_enode *old_enode = mas->node;
+
mas->node = mt_mk_node(newnode, wr_mas->type);
- mas_replace(mas, false);
+ mas_replace_node(mas, old_enode);
} else {
memcpy(wr_mas->node, newnode, sizeof(struct maple_node));
}
@@ -4168,23 +3995,35 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas)
{
struct ma_state *mas = wr_mas->mas;
unsigned char offset = mas->offset;
+ void __rcu **slots = wr_mas->slots;
bool gap = false;
- if (wr_mas->offset_end - offset != 1)
- return false;
+ gap |= !mt_slot_locked(mas->tree, slots, offset);
+ gap |= !mt_slot_locked(mas->tree, slots, offset + 1);
- gap |= !mt_slot_locked(mas->tree, wr_mas->slots, offset);
- gap |= !mt_slot_locked(mas->tree, wr_mas->slots, offset + 1);
-
- if (mas->index == wr_mas->r_min) {
- /* Overwriting the range and over a part of the next range. */
- rcu_assign_pointer(wr_mas->slots[offset], wr_mas->entry);
- wr_mas->pivots[offset] = mas->last;
- } else {
- /* Overwriting a part of the range and over the next range */
- rcu_assign_pointer(wr_mas->slots[offset + 1], wr_mas->entry);
+ if (wr_mas->offset_end - offset == 1) {
+ if (mas->index == wr_mas->r_min) {
+ /* Overwriting the range and a part of the next one */
+ rcu_assign_pointer(slots[offset], wr_mas->entry);
+ wr_mas->pivots[offset] = mas->last;
+ } else {
+ /* Overwriting a part of the range and the next one */
+ rcu_assign_pointer(slots[offset + 1], wr_mas->entry);
+ wr_mas->pivots[offset] = mas->index - 1;
+ mas->offset++; /* Keep mas accurate. */
+ }
+ } else if (!mt_in_rcu(mas->tree)) {
+ /*
+ * Expand the range, only partially overwriting the previous and
+ * next ranges
+ */
+ gap |= !mt_slot_locked(mas->tree, slots, offset + 2);
+ rcu_assign_pointer(slots[offset + 1], wr_mas->entry);
wr_mas->pivots[offset] = mas->index - 1;
+ wr_mas->pivots[offset + 1] = mas->last;
mas->offset++; /* Keep mas accurate. */
+ } else {
+ return false;
}
trace_ma_write(__func__, mas, 0, wr_mas->entry);
@@ -4198,18 +4037,6 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas)
return true;
}
-static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas)
-{
- while ((wr_mas->offset_end < wr_mas->node_end) &&
- (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end]))
- wr_mas->offset_end++;
-
- if (wr_mas->offset_end < wr_mas->node_end)
- wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end];
- else
- wr_mas->end_piv = wr_mas->mas->max;
-}
-
static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas)
{
struct ma_state *mas = wr_mas->mas;
@@ -4246,6 +4073,21 @@ static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas)
}
}
+static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas)
+{
+ while ((wr_mas->offset_end < wr_mas->node_end) &&
+ (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end]))
+ wr_mas->offset_end++;
+
+ if (wr_mas->offset_end < wr_mas->node_end)
+ wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end];
+ else
+ wr_mas->end_piv = wr_mas->mas->max;
+
+ if (!wr_mas->entry)
+ mas_wr_extend_null(wr_mas);
+}
+
static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas)
{
struct ma_state *mas = wr_mas->mas;
@@ -4267,10 +4109,10 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas)
*
* Return: True if appended, false otherwise
*/
-static inline bool mas_wr_append(struct ma_wr_state *wr_mas)
+static inline bool mas_wr_append(struct ma_wr_state *wr_mas,
+ unsigned char new_end)
{
unsigned char end = wr_mas->node_end;
- unsigned char new_end = end + 1;
struct ma_state *mas = wr_mas->mas;
unsigned char node_pivots = mt_pivots[wr_mas->type];
@@ -4282,16 +4124,27 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas)
ma_set_meta(wr_mas->node, maple_leaf_64, 0, new_end);
}
- if (mas->last == wr_mas->r_max) {
- /* Append to end of range */
- rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->entry);
- wr_mas->pivots[end] = mas->index - 1;
- mas->offset = new_end;
+ if (new_end == wr_mas->node_end + 1) {
+ if (mas->last == wr_mas->r_max) {
+ /* Append to end of range */
+ rcu_assign_pointer(wr_mas->slots[new_end],
+ wr_mas->entry);
+ wr_mas->pivots[end] = mas->index - 1;
+ mas->offset = new_end;
+ } else {
+ /* Append to start of range */
+ rcu_assign_pointer(wr_mas->slots[new_end],
+ wr_mas->content);
+ wr_mas->pivots[end] = mas->last;
+ rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry);
+ }
} else {
- /* Append to start of range */
+ /* Append to the range without touching any boundaries. */
rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->content);
- wr_mas->pivots[end] = mas->last;
- rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry);
+ wr_mas->pivots[end + 1] = mas->last;
+ rcu_assign_pointer(wr_mas->slots[end + 1], wr_mas->entry);
+ wr_mas->pivots[end] = mas->index - 1;
+ mas->offset = end + 1;
}
if (!wr_mas->content || !wr_mas->entry)
@@ -4338,7 +4191,7 @@ static inline void mas_wr_modify(struct ma_wr_state *wr_mas)
goto slow_path;
/* Attempt to append */
- if (new_end == wr_mas->node_end + 1 && mas_wr_append(wr_mas))
+ if (mas_wr_append(wr_mas, new_end))
return;
if (new_end == wr_mas->node_end && mas_wr_slot_store(wr_mas))
@@ -4378,10 +4231,6 @@ static inline void *mas_wr_store_entry(struct ma_wr_state *wr_mas)
/* At this point, we are at the leaf node that needs to be altered. */
mas_wr_end_piv(wr_mas);
-
- if (!wr_mas->entry)
- mas_wr_extend_null(wr_mas);
-
/* New root for a single pointer */
if (unlikely(!mas->index && mas->last == ULONG_MAX)) {
mas_new_root(mas, wr_mas->entry);
@@ -4921,7 +4770,7 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size)
min = mas_safe_min(mas, pivots, offset);
data_end = ma_data_end(node, type, pivots, mas->max);
for (; offset <= data_end; offset++) {
- pivot = mas_logical_pivot(mas, pivots, offset, type);
+ pivot = mas_safe_pivot(mas, pivots, offset, type);
/* Not within lower bounds */
if (mas->index > pivot)
@@ -5432,19 +5281,34 @@ static inline void mte_destroy_walk(struct maple_enode *enode,
static void mas_wr_store_setup(struct ma_wr_state *wr_mas)
{
+ if (mas_is_start(wr_mas->mas))
+ return;
+
if (unlikely(mas_is_paused(wr_mas->mas)))
- mas_reset(wr_mas->mas);
+ goto reset;
- if (!mas_is_start(wr_mas->mas)) {
- if (mas_is_none(wr_mas->mas)) {
- mas_reset(wr_mas->mas);
- } else {
- wr_mas->r_max = wr_mas->mas->max;
- wr_mas->type = mte_node_type(wr_mas->mas->node);
- if (mas_is_span_wr(wr_mas))
- mas_reset(wr_mas->mas);
- }
- }
+ if (unlikely(mas_is_none(wr_mas->mas)))
+ goto reset;
+
+ /*
+ * A less strict version of mas_is_span_wr() where we allow spanning
+	 * writes within this node. This is to stop partial walks in
+	 * mas_preallocate() from being reset.
+ */
+ if (wr_mas->mas->last > wr_mas->mas->max)
+ goto reset;
+
+ if (wr_mas->entry)
+ return;
+
+ if (mte_is_leaf(wr_mas->mas->node) &&
+ wr_mas->mas->last == wr_mas->mas->max)
+ goto reset;
+
+ return;
+
+reset:
+ mas_reset(wr_mas->mas);
}
/* Interface */
@@ -5536,15 +5400,58 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc);
/**
* mas_preallocate() - Preallocate enough nodes for a store operation
* @mas: The maple state
+ * @entry: The entry that will be stored
* @gfp: The GFP_FLAGS to use for allocations.
*
* Return: 0 on success, -ENOMEM if memory could not be allocated.
*/
-int mas_preallocate(struct ma_state *mas, gfp_t gfp)
+int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
{
+ MA_WR_STATE(wr_mas, mas, entry);
+ unsigned char node_size;
+ int request = 1;
int ret;
- mas_node_count_gfp(mas, 1 + mas_mt_height(mas) * 3, gfp);
+
+ if (unlikely(!mas->index && mas->last == ULONG_MAX))
+ goto ask_now;
+
+ mas_wr_store_setup(&wr_mas);
+ wr_mas.content = mas_start(mas);
+ /* Root expand */
+ if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
+ goto ask_now;
+
+ if (unlikely(!mas_wr_walk(&wr_mas))) {
+ /* Spanning store, use worst case for now */
+ request = 1 + mas_mt_height(mas) * 3;
+ goto ask_now;
+ }
+
+ /* At this point, we are at the leaf node that needs to be altered. */
+ /* Exact fit, no nodes needed. */
+ if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last)
+ return 0;
+
+ mas_wr_end_piv(&wr_mas);
+ node_size = mas_wr_new_end(&wr_mas);
+ if (node_size >= mt_slots[wr_mas.type]) {
+ /* Split, worst case for now. */
+ request = 1 + mas_mt_height(mas) * 2;
+ goto ask_now;
+ }
+
+	/* New root needs a single node */
+ if (unlikely(mte_is_root(mas->node)))
+ goto ask_now;
+
+ /* Potential spanning rebalance collapsing a node, use worst-case */
+ if (node_size - 1 <= mt_min_slots[wr_mas.type])
+ request = mas_mt_height(mas) * 2 - 1;
+
+ /* node store, slot store needs one node */
+ask_now:
+ mas_node_count_gfp(mas, request, gfp);
mas->mas_flags |= MA_STATE_PREALLOC;
if (likely(!mas_is_err(mas)))
return 0;
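[Editorial aside: a minimal usage sketch of the reworked mas_preallocate() signature, assuming the caller already holds the tree lock; 'my_tree', 'index', 'last' and 'ptr' are placeholders and not part of this patch. Passing the entry lets the preallocation be sized for the actual store instead of always assuming the worst case.]

	MA_STATE(mas, &my_tree, index, last);

	/* Size the allocation for this specific store of 'ptr' over [index, last]. */
	if (mas_preallocate(&mas, ptr, GFP_KERNEL) != 0)
		return -ENOMEM;

	/* The store itself can no longer fail on allocation. */
	mas_store_prealloc(&mas, ptr);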
@@ -5750,7 +5657,11 @@ EXPORT_SYMBOL_GPL(mas_next_range);
* @index: The start index
* @max: The maximum index to check
*
- * Return: The entry at @index or higher, or %NULL if nothing is found.
+ * Takes the RCU read lock internally to protect the search; the returned
+ * pointer is no longer protected once that lock is dropped.
+ * See also: Documentation/core-api/maple_tree.rst
+ *
+ * Return: The entry higher than @index or %NULL if nothing is found.
*/
void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max)
{
@@ -5856,7 +5767,11 @@ EXPORT_SYMBOL_GPL(mas_prev_range);
* @index: The start index
* @min: The minimum index to check
*
- * Return: The entry at @index or lower, or %NULL if nothing is found.
+ * Takes the RCU read lock internally to protect the search; the returned
+ * pointer is no longer protected once that lock is dropped.
+ * See also: Documentation/core-api/maple_tree.rst
+ *
+ * Return: The entry before @index or %NULL if nothing is found.
*/
void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min)
{
@@ -6279,7 +6194,7 @@ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry,
EXPORT_SYMBOL(mtree_store);
/**
- * mtree_insert_range() - Insert an entry at a give range if there is no value.
+ * mtree_insert_range() - Insert an entry at a given range if there is no value.
* @mt: The maple tree
* @first: The start of the range
* @last: The end of the range
@@ -6315,11 +6230,11 @@ retry:
EXPORT_SYMBOL(mtree_insert_range);
/**
- * mtree_insert() - Insert an entry at a give index if there is no value.
+ * mtree_insert() - Insert an entry at a given index if there is no value.
* @mt: The maple tree
* @index : The index to store the value
* @entry: The entry to store
- * @gfp: The FGP_FLAGS to use for allocations.
+ * @gfp: The GFP_FLAGS to use for allocations.
*
* Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid
* request, -ENOMEM if memory could not be allocated.
@@ -6468,9 +6383,15 @@ EXPORT_SYMBOL(mtree_destroy);
* mt_find() - Search from the start up until an entry is found.
* @mt: The maple tree
* @index: Pointer which contains the start location of the search
- * @max: The maximum value to check
+ * @max: The maximum value of the search range
*
- * Handles locking. @index will be incremented to one beyond the range.
+ * Takes the RCU read lock internally to protect the search; the returned
+ * pointer is no longer protected once that lock is dropped.
+ * See also: Documentation/core-api/maple_tree.rst
+ *
+ * If an entry is found, @index is updated to point to the next possible
+ * entry, independent of whether the found entry occupies a single index
+ * or a range of indices.
*
* Return: The entry at or after the @index or %NULL
*/
@@ -6528,7 +6449,9 @@ EXPORT_SYMBOL(mt_find);
* @index: Pointer which contains the start location of the search
* @max: The maximum value to check
*
- * Handles locking, detects wrapping on index == 0
+ * Same as mt_find() except that it checks @index for 0 before
+ * searching. If @index == 0, the search is aborted. This covers a wrap
+ * around of @index to 0 in an iterator loop.
*
* Return: The entry at or after the @index or %NULL
*/
@@ -6633,78 +6556,6 @@ static inline struct maple_enode *mas_get_slot(struct ma_state *mas,
offset);
}
-
-/*
- * mas_first_entry() - Go the first leaf and find the first entry.
- * @mas: the maple state.
- * @limit: the maximum index to check.
- * @*r_start: Pointer to set to the range start.
- *
- * Sets mas->offset to the offset of the entry, r_start to the range minimum.
- *
- * Return: The first entry or MAS_NONE.
- */
-static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn,
- unsigned long limit, enum maple_type mt)
-
-{
- unsigned long max;
- unsigned long *pivots;
- void __rcu **slots;
- void *entry = NULL;
-
- mas->index = mas->min;
- if (mas->index > limit)
- goto none;
-
- max = mas->max;
- mas->offset = 0;
- while (likely(!ma_is_leaf(mt))) {
- MAS_WARN_ON(mas, mte_dead_node(mas->node));
- slots = ma_slots(mn, mt);
- entry = mas_slot(mas, slots, 0);
- pivots = ma_pivots(mn, mt);
- if (unlikely(ma_dead_node(mn)))
- return NULL;
- max = pivots[0];
- mas->node = entry;
- mn = mas_mn(mas);
- mt = mte_node_type(mas->node);
- }
- MAS_WARN_ON(mas, mte_dead_node(mas->node));
-
- mas->max = max;
- slots = ma_slots(mn, mt);
- entry = mas_slot(mas, slots, 0);
- if (unlikely(ma_dead_node(mn)))
- return NULL;
-
- /* Slot 0 or 1 must be set */
- if (mas->index > limit)
- goto none;
-
- if (likely(entry))
- return entry;
-
- mas->offset = 1;
- entry = mas_slot(mas, slots, 1);
- pivots = ma_pivots(mn, mt);
- if (unlikely(ma_dead_node(mn)))
- return NULL;
-
- mas->index = pivots[0] + 1;
- if (mas->index > limit)
- goto none;
-
- if (likely(entry))
- return entry;
-
-none:
- if (likely(!ma_dead_node(mn)))
- mas->node = MAS_NONE;
- return NULL;
-}
-
/* Depth first search, post-order */
static void mas_dfs_postorder(struct ma_state *mas, unsigned long max)
{
@@ -6839,11 +6690,27 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry,
int i;
pr_cont(" contents: ");
- for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++)
- pr_cont("%lu ", node->gap[i]);
+ for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
+ switch (format) {
+ case mt_dump_hex:
+ pr_cont("%lx ", node->gap[i]);
+ break;
+ default:
+ case mt_dump_dec:
+ pr_cont("%lu ", node->gap[i]);
+ }
+ }
pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap);
- for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++)
- pr_cont("%p %lu ", node->slot[i], node->pivot[i]);
+ for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) {
+ switch (format) {
+ case mt_dump_hex:
+ pr_cont("%p %lX ", node->slot[i], node->pivot[i]);
+ break;
+ default:
+ case mt_dump_dec:
+ pr_cont("%p %lu ", node->slot[i], node->pivot[i]);
+ }
+ }
pr_cont("%p\n", node->slot[i]);
for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
unsigned long last = max;
@@ -6927,15 +6794,16 @@ EXPORT_SYMBOL_GPL(mt_dump);
static void mas_validate_gaps(struct ma_state *mas)
{
struct maple_enode *mte = mas->node;
- struct maple_node *p_mn;
+ struct maple_node *p_mn, *node = mte_to_node(mte);
+ enum maple_type mt = mte_node_type(mas->node);
unsigned long gap = 0, max_gap = 0;
unsigned long p_end, p_start = mas->min;
- unsigned char p_slot;
+ unsigned char p_slot, offset;
unsigned long *gaps = NULL;
- unsigned long *pivots = ma_pivots(mte_to_node(mte), mte_node_type(mte));
- int i;
+ unsigned long *pivots = ma_pivots(node, mt);
+ unsigned int i;
- if (ma_is_dense(mte_node_type(mte))) {
+ if (ma_is_dense(mt)) {
for (i = 0; i < mt_slot_count(mte); i++) {
if (mas_get_slot(mas, i)) {
if (gap > max_gap)
@@ -6948,52 +6816,59 @@ static void mas_validate_gaps(struct ma_state *mas)
goto counted;
}
- gaps = ma_gaps(mte_to_node(mte), mte_node_type(mte));
+ gaps = ma_gaps(node, mt);
for (i = 0; i < mt_slot_count(mte); i++) {
- p_end = mas_logical_pivot(mas, pivots, i, mte_node_type(mte));
+ p_end = mas_safe_pivot(mas, pivots, i, mt);
if (!gaps) {
- if (mas_get_slot(mas, i)) {
- gap = 0;
- goto not_empty;
- }
-
- gap += p_end - p_start + 1;
+ if (!mas_get_slot(mas, i))
+ gap = p_end - p_start + 1;
} else {
void *entry = mas_get_slot(mas, i);
gap = gaps[i];
- if (!entry) {
- if (gap != p_end - p_start + 1) {
- pr_err("%p[%u] -> %p %lu != %lu - %lu + 1\n",
- mas_mn(mas), i,
- mas_get_slot(mas, i), gap,
- p_end, p_start);
- mt_dump(mas->tree, mt_dump_hex);
-
- MT_BUG_ON(mas->tree,
- gap != p_end - p_start + 1);
- }
- } else {
- if (gap > p_end - p_start + 1) {
- pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n",
- mas_mn(mas), i, gap, p_end, p_start,
- p_end - p_start + 1);
- MT_BUG_ON(mas->tree,
- gap > p_end - p_start + 1);
- }
+ MT_BUG_ON(mas->tree, !entry);
+
+ if (gap > p_end - p_start + 1) {
+ pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n",
+ mas_mn(mas), i, gap, p_end, p_start,
+ p_end - p_start + 1);
+ MT_BUG_ON(mas->tree, gap > p_end - p_start + 1);
}
}
if (gap > max_gap)
max_gap = gap;
-not_empty:
+
p_start = p_end + 1;
if (p_end >= mas->max)
break;
}
counted:
+ if (mt == maple_arange_64) {
+ offset = ma_meta_gap(node, mt);
+ if (offset > i) {
+ pr_err("gap offset %p[%u] is invalid\n", node, offset);
+ MT_BUG_ON(mas->tree, 1);
+ }
+
+ if (gaps[offset] != max_gap) {
+ pr_err("gap %p[%u] is not the largest gap %lu\n",
+ node, offset, max_gap);
+ MT_BUG_ON(mas->tree, 1);
+ }
+
+ MT_BUG_ON(mas->tree, !gaps);
+ for (i++ ; i < mt_slot_count(mte); i++) {
+ if (gaps[i] != 0) {
+ pr_err("gap %p[%u] beyond node limit != 0\n",
+ node, i);
+ MT_BUG_ON(mas->tree, 1);
+ }
+ }
+ }
+
if (mte_is_root(mte))
return;
@@ -7003,10 +6878,8 @@ counted:
if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) {
pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap);
mt_dump(mas->tree, mt_dump_hex);
+ MT_BUG_ON(mas->tree, 1);
}
-
- MT_BUG_ON(mas->tree,
- ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap);
}
static void mas_validate_parent_slot(struct ma_state *mas)
@@ -7057,11 +6930,12 @@ static void mas_validate_child_slot(struct ma_state *mas)
for (i = 0; i < mt_slots[type]; i++) {
child = mas_slot(mas, slots, i);
- if (!pivots[i] || pivots[i] == mas->max)
- break;
- if (!child)
- break;
+ if (!child) {
+ pr_err("Non-leaf node lacks child at %p[%u]\n",
+ mas_mn(mas), i);
+ MT_BUG_ON(mas->tree, 1);
+ }
if (mte_parent_slot(child) != i) {
pr_err("Slot error at %p[%u]: child %p has pslot %u\n",
@@ -7076,11 +6950,16 @@ static void mas_validate_child_slot(struct ma_state *mas)
mte_to_node(mas->node));
MT_BUG_ON(mas->tree, 1);
}
+
+ if (i < mt_pivots[type] && pivots[i] == mas->max)
+ break;
}
}
/*
- * Validate all pivots are within mas->min and mas->max.
+ * Validate that all pivots are within mas->min and mas->max, that the
+ * metadata ends where the maximum ends, and that no slots or pivots are set
+ * outside of the end of the data.
*/
static void mas_validate_limits(struct ma_state *mas)
{
@@ -7090,26 +6969,15 @@ static void mas_validate_limits(struct ma_state *mas)
void __rcu **slots = ma_slots(mte_to_node(mas->node), type);
unsigned long *pivots = ma_pivots(mas_mn(mas), type);
- /* all limits are fine here. */
- if (mte_is_root(mas->node))
- return;
-
for (i = 0; i < mt_slots[type]; i++) {
unsigned long piv;
piv = mas_safe_pivot(mas, pivots, i, type);
- if (!piv && (i != 0))
- break;
-
- if (!mte_is_leaf(mas->node)) {
- void *entry = mas_slot(mas, slots, i);
-
- if (!entry)
- pr_err("%p[%u] cannot be null\n",
- mas_mn(mas), i);
-
- MT_BUG_ON(mas->tree, !entry);
+ if (!piv && (i != 0)) {
+ pr_err("Missing node limit pivot at %p[%u]",
+ mas_mn(mas), i);
+ MAS_WARN_ON(mas, 1);
}
if (prev_piv > piv) {
@@ -7132,6 +7000,13 @@ static void mas_validate_limits(struct ma_state *mas)
if (piv == mas->max)
break;
}
+
+ if (mas_data_end(mas) != i) {
+ pr_err("node%p: data_end %u != the last slot offset %u\n",
+ mas_mn(mas), mas_data_end(mas), i);
+ MT_BUG_ON(mas->tree, 1);
+ }
+
for (i += 1; i < mt_slots[type]; i++) {
void *entry = mas_slot(mas, slots, i);
@@ -7206,21 +7081,20 @@ void mt_validate(struct maple_tree *mt)
if (!mas_searchable(&mas))
goto done;
- mas_first_entry(&mas, mas_mn(&mas), ULONG_MAX, mte_node_type(mas.node));
+ while (!mte_is_leaf(mas.node))
+ mas_descend(&mas);
+
while (!mas_is_none(&mas)) {
MAS_WARN_ON(&mas, mte_dead_node(mas.node));
- if (!mte_is_root(mas.node)) {
- end = mas_data_end(&mas);
- if (MAS_WARN_ON(&mas,
- (end < mt_min_slot_count(mas.node)) &&
- (mas.max != ULONG_MAX))) {
- pr_err("Invalid size %u of %p\n", end,
- mas_mn(&mas));
- }
+ end = mas_data_end(&mas);
+ if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) &&
+ (mas.max != ULONG_MAX))) {
+ pr_err("Invalid size %u of %p\n", end, mas_mn(&mas));
}
+
mas_validate_parent_slot(&mas);
- mas_validate_child_slot(&mas);
mas_validate_limits(&mas);
+ mas_validate_child_slot(&mas);
if (mt_is_alloc(mt))
mas_validate_gaps(&mas);
mas_dfs_postorder(&mas, ULONG_MAX);
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index 8d4c92cbdd0c..0674aebd4423 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -44,6 +44,8 @@ atomic_t maple_tree_tests_passed;
/* #define BENCH_WALK */
/* #define BENCH_MT_FOR_EACH */
/* #define BENCH_FORK */
+/* #define BENCH_MAS_FOR_EACH */
+/* #define BENCH_MAS_PREV */
#ifdef __KERNEL__
#define mt_set_non_kernel(x) do {} while (0)
@@ -1157,6 +1159,71 @@ static noinline void __init check_ranges(struct maple_tree *mt)
MT_BUG_ON(mt, !mt_height(mt));
mtree_destroy(mt);
+ /* Check in-place modifications */
+ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+ /* Append to the start of last range */
+ mt_set_non_kernel(50);
+ for (i = 0; i <= 500; i++) {
+ val = i * 5 + 1;
+ val2 = val + 4;
+ check_store_range(mt, val, val2, xa_mk_value(val), 0);
+ }
+
+ /* Append to the last range without touching any boundaries */
+ for (i = 0; i < 10; i++) {
+ val = val2 + 5;
+ val2 = val + 4;
+ check_store_range(mt, val, val2, xa_mk_value(val), 0);
+ }
+
+ /* Append to the end of last range */
+ val = val2;
+ for (i = 0; i < 10; i++) {
+ val += 5;
+ MT_BUG_ON(mt, mtree_test_store_range(mt, val, ULONG_MAX,
+ xa_mk_value(val)) != 0);
+ }
+
+ /* Overwriting the range and over a part of the next range */
+ for (i = 10; i < 30; i += 2) {
+ val = i * 5 + 1;
+ val2 = val + 5;
+ check_store_range(mt, val, val2, xa_mk_value(val), 0);
+ }
+
+ /* Overwriting a part of the range and over the next range */
+ for (i = 50; i < 70; i += 2) {
+ val2 = i * 5;
+ val = val2 - 5;
+ check_store_range(mt, val, val2, xa_mk_value(val), 0);
+ }
+
+ /*
+ * Expand the range, only partially overwriting the previous and
+ * next ranges
+ */
+ for (i = 100; i < 130; i += 3) {
+ val = i * 5 - 5;
+ val2 = i * 5 + 1;
+ check_store_range(mt, val, val2, xa_mk_value(val), 0);
+ }
+
+ /*
+ * Expand the range, only partially overwriting the previous and
+ * next ranges, in RCU mode
+ */
+ mt_set_in_rcu(mt);
+ for (i = 150; i < 180; i += 3) {
+ val = i * 5 - 5;
+ val2 = i * 5 + 1;
+ check_store_range(mt, val, val2, xa_mk_value(val), 0);
+ }
+
+ MT_BUG_ON(mt, !mt_height(mt));
+ mt_validate(mt);
+ mt_set_non_kernel(0);
+ mtree_destroy(mt);
+
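[Editorial aside: the new in-place cases above are driven through ordinary store calls. As a hedged standalone sketch (tree and values chosen purely for illustration), storing a range that only partially overwrites both neighbours is the case the reworked mas_wr_slot_store() is expected to absorb without rebuilding the node when the tree is not in RCU mode.]

	DEFINE_MTREE(tree);

	mtree_store_range(&tree, 0, 4, xa_mk_value(0), GFP_KERNEL);
	mtree_store_range(&tree, 5, 9, xa_mk_value(5), GFP_KERNEL);
	mtree_store_range(&tree, 10, 14, xa_mk_value(10), GFP_KERNEL);

	/* Partially overwrites [0,4] and [10,14] while replacing [5,9]. */
	mtree_store_range(&tree, 3, 11, xa_mk_value(3), GFP_KERNEL);

	mtree_destroy(&tree);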
/* Test rebalance gaps */
mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
mt_set_non_kernel(50);
@@ -1705,6 +1772,66 @@ static noinline void __init bench_mt_for_each(struct maple_tree *mt)
}
#endif
+#if defined(BENCH_MAS_FOR_EACH)
+static noinline void __init bench_mas_for_each(struct maple_tree *mt)
+{
+ int i, count = 1000000;
+ unsigned long max = 2500;
+ void *entry;
+ MA_STATE(mas, mt, 0, 0);
+
+ for (i = 0; i < max; i += 5) {
+ int gap = 4;
+
+ if (i % 30 == 0)
+ gap = 3;
+ mtree_store_range(mt, i, i + gap, xa_mk_value(i), GFP_KERNEL);
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < count; i++) {
+ unsigned long j = 0;
+
+ mas_for_each(&mas, entry, max) {
+ MT_BUG_ON(mt, entry != xa_mk_value(j));
+ j += 5;
+ }
+ mas_set(&mas, 0);
+ }
+ rcu_read_unlock();
+
+}
+#endif
+#if defined(BENCH_MAS_PREV)
+static noinline void __init bench_mas_prev(struct maple_tree *mt)
+{
+ int i, count = 1000000;
+ unsigned long max = 2500;
+ void *entry;
+ MA_STATE(mas, mt, 0, 0);
+
+ for (i = 0; i < max; i += 5) {
+ int gap = 4;
+
+ if (i % 30 == 0)
+ gap = 3;
+ mtree_store_range(mt, i, i + gap, xa_mk_value(i), GFP_KERNEL);
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < count; i++) {
+ unsigned long j = 2495;
+
+ mas_set(&mas, ULONG_MAX);
+ while ((entry = mas_prev(&mas, 0)) != NULL) {
+ MT_BUG_ON(mt, entry != xa_mk_value(j));
+ j -= 5;
+ }
+ }
+ rcu_read_unlock();
+
+}
+#endif
/* check_forking - simulate the kernel forking sequence with the tree. */
static noinline void __init check_forking(struct maple_tree *mt)
{
@@ -3433,6 +3560,20 @@ static int __init maple_tree_seed(void)
mtree_destroy(&tree);
goto skip;
#endif
+#if defined(BENCH_MAS_FOR_EACH)
+#define BENCH
+ mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
+ bench_mas_for_each(&tree);
+ mtree_destroy(&tree);
+ goto skip;
+#endif
+#if defined(BENCH_MAS_PREV)
+#define BENCH
+ mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
+ bench_mas_prev(&tree);
+ mtree_destroy(&tree);
+ goto skip;
+#endif
mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
check_iteration(&tree);
diff --git a/lib/test_meminit.c b/lib/test_meminit.c
index 60e1984c060f..0ae35223d773 100644
--- a/lib/test_meminit.c
+++ b/lib/test_meminit.c
@@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures)
int failures = 0, num_tests = 0;
int i;
- for (i = 0; i < 10; i++)
+ for (i = 0; i <= MAX_ORDER; i++)
num_tests += do_alloc_pages_order(i, &failures);
REPORT_FAILURES_IN_FN();
diff --git a/mm/Kconfig b/mm/Kconfig
index 09130434e30d..721dc88423c7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -25,7 +25,6 @@ menuconfig SWAP
config ZSWAP
bool "Compressed cache for swap pages"
depends on SWAP
- select FRONTSWAP
select CRYPTO
select ZPOOL
help
@@ -487,7 +486,10 @@ config SPARSEMEM_VMEMMAP
# Select this config option from the architecture Kconfig, if it is preferred
# to enable the feature of HugeTLB/dev_dax vmemmap optimization.
#
-config ARCH_WANT_OPTIMIZE_VMEMMAP
+config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+ bool
+
+config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
bool
config HAVE_MEMBLOCK_PHYS_MAP
@@ -569,6 +571,9 @@ config MHP_MEMMAP_ON_MEMORY
endif # MEMORY_HOTPLUG
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+ bool
+
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
@@ -870,9 +875,6 @@ config USE_PERCPU_NUMA_NODE_ID
config HAVE_SETUP_PER_CPU_AREA
bool
-config FRONTSWAP
- bool
-
config CMA
bool "Contiguous Memory Allocator"
depends on MMU
@@ -1144,6 +1146,9 @@ config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY
config IO_MAPPING
bool
+config MEMFD_CREATE
+ bool "Enable memfd_create() system call" if EXPERT
+
config SECRETMEM
default y
bool "Enable memfd_secret() system call" if EXPERT
diff --git a/mm/Makefile b/mm/Makefile
index 678530a07326..e6d9a1d5e84d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -72,7 +72,6 @@ ifdef CONFIG_MMU
endif
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o
-obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3ffc3cfa7a14..1e3447bccdb1 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -16,6 +16,7 @@
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>
+#include "internal.h"
struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
@@ -34,8 +35,6 @@ LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
-#define K(x) ((x) << (PAGE_SHIFT - 10))
-
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -733,9 +732,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
might_alloc(gfp);
- if (!memcg_css->parent)
- return &bdi->wb;
-
do {
wb = wb_get_lookup(bdi, memcg_css);
} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
diff --git a/mm/cma.c b/mm/cma.c
index a4cfe995e11e..4880f72102fa 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -436,8 +436,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
if (!cma || !cma->count || !cma->bitmap)
goto out;
- pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma,
- count, align);
+ pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
+ (void *)cma, cma->name, count, align);
if (!count)
goto out;
diff --git a/mm/compaction.c b/mm/compaction.c
index eacca2794e47..38c8d216c6a3 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -249,11 +249,36 @@ static unsigned long skip_offline_sections(unsigned long start_pfn)
return 0;
}
+
+/*
+ * If the PFN falls into an offline section, return the end PFN of the
+ * next online section in reverse. If the PFN falls into an online section
+ * or if there is no next online section in reverse, return 0.
+ */
+static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
+{
+ unsigned long start_nr = pfn_to_section_nr(start_pfn);
+
+ if (!start_nr || online_section_nr(start_nr))
+ return 0;
+
+ while (start_nr-- > 0) {
+ if (online_section_nr(start_nr))
+ return section_nr_to_pfn(start_nr) + PAGES_PER_SECTION;
+ }
+
+ return 0;
+}
#else
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
return 0;
}
+
+static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
+{
+ return 0;
+}
#endif
/*
@@ -438,12 +463,13 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
struct zone *zone = cc->zone;
- pfn = pageblock_end_pfn(pfn);
-
/* Set for isolation rather than compaction */
if (cc->no_set_skip_hint)
return;
+ pfn = pageblock_end_pfn(pfn);
+
+ /* Update where async and sync compaction should restart */
if (pfn > zone->compact_cached_migrate_pfn[0])
zone->compact_cached_migrate_pfn[0] = pfn;
if (cc->mode != MIGRATE_ASYNC &&
@@ -465,7 +491,6 @@ static void update_pageblock_skip(struct compact_control *cc,
set_pageblock_skip(page);
- /* Update where async and sync compaction should restart */
if (pfn < zone->compact_cached_free_pfn)
zone->compact_cached_free_pfn = pfn;
}
@@ -564,7 +589,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
- struct page *cursor;
+ struct page *page;
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
@@ -574,12 +599,11 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (strict)
stride = 1;
- cursor = pfn_to_page(blockpfn);
+ page = pfn_to_page(blockpfn);
/* Isolate free pages. */
- for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
+ for (; blockpfn < end_pfn; blockpfn += stride, page += stride) {
int isolated;
- struct page *page = cursor;
/*
* Periodically drop the lock (if held) regardless of its
@@ -604,7 +628,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (likely(order <= MAX_ORDER)) {
blockpfn += (1UL << order) - 1;
- cursor += (1UL << order) - 1;
+ page += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
}
goto isolate_fail;
@@ -641,14 +665,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
}
/* Advance to the end of split page */
blockpfn += isolated - 1;
- cursor += isolated - 1;
+ page += isolated - 1;
continue;
isolate_fail:
if (strict)
break;
- else
- continue;
}
@@ -715,8 +737,6 @@ isolate_freepages_range(struct compact_control *cc,
/* Protect pfn from changing by isolate_freepages_block */
unsigned long isolate_start_pfn = pfn;
- block_end_pfn = min(block_end_pfn, end_pfn);
-
/*
* pfn could pass the block_end_pfn if isolated freepage
* is more than pageblock order. In this case, we adjust
@@ -725,9 +745,10 @@ isolate_freepages_range(struct compact_control *cc,
if (pfn >= block_end_pfn) {
block_start_pfn = pageblock_start_pfn(pfn);
block_end_pfn = pageblock_end_pfn(pfn);
- block_end_pfn = min(block_end_pfn, end_pfn);
}
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
if (!pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, cc->zone))
break;
@@ -1076,13 +1097,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
bool migrate_dirty;
/*
- * Only pages without mappings or that have a
- * ->migrate_folio callback are possible to migrate
- * without blocking. However, we can be racing with
- * truncation so it's necessary to lock the page
- * to stabilise the mapping as truncation holds
- * the page lock until after the page is removed
- * from the page cache.
+ * Only folios without mappings or that have
+ * a ->migrate_folio callback are possible to
+ * migrate without blocking. However, we may
+ * be racing with truncation, which can free
+ * the mapping. Truncation holds the folio lock
+ * until after the folio is removed from the page
+ * cache so holding it ourselves is sufficient.
*/
if (!folio_trylock(folio))
goto isolate_fail_put;
@@ -1120,6 +1141,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
skip_updated = true;
if (test_and_set_skip(cc, valid_page) &&
!cc->finish_pageblock) {
+ low_pfn = end_pfn;
goto isolate_abort;
}
}
@@ -1421,10 +1443,8 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn)
isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
- if (start_pfn == end_pfn)
+ if (start_pfn == end_pfn && !cc->no_set_skip_hint)
set_pageblock_skip(page);
-
- return;
}
/* Search orders in round-robin fashion */
@@ -1501,7 +1521,7 @@ static void fast_isolate_freepages(struct compact_control *cc)
spin_lock_irqsave(&cc->zone->lock, flags);
freelist = &area->free_list[MIGRATE_MOVABLE];
- list_for_each_entry_reverse(freepage, freelist, lru) {
+ list_for_each_entry_reverse(freepage, freelist, buddy_list) {
unsigned long pfn;
order_scanned++;
@@ -1530,7 +1550,7 @@ static void fast_isolate_freepages(struct compact_control *cc)
break;
}
- /* Use a minimum pfn if a preferred one was not found */
+ /* Use a maximum candidate pfn if a preferred one was not found */
if (!page && high_pfn) {
page = pfn_to_page(high_pfn);
@@ -1669,8 +1689,15 @@ static void isolate_freepages(struct compact_control *cc)
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
- if (!page)
+ if (!page) {
+ unsigned long next_pfn;
+
+ next_pfn = skip_offline_sections_reverse(block_start_pfn);
+ if (next_pfn)
+ block_start_pfn = max(next_pfn, low_pfn);
+
continue;
+ }
/* Check the block is suitable for migration */
if (!suitable_migration_target(cc, page))
@@ -1686,7 +1713,8 @@ static void isolate_freepages(struct compact_control *cc)
/* Update the skip hint if the full pageblock was scanned */
if (isolate_start_pfn == block_end_pfn)
- update_pageblock_skip(cc, page, block_start_pfn);
+ update_pageblock_skip(cc, page, block_start_pfn -
+ pageblock_nr_pages);
/* Are enough freepages isolated? */
if (cc->nr_freepages >= cc->nr_migratepages) {
@@ -1884,7 +1912,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
spin_lock_irqsave(&cc->zone->lock, flags);
freelist = &area->free_list[MIGRATE_MOVABLE];
- list_for_each_entry(freepage, freelist, lru) {
+ list_for_each_entry(freepage, freelist, buddy_list) {
unsigned long free_pfn;
if (nr_scanned++ >= limit) {
@@ -1958,9 +1986,9 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
block_start_pfn = cc->zone->zone_start_pfn;
/*
- * fast_find_migrateblock marks a pageblock skipped so to avoid
- * the isolation_suitable check below, check whether the fast
- * search was successful.
+	 * fast_find_migrateblock() has already ensured the pageblock does not
+	 * have the skip flag set, so check whether the fast search was
+	 * successful to avoid repeating the isolation_suitable check below.
*/
fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
@@ -2114,7 +2142,7 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
return score;
}
-static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+static unsigned int fragmentation_score_wmark(bool low)
{
unsigned int wmark_low;
@@ -2134,7 +2162,7 @@ static bool should_proactive_compact_node(pg_data_t *pgdat)
if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
return false;
- wmark_high = fragmentation_score_wmark(pgdat, false);
+ wmark_high = fragmentation_score_wmark(false);
return fragmentation_score_node(pgdat) > wmark_high;
}
@@ -2173,7 +2201,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
return COMPACT_PARTIAL_SKIPPED;
score = fragmentation_score_zone(cc->zone);
- wmark_low = fragmentation_score_wmark(pgdat, true);
+ wmark_low = fragmentation_score_wmark(true);
if (score > wmark_low)
ret = COMPACT_CONTINUE;
@@ -2480,7 +2508,8 @@ rescan:
goto check_drain;
case ISOLATE_SUCCESS:
update_cached = false;
- last_migrated_pfn = iteration_start_pfn;
+ last_migrated_pfn = max(cc->zone->zone_start_pfn,
+ pageblock_start_pfn(cc->migrate_pfn - 1));
}
err = migrate_pages(&cc->migratepages, compaction_alloc,
@@ -2503,7 +2532,7 @@ rescan:
}
/*
* If an ASYNC or SYNC_LIGHT fails to migrate a page
- * within the current order-aligned block and
+ * within the pageblock_order-aligned block and
* fast_find_migrateblock may be used then scan the
* remainder of the pageblock. This will mark the
* pageblock "skip" to avoid rescanning in the near
@@ -2869,7 +2898,7 @@ int compaction_register_node(struct node *node)
void compaction_unregister_node(struct node *node)
{
- return device_remove_file(&node->dev, &dev_attr_compact);
+ device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index bb07721909e1..6cc8b245586d 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -341,6 +341,78 @@ static void damon_test_set_attrs(struct kunit *test)
KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
}
+static void damos_test_new_filter(struct kunit *test)
+{
+ struct damos_filter *filter;
+
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON);
+ KUNIT_EXPECT_EQ(test, filter->matching, true);
+ KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list);
+ KUNIT_EXPECT_PTR_EQ(test, filter->list.next, &filter->list);
+ damos_destroy_filter(filter);
+}
+
+static void damos_test_filter_out(struct kunit *test)
+{
+ struct damon_target *t;
+ struct damon_region *r, *r2;
+ struct damos_filter *f;
+
+ f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true);
+ f->addr_range = (struct damon_addr_range){
+ .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6};
+
+ t = damon_new_target();
+ r = damon_new_region(DAMON_MIN_REGION * 3, DAMON_MIN_REGION * 5);
+ damon_add_region(r, t);
+
+ /* region in the range */
+ KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
+
+ /* region before the range */
+ r->ar.start = DAMON_MIN_REGION * 1;
+ r->ar.end = DAMON_MIN_REGION * 2;
+ KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
+
+ /* region after the range */
+ r->ar.start = DAMON_MIN_REGION * 6;
+ r->ar.end = DAMON_MIN_REGION * 8;
+ KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
+
+ /* region started before the range */
+ r->ar.start = DAMON_MIN_REGION * 1;
+ r->ar.end = DAMON_MIN_REGION * 4;
+ KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ /* filter should have split the region */
+ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1);
+ KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2);
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2);
+ r2 = damon_next_region(r);
+ KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 2);
+ KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 4);
+ damon_destroy_region(r2, t);
+
+ /* region started in the range */
+ r->ar.start = DAMON_MIN_REGION * 2;
+ r->ar.end = DAMON_MIN_REGION * 8;
+ KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f));
+ /* filter should have split the region */
+ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2);
+ KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6);
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2);
+ r2 = damon_next_region(r);
+ KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 6);
+ KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 8);
+ damon_destroy_region(r2, t);
+
+ damon_free_target(t);
+ damos_free_filter(f);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -353,6 +425,8 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_set_regions),
KUNIT_CASE(damon_test_update_monitoring_result),
KUNIT_CASE(damon_test_set_attrs),
+ KUNIT_CASE(damos_test_new_filter),
+ KUNIT_CASE(damos_test_filter_out),
{},
};
diff --git a/mm/damon/core.c b/mm/damon/core.c
index eb9580942a5c..bcd2bd9d6c10 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -878,6 +878,66 @@ static void damos_update_stat(struct damos *s,
s->stat.sz_applied += sz_applied;
}
+static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos_filter *filter)
+{
+ bool matched = false;
+ struct damon_target *ti;
+ int target_idx = 0;
+ unsigned long start, end;
+
+ switch (filter->type) {
+ case DAMOS_FILTER_TYPE_TARGET:
+ damon_for_each_target(ti, ctx) {
+ if (ti == t)
+ break;
+ target_idx++;
+ }
+ matched = target_idx == filter->target_idx;
+ break;
+ case DAMOS_FILTER_TYPE_ADDR:
+ start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION);
+ end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION);
+
+ /* inside the range */
+ if (start <= r->ar.start && r->ar.end <= end) {
+ matched = true;
+ break;
+ }
+ /* outside of the range */
+ if (r->ar.end <= start || end <= r->ar.start) {
+ matched = false;
+ break;
+ }
+ /* start before the range and overlap */
+ if (r->ar.start < start) {
+ damon_split_region_at(t, r, start - r->ar.start);
+ matched = false;
+ break;
+ }
+ /* start inside the range */
+ damon_split_region_at(t, r, end - r->ar.start);
+ matched = true;
+ break;
+ default:
+ break;
+ }
+
+ return matched == filter->matching;
+}
+
+static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s)
+{
+ struct damos_filter *filter;
+
+ damos_for_each_filter(filter, s) {
+ if (__damos_filter_out(ctx, t, r, filter))
+ return true;
+ }
+ return false;
+}
+
static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
struct damon_region *r, struct damos *s)
{
@@ -895,6 +955,8 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
goto update_stat;
damon_split_region_at(t, r, sz);
}
+ if (damos_filter_out(c, t, r, s))
+ return;
ktime_get_coarse_ts64(&begin);
if (c->callback.before_damos_apply)
err = c->callback.before_damos_apply(c, t, r, s);
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index e940802a15a4..ac1c3fa80f98 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -54,7 +54,7 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr
void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct folio *folio = damon_get_folio(pmd_pfn(*pmd));
+ struct folio *folio = damon_get_folio(pmd_pfn(pmdp_get(pmd)));
if (!folio)
return;
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 40801e38fcf0..909db25efb35 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -94,7 +94,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- *accessed = pmd_young(*pvmw.pmd) ||
+ *accessed = pmd_young(pmdp_get(pvmw.pmd)) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
#else
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index db677eba78fd..fd482a0639b4 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -47,7 +47,7 @@ void damon_sysfs_schemes_update_stats(
int damon_sysfs_schemes_update_regions_start(
struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx);
+ struct damon_ctx *ctx, bool total_bytes_only);
int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 50cf89dcd898..527e7d17eb3b 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -117,6 +117,7 @@ struct damon_sysfs_scheme_regions {
struct kobject kobj;
struct list_head regions_list;
int nr_regions;
+ unsigned long total_bytes;
};
static struct damon_sysfs_scheme_regions *
@@ -128,9 +129,19 @@ damon_sysfs_scheme_regions_alloc(void)
regions->kobj = (struct kobject){};
INIT_LIST_HEAD(&regions->regions_list);
regions->nr_regions = 0;
+ regions->total_bytes = 0;
return regions;
}
+static ssize_t total_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_regions *regions = container_of(kobj,
+ struct damon_sysfs_scheme_regions, kobj);
+
+ return sysfs_emit(buf, "%lu\n", regions->total_bytes);
+}
+
static void damon_sysfs_scheme_regions_rm_dirs(
struct damon_sysfs_scheme_regions *regions)
{
@@ -148,7 +159,11 @@ static void damon_sysfs_scheme_regions_release(struct kobject *kobj)
kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj));
}
+static struct kobj_attribute damon_sysfs_scheme_regions_total_bytes_attr =
+ __ATTR_RO_MODE(total_bytes, 0400);
+
static struct attribute *damon_sysfs_scheme_regions_attrs[] = {
+ &damon_sysfs_scheme_regions_total_bytes_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions);
@@ -267,6 +282,8 @@ struct damon_sysfs_scheme_filter {
enum damos_filter_type type;
bool matching;
char *memcg_path;
+ struct damon_addr_range addr_range;
+ int target_idx;
};
static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
@@ -278,6 +295,8 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
static const char * const damon_sysfs_scheme_filter_type_strs[] = {
"anon",
"memcg",
+ "addr",
+ "target",
};
static ssize_t type_show(struct kobject *kobj,
@@ -358,6 +377,63 @@ static ssize_t memcg_path_store(struct kobject *kobj,
return count;
}
+static ssize_t addr_start_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%lu\n", filter->addr_range.start);
+}
+
+static ssize_t addr_start_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoul(buf, 0, &filter->addr_range.start);
+
+ return err ? err : count;
+}
+
+static ssize_t addr_end_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%lu\n", filter->addr_range.end);
+}
+
+static ssize_t addr_end_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoul(buf, 0, &filter->addr_range.end);
+
+ return err ? err : count;
+}
+
+static ssize_t damon_target_idx_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%d\n", filter->target_idx);
+}
+
+static ssize_t damon_target_idx_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoint(buf, 0, &filter->target_idx);
+
+ return err ? err : count;
+}
+
static void damon_sysfs_scheme_filter_release(struct kobject *kobj)
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
@@ -376,10 +452,22 @@ static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr =
static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr =
__ATTR_RW_MODE(memcg_path, 0600);
+static struct kobj_attribute damon_sysfs_scheme_filter_addr_start_attr =
+ __ATTR_RW_MODE(addr_start, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_addr_end_attr =
+ __ATTR_RW_MODE(addr_end, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr =
+ __ATTR_RW_MODE(damon_target_idx, 0600);
+
static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
&damon_sysfs_scheme_filter_type_attr.attr,
&damon_sysfs_scheme_filter_matching_attr.attr,
&damon_sysfs_scheme_filter_memcg_path_attr.attr,
+ &damon_sysfs_scheme_filter_addr_start_attr.attr,
+ &damon_sysfs_scheme_filter_addr_end_attr.attr,
+ &damon_sysfs_scheme_filter_damon_target_idx_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter);
@@ -1469,7 +1557,17 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme,
damos_destroy_filter(filter);
return err;
}
+ } else if (filter->type == DAMOS_FILTER_TYPE_ADDR) {
+ if (sysfs_filter->addr_range.end <
+ sysfs_filter->addr_range.start) {
+ damos_destroy_filter(filter);
+ return -EINVAL;
+ }
+ filter->addr_range = sysfs_filter->addr_range;
+ } else if (filter->type == DAMOS_FILTER_TYPE_TARGET) {
+ filter->target_idx = sysfs_filter->target_idx;
}
+
damos_add_filter(scheme, filter);
}
return 0;
@@ -1620,6 +1718,7 @@ void damon_sysfs_schemes_update_stats(
*/
static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback;
static int damon_sysfs_schemes_region_idx;
+static bool damos_regions_upd_total_bytes_only;
/*
* DAMON callback that called before damos apply. While this callback is
@@ -1648,6 +1747,10 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
return 0;
sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions;
+ sysfs_regions->total_bytes += r->ar.end - r->ar.start;
+ if (damos_regions_upd_total_bytes_only)
+ return 0;
+
region = damon_sysfs_scheme_region_alloc(r);
list_add_tail(&region->list, &sysfs_regions->regions_list);
sysfs_regions->nr_regions++;
@@ -1678,6 +1781,7 @@ int damon_sysfs_schemes_clear_regions(
sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++];
damon_sysfs_scheme_regions_rm_dirs(
sysfs_scheme->tried_regions);
+ sysfs_scheme->tried_regions->total_bytes = 0;
}
return 0;
}
@@ -1685,10 +1789,11 @@ int damon_sysfs_schemes_clear_regions(
/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
int damon_sysfs_schemes_update_regions_start(
struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx)
+ struct damon_ctx *ctx, bool total_bytes_only)
{
damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx);
damon_sysfs_schemes_for_damos_callback = sysfs_schemes;
+ damos_regions_upd_total_bytes_only = total_bytes_only;
ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
return 0;
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 33e1d5c9cb54..b86ba7b0a921 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1000,6 +1000,11 @@ enum damon_sysfs_cmd {
*/
DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS,
/*
+ * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: Update
+ * tried_regions/total_bytes sysfs files for each scheme.
+ */
+ DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES,
+ /*
* @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried
* regions
*/
@@ -1021,6 +1026,7 @@ static const char * const damon_sysfs_cmd_strs[] = {
"off",
"commit",
"update_schemes_stats",
+ "update_schemes_tried_bytes",
"update_schemes_tried_regions",
"clear_schemes_tried_regions",
};
@@ -1206,12 +1212,14 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
struct damon_sysfs_kdamond *kdamond;
+ enum damon_sysfs_cmd cmd;
/* damon_sysfs_schemes_update_regions_stop() might not yet called */
kdamond = damon_sysfs_cmd_request.kdamond;
- if (kdamond && damon_sysfs_cmd_request.cmd ==
- DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS &&
- ctx == kdamond->damon_ctx) {
+ cmd = damon_sysfs_cmd_request.cmd;
+ if (kdamond && ctx == kdamond->damon_ctx &&
+ (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS ||
+ cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES)) {
damon_sysfs_schemes_update_regions_stop(ctx);
mutex_unlock(&damon_sysfs_lock);
}
@@ -1248,14 +1256,15 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond)
}
static int damon_sysfs_upd_schemes_regions_start(
- struct damon_sysfs_kdamond *kdamond)
+ struct damon_sysfs_kdamond *kdamond, bool total_bytes_only)
{
struct damon_ctx *ctx = kdamond->damon_ctx;
if (!ctx)
return -EINVAL;
return damon_sysfs_schemes_update_regions_start(
- kdamond->contexts->contexts_arr[0]->schemes, ctx);
+ kdamond->contexts->contexts_arr[0]->schemes, ctx,
+ total_bytes_only);
}
static int damon_sysfs_upd_schemes_regions_stop(
@@ -1332,6 +1341,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c)
{
struct damon_sysfs_kdamond *kdamond;
static bool damon_sysfs_schemes_regions_updating;
+ bool total_bytes_only = false;
int err = 0;
/* avoid deadlock due to concurrent state_store('off') */
@@ -1348,9 +1358,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c)
case DAMON_SYSFS_CMD_COMMIT:
err = damon_sysfs_commit_input(kdamond);
break;
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES:
+ total_bytes_only = true;
+ fallthrough;
case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS:
if (!damon_sysfs_schemes_regions_updating) {
- err = damon_sysfs_upd_schemes_regions_start(kdamond);
+ err = damon_sysfs_upd_schemes_regions_start(kdamond,
+ total_bytes_only);
if (!err) {
damon_sysfs_schemes_regions_updating = true;
goto keep_lock_out;
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index e0e59d420fca..4c81a9dbd044 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -301,16 +301,19 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
pte_t *pte;
+ pmd_t pmde;
spinlock_t *ptl;
- if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_huge(pmdp_get(pmd))) {
ptl = pmd_lock(walk->mm, pmd);
- if (!pmd_present(*pmd)) {
+ pmde = pmdp_get(pmd);
+
+ if (!pmd_present(pmde)) {
spin_unlock(ptl);
return 0;
}
- if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_huge(pmde)) {
damon_pmdp_mkold(pmd, walk->vma, addr);
spin_unlock(ptl);
return 0;
@@ -440,21 +443,25 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
struct damon_young_walk_private *priv = walk->private;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_huge(pmdp_get(pmd))) {
+ pmd_t pmde;
+
ptl = pmd_lock(walk->mm, pmd);
- if (!pmd_present(*pmd)) {
+ pmde = pmdp_get(pmd);
+
+ if (!pmd_present(pmde)) {
spin_unlock(ptl);
return 0;
}
- if (!pmd_trans_huge(*pmd)) {
+ if (!pmd_trans_huge(pmde)) {
spin_unlock(ptl);
goto regular_page;
}
- folio = damon_get_folio(pmd_pfn(*pmd));
+ folio = damon_get_folio(pmd_pfn(pmde));
if (!folio)
goto huge_out;
- if (pmd_young(*pmd) || !folio_test_idle(folio) ||
+ if (pmd_young(pmde) || !folio_test_idle(folio) ||
mmu_notifier_test_young(walk->mm,
addr))
priv->young = true;
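
Both walkers now sample the PMD once with pmdp_get() and test only that local snapshot, so the present/huge checks cannot be defeated by a concurrent split or zap between two dereferences of *pmd. Condensed to its core (a kernel-context fragment, not standalone), the pattern is:

	pmd_t pmde;
	spinlock_t *ptl;

	ptl = pmd_lock(walk->mm, pmd);
	pmde = pmdp_get(pmd);	/* single snapshot; *pmd is never re-read */

	if (pmd_present(pmde) && pmd_trans_huge(pmde)) {
		/* ... act on the huge mapping described by pmde ... */
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);	/* raced with split/zap: fall back to the PTE path */
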
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index ee119e33fef1..d61eaa075c75 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -302,7 +302,7 @@ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx)
unsigned long val = idx, *ptr = &val;
pud_t pud;
- if (!has_transparent_hugepage())
+ if (!has_transparent_pud_hugepage())
return;
pr_debug("Validating PUD basic (%pGv)\n", ptr);
@@ -343,7 +343,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
unsigned long vaddr = args->vaddr;
pud_t pud;
- if (!has_transparent_hugepage())
+ if (!has_transparent_pud_hugepage())
return;
page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL;
@@ -385,7 +385,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
#ifndef __PAGETABLE_PMD_FOLDED
- pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1);
+ pudp_huge_get_and_clear_full(args->vma, vaddr, args->pudp, 1);
pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
@@ -405,7 +405,7 @@ static void __init pud_leaf_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!has_transparent_hugepage())
+ if (!has_transparent_pud_hugepage())
return;
pr_debug("Validating PUD leaf\n");
@@ -732,7 +732,7 @@ static void __init pud_devmap_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!has_transparent_hugepage())
+ if (!has_transparent_pud_hugepage())
return;
pr_debug("Validating PUD devmap\n");
@@ -981,7 +981,7 @@ static void __init pud_thp_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!has_transparent_hugepage())
+ if (!has_transparent_pud_hugepage())
return;
pr_debug("Validating PUD based THP\n");
@@ -1022,8 +1022,7 @@ static void __init destroy_args(struct pgtable_debug_args *args)
/* Free (huge) page */
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
- has_transparent_hugepage() &&
+ has_transparent_pud_hugepage() &&
args->pud_pfn != ULONG_MAX) {
if (args->is_contiguous_page) {
free_contig_range(args->pud_pfn,
@@ -1274,8 +1273,7 @@ static int __init init_args(struct pgtable_debug_args *args)
* if we fail to allocate (huge) pages.
*/
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
- has_transparent_hugepage()) {
+ has_transparent_pud_hugepage()) {
page = debug_vm_pgtable_alloc_huge_page(args,
HPAGE_PUD_SHIFT - PAGE_SHIFT);
if (page) {
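
has_transparent_pud_hugepage() is used above as a single predicate replacing the old two-part test; judging from the converted call sites, it presumably folds the config check and the runtime check together, roughly:

/* presumed shape, inferred from the converted call sites above */
static inline bool has_transparent_pud_hugepage(void)
{
	return IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
	       has_transparent_hugepage();
}
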
diff --git a/mm/filemap.c b/mm/filemap.c
index 9e44a49bbd74..dd022b065614 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2075,7 +2075,7 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
if (!xa_is_value(folio)) {
if (folio->index < *start)
goto put;
- if (folio->index + folio_nr_pages(folio) - 1 > end)
+ if (folio_next_index(folio) - 1 > end)
goto put;
if (!folio_trylock(folio))
goto put;
@@ -2174,7 +2174,7 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
return false;
if (index >= max)
return false;
- return index < folio->index + folio_nr_pages(folio) - 1;
+ return index < folio_next_index(folio) - 1;
}
/**
@@ -2242,7 +2242,7 @@ update_start:
if (folio_test_hugetlb(folio))
*start = folio->index + 1;
else
- *start = folio->index + folio_nr_pages(folio);
+ *start = folio_next_index(folio);
}
out:
rcu_read_unlock();
@@ -2359,7 +2359,7 @@ static void filemap_get_read_batch(struct address_space *mapping,
break;
if (folio_test_readahead(folio))
break;
- xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);
+ xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
@@ -2632,6 +2632,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
int i, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
+ loff_t last_pos = ra->prev_pos;
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
@@ -2682,8 +2683,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
* When a read accesses the same folio several times, only
* mark it as accessed the first time.
*/
- if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1,
- fbatch.folios[0]))
+ if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
+ fbatch.folios[0]))
folio_mark_accessed(fbatch.folios[0]);
for (i = 0; i < folio_batch_count(&fbatch); i++) {
@@ -2710,7 +2711,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
already_read += copied;
iocb->ki_pos += copied;
- ra->prev_pos = iocb->ki_pos;
+ last_pos = iocb->ki_pos;
if (copied < bytes) {
error = -EFAULT;
@@ -2724,7 +2725,7 @@ put_folios:
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
file_accessed(filp);
-
+ ra->prev_pos = last_pos;
return already_read ? already_read : error;
}
EXPORT_SYMBOL_GPL(filemap_read);
@@ -4072,6 +4073,8 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
struct address_space * const mapping = folio->mapping;
BUG_ON(!folio_test_locked(folio));
+ if (!folio_needs_release(folio))
+ return true;
if (folio_test_writeback(folio))
return false;
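
folio_next_index() stands in for the open-coded folio->index + folio_nr_pages(folio), i.e. the first page-cache index after the folio; a helper with that behaviour would look like:

/* behaves like the expressions it replaces in the hunks above */
static inline pgoff_t folio_next_index(struct folio *folio)
{
	return folio->index + folio_nr_pages(folio);
}

/* so the last index covered by the folio is folio_next_index(folio) - 1 */
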
diff --git a/mm/frontswap.c b/mm/frontswap.c
deleted file mode 100644
index 2fb5df3384b8..000000000000
--- a/mm/frontswap.c
+++ /dev/null
@@ -1,283 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Frontswap frontend
- *
- * This code provides the generic "frontend" layer to call a matching
- * "backend" driver implementation of frontswap. See
- * Documentation/mm/frontswap.rst for more information.
- *
- * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
- * Author: Dan Magenheimer
- */
-
-#include <linux/mman.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
-#include <linux/security.h>
-#include <linux/module.h>
-#include <linux/debugfs.h>
-#include <linux/frontswap.h>
-#include <linux/swapfile.h>
-
-DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);
-
-/*
- * frontswap_ops are added by frontswap_register_ops, and provide the
- * frontswap "backend" implementation functions. Multiple implementations
- * may be registered, but implementations can never deregister. This
- * is a simple singly-linked list of all registered implementations.
- */
-static const struct frontswap_ops *frontswap_ops __read_mostly;
-
-#ifdef CONFIG_DEBUG_FS
-/*
- * Counters available via /sys/kernel/debug/frontswap (if debugfs is
- * properly configured). These are for information only so are not protected
- * against increment races.
- */
-static u64 frontswap_loads;
-static u64 frontswap_succ_stores;
-static u64 frontswap_failed_stores;
-static u64 frontswap_invalidates;
-
-static inline void inc_frontswap_loads(void)
-{
- data_race(frontswap_loads++);
-}
-static inline void inc_frontswap_succ_stores(void)
-{
- data_race(frontswap_succ_stores++);
-}
-static inline void inc_frontswap_failed_stores(void)
-{
- data_race(frontswap_failed_stores++);
-}
-static inline void inc_frontswap_invalidates(void)
-{
- data_race(frontswap_invalidates++);
-}
-#else
-static inline void inc_frontswap_loads(void) { }
-static inline void inc_frontswap_succ_stores(void) { }
-static inline void inc_frontswap_failed_stores(void) { }
-static inline void inc_frontswap_invalidates(void) { }
-#endif
-
-/*
- * Due to the asynchronous nature of the backends loading potentially
- * _after_ the swap system has been activated, we have chokepoints
- * on all frontswap functions to not call the backend until the backend
- * has registered.
- *
- * This would not guard us against the user deciding to call swapoff right as
- * we are calling the backend to initialize (so swapon is in action).
- * Fortunately for us, the swapon_mutex has been taken by the callee so we are
- * OK. The other scenario where calls to frontswap_store (called via
- * swap_writepage) is racing with frontswap_invalidate_area (called via
- * swapoff) is again guarded by the swap subsystem.
- *
- * While no backend is registered all calls to frontswap_[store|load|
- * invalidate_area|invalidate_page] are ignored or fail.
- *
- * The time between the backend being registered and the swap file system
- * calling the backend (via the frontswap_* functions) is indeterminate as
- * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
- * That is OK as we are comfortable missing some of these calls to the newly
- * registered backend.
- *
- * Obviously the opposite (unloading the backend) must be done after all
- * the frontswap_[store|load|invalidate_area|invalidate_page] start
- * ignoring or failing the requests. However, there is currently no way
- * to unload a backend once it is registered.
- */
-
-/*
- * Register operations for frontswap
- */
-int frontswap_register_ops(const struct frontswap_ops *ops)
-{
- if (frontswap_ops)
- return -EINVAL;
-
- frontswap_ops = ops;
- static_branch_inc(&frontswap_enabled_key);
- return 0;
-}
-
-/*
- * Called when a swap device is swapon'd.
- */
-void frontswap_init(unsigned type, unsigned long *map)
-{
- struct swap_info_struct *sis = swap_info[type];
-
- VM_BUG_ON(sis == NULL);
-
- /*
- * p->frontswap is a bitmap that we MUST have to figure out which page
- * has gone in frontswap. Without it there is no point of continuing.
- */
- if (WARN_ON(!map))
- return;
- /*
- * Regardless of whether the frontswap backend has been loaded
- * before this function or it will be later, we _MUST_ have the
- * p->frontswap set to something valid to work properly.
- */
- frontswap_map_set(sis, map);
-
- if (!frontswap_enabled())
- return;
- frontswap_ops->init(type);
-}
-
-static bool __frontswap_test(struct swap_info_struct *sis,
- pgoff_t offset)
-{
- if (sis->frontswap_map)
- return test_bit(offset, sis->frontswap_map);
- return false;
-}
-
-static inline void __frontswap_set(struct swap_info_struct *sis,
- pgoff_t offset)
-{
- set_bit(offset, sis->frontswap_map);
- atomic_inc(&sis->frontswap_pages);
-}
-
-static inline void __frontswap_clear(struct swap_info_struct *sis,
- pgoff_t offset)
-{
- clear_bit(offset, sis->frontswap_map);
- atomic_dec(&sis->frontswap_pages);
-}
-
-/*
- * "Store" data from a page to frontswap and associate it with the page's
- * swaptype and offset. Page must be locked and in the swap cache.
- * If frontswap already contains a page with matching swaptype and
- * offset, the frontswap implementation may either overwrite the data and
- * return success or invalidate the page from frontswap and return failure.
- */
-int __frontswap_store(struct page *page)
-{
- int ret = -1;
- swp_entry_t entry = { .val = page_private(page), };
- int type = swp_type(entry);
- struct swap_info_struct *sis = swap_info[type];
- pgoff_t offset = swp_offset(entry);
-
- VM_BUG_ON(!frontswap_ops);
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(sis == NULL);
-
- /*
- * If a dup, we must remove the old page first; we can't leave the
- * old page no matter if the store of the new page succeeds or fails,
- * and we can't rely on the new page replacing the old page as we may
- * not store to the same implementation that contains the old page.
- */
- if (__frontswap_test(sis, offset)) {
- __frontswap_clear(sis, offset);
- frontswap_ops->invalidate_page(type, offset);
- }
-
- ret = frontswap_ops->store(type, offset, page);
- if (ret == 0) {
- __frontswap_set(sis, offset);
- inc_frontswap_succ_stores();
- } else {
- inc_frontswap_failed_stores();
- }
-
- return ret;
-}
-
-/*
- * "Get" data from frontswap associated with swaptype and offset that were
- * specified when the data was put to frontswap and use it to fill the
- * specified page with data. Page must be locked and in the swap cache.
- */
-int __frontswap_load(struct page *page)
-{
- int ret = -1;
- swp_entry_t entry = { .val = page_private(page), };
- int type = swp_type(entry);
- struct swap_info_struct *sis = swap_info[type];
- pgoff_t offset = swp_offset(entry);
- bool exclusive = false;
-
- VM_BUG_ON(!frontswap_ops);
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(sis == NULL);
-
- if (!__frontswap_test(sis, offset))
- return -1;
-
- /* Try loading from each implementation, until one succeeds. */
- ret = frontswap_ops->load(type, offset, page, &exclusive);
- if (ret == 0) {
- inc_frontswap_loads();
- if (exclusive) {
- SetPageDirty(page);
- __frontswap_clear(sis, offset);
- }
- }
- return ret;
-}
-
-/*
- * Invalidate any data from frontswap associated with the specified swaptype
- * and offset so that a subsequent "get" will fail.
- */
-void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
-{
- struct swap_info_struct *sis = swap_info[type];
-
- VM_BUG_ON(!frontswap_ops);
- VM_BUG_ON(sis == NULL);
-
- if (!__frontswap_test(sis, offset))
- return;
-
- frontswap_ops->invalidate_page(type, offset);
- __frontswap_clear(sis, offset);
- inc_frontswap_invalidates();
-}
-
-/*
- * Invalidate all data from frontswap associated with all offsets for the
- * specified swaptype.
- */
-void __frontswap_invalidate_area(unsigned type)
-{
- struct swap_info_struct *sis = swap_info[type];
-
- VM_BUG_ON(!frontswap_ops);
- VM_BUG_ON(sis == NULL);
-
- if (sis->frontswap_map == NULL)
- return;
-
- frontswap_ops->invalidate_area(type);
- atomic_set(&sis->frontswap_pages, 0);
- bitmap_zero(sis->frontswap_map, sis->max);
-}
-
-static int __init init_frontswap(void)
-{
-#ifdef CONFIG_DEBUG_FS
- struct dentry *root = debugfs_create_dir("frontswap", NULL);
- if (root == NULL)
- return -ENXIO;
- debugfs_create_u64("loads", 0444, root, &frontswap_loads);
- debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores);
- debugfs_create_u64("failed_stores", 0444, root,
- &frontswap_failed_stores);
- debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates);
-#endif
- return 0;
-}
-
-module_init(init_frontswap);
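
The deleted frontend gated every hook on a static key that was only flipped once a backend registered, so the no-backend case costs nothing more than a patched-out branch. A hedged, generic sketch of that idiom (names are illustrative, not frontswap's):

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(my_backend_key);	/* illustrative name */

static inline bool my_backend_enabled(void)
{
	/* compiles to a patched-out branch until the key is flipped */
	return static_branch_unlikely(&my_backend_key);
}

int my_backend_register(void)
{
	static_branch_inc(&my_backend_key);
	return 0;
}
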
diff --git a/mm/gup.c b/mm/gup.c
index 6e2f9e9d6537..3bbfae411880 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -811,7 +811,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
struct follow_page_context *ctx)
{
pgd_t *pgd;
- struct page *page;
struct mm_struct *mm = vma->vm_mm;
ctx->page_mask = 0;
@@ -820,16 +819,10 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
* Call hugetlb_follow_page_mask for hugetlb vmas as it will use
* special hugetlb page table walking code. This eliminates the
* need to check for hugetlb entries in the general walking code.
- *
- * hugetlb_follow_page_mask is only for follow_page() handling here.
- * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
*/
- if (is_vm_hugetlb_page(vma)) {
- page = hugetlb_follow_page_mask(vma, address, flags);
- if (!page)
- page = no_page_table(vma, flags);
- return page;
- }
+ if (is_vm_hugetlb_page(vma))
+ return hugetlb_follow_page_mask(vma, address, flags,
+ &ctx->page_mask);
pgd = pgd_offset(mm, address);
@@ -1215,7 +1208,7 @@ static long __get_user_pages(struct mm_struct *mm,
if (!vma && in_gate_area(mm, start)) {
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
- pages ? &pages[i] : NULL);
+ pages ? &page : NULL);
if (ret)
goto out;
ctx.page_mask = 0;
@@ -1229,22 +1222,6 @@ static long __get_user_pages(struct mm_struct *mm,
ret = check_vma_flags(vma, gup_flags);
if (ret)
goto out;
-
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages,
- &start, &nr_pages, i,
- gup_flags, locked);
- if (!*locked) {
- /*
- * We've got a VM_FAULT_RETRY
- * and we've lost mmap_lock.
- * We must stop here.
- */
- BUG_ON(gup_flags & FOLL_NOWAIT);
- goto out;
- }
- continue;
- }
}
retry:
/*
@@ -1285,22 +1262,58 @@ retry:
ret = PTR_ERR(page);
goto out;
}
-
- goto next_page;
} else if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
}
- if (pages) {
- pages[i] = page;
- flush_anon_page(vma, page, start);
- flush_dcache_page(page);
- ctx.page_mask = 0;
- }
next_page:
page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
+
+ if (pages) {
+ struct page *subpage;
+ unsigned int j;
+
+ /*
+ * This must be a large folio (and doesn't need to
+ * be the whole folio; it can be part of it), do
+ * the refcount work for all the subpages too.
+ *
+ * NOTE: here the page may not be the head page
+ * e.g. when start addr is not thp-size aligned.
+ * try_grab_folio() should have taken care of tail
+ * pages.
+ */
+ if (page_increm > 1) {
+ struct folio *folio;
+
+ /*
+ * Since we already hold refcount on the
+ * large folio, this should never fail.
+ */
+ folio = try_grab_folio(page, page_increm - 1,
+ foll_flags);
+ if (WARN_ON_ONCE(!folio)) {
+ /*
+ * Release the 1st page ref if the
+ * folio is problematic, fail hard.
+ */
+ gup_put_folio(page_folio(page), 1,
+ foll_flags);
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ for (j = 0; j < page_increm; j++) {
+ subpage = nth_page(page, j);
+ pages[i + j] = subpage;
+ flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
+ flush_dcache_page(subpage);
+ }
+ }
+
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
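
page_increm comes from the bitwise expression 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); for a huge mapping whose page_mask is nr_subpages - 1, it counts the subpages from start up to the end of the huge page. A standalone check of that arithmetic (the 4 KiB PAGE_SHIFT and 2 MiB mask below are assumed example values):

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT	12UL

int main(void)
{
	unsigned long page_mask = 511;	/* 2 MiB THP with 4 KiB pages */
	unsigned long base = 0x40000000UL;	/* huge-page aligned */

	for (unsigned long k = 0; k < 512; k += 127) {
		unsigned long start = base + (k << PAGE_SHIFT);
		unsigned long increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);

		assert(increm == 512 - k);
		printf("subpage %3lu -> page_increm %3lu\n", k, increm);
	}
	return 0;
}
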
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f15d557e5708..154c210892a1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1980,7 +1980,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (!ptl)
return 0;
- pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
+ pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
if (vma_is_special_huge(vma)) {
spin_unlock(ptl);
@@ -2002,7 +2002,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
count_vm_event(THP_SPLIT_PUD);
- pudp_huge_clear_flush_notify(vma, haddr, pud);
+ pudp_huge_clear_flush(vma, haddr, pud);
}
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
@@ -2022,11 +2022,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
out:
spin_unlock(ptl);
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pudp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
+ mmu_notifier_invalidate_range_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -2093,7 +2089,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
count_vm_event(THP_SPLIT_PMD);
if (!vma_is_anonymous(vma)) {
- old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
@@ -2123,8 +2119,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
- * mmu_notifier_invalidate_range() see comments below inside
- * __split_huge_pmd() ?
+ * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
+ * inside __split_huge_pmd() ?
*
* We are going from a zero huge page write protected to zero
* small page also write protected so it does not seems useful
@@ -2254,7 +2250,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_mkuffd_wp(entry);
- page_add_anon_rmap(page + i, vma, addr, false);
+ page_add_anon_rmap(page + i, vma, addr, RMAP_NONE);
}
VM_BUG_ON(!pte_none(ptep_get(pte)));
set_pte_at(mm, addr, pte, entry);
@@ -2303,20 +2299,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
out:
spin_unlock(ptl);
- /*
- * No need to double call mmu_notifier->invalidate_range() callback.
- * They are 3 cases to consider inside __split_huge_pmd_locked():
- * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
- * 2) __split_huge_zero_page_pmd() read only zero page and any write
- * fault will trigger a flush_notify before pointing to a new page
- * (it is fine if the secondary mmu keeps pointing to the old zero
- * page in the meantime)
- * 3) Split a huge pmd into pte pointing to the same page. No need
- * to invalidate secondary tlb entry they are all still valid.
- * any further changes to individual pte will notify. So no need
- * to call mmu_notifier->invalidate_range()
- */
- mmu_notifier_invalidate_range_only_end(&range);
+ mmu_notifier_invalidate_range_end(&range);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
@@ -2696,8 +2679,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
gfp = current_gfp_context(mapping_gfp_mask(mapping) &
GFP_RECLAIM_MASK);
- if (folio_test_private(folio) &&
- !filemap_release_folio(folio, gfp)) {
+ if (!filemap_release_folio(folio, gfp)) {
ret = -EBUSY;
goto out;
}
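
With the _notify and _only_end variants gone, secondary MMUs are told about the PUD/PMD split exactly once, when the range ends. Condensed from the hunks above, the bracketing is now:

	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	/* ... pmdp_huge_clear_flush(vma, haddr, pmd), remap as PTEs ... */
	mmu_notifier_invalidate_range_end(&range);	/* single notification */
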
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6da626bfb52e..5f498e8025cc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -34,6 +34,7 @@
#include <linux/nospec.h>
#include <linux/delayacct.h>
#include <linux/memory.h>
+#include <linux/mm_inline.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -1811,10 +1812,10 @@ static void free_hpage_workfn(struct work_struct *work)
node = node->next;
page->mapping = NULL;
/*
- * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
- * is going to trigger because a previous call to
+ * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
+ * folio_hstate() is going to trigger because a previous call to
* remove_hugetlb_folio() will call folio_set_compound_dtor
- * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate()
+ * (folio, NULL_COMPOUND_DTOR), so do not use folio_hstate()
* directly.
*/
h = size_to_hstate(page_size(page));
@@ -4774,7 +4775,7 @@ void hugetlb_show_meminfo_node(int nid)
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
{
seq_printf(m, "HugetlbPages:\t%8lu kB\n",
- atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
+ K(atomic_long_read(&mm->hugetlb_usage)));
}
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
@@ -5055,7 +5056,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
src_vma->vm_start,
src_vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
- mmap_assert_write_locked(src);
+ vma_assert_write_locked(src_vma);
raw_write_seqcount_begin(&src->write_protect_seq);
} else {
/*
@@ -5128,15 +5129,12 @@ again:
entry = huge_pte_clear_uffd_wp(entry);
set_huge_pte_at(dst, addr, dst_pte, entry);
} else if (unlikely(is_pte_marker(entry))) {
- /* No swap on hugetlb */
- WARN_ON_ONCE(
- is_swapin_error_entry(pte_to_swp_entry(entry)));
- /*
- * We copy the pte marker only if the dst vma has
- * uffd-wp enabled.
- */
- if (userfaultfd_wp(dst_vma))
- set_huge_pte_at(dst, addr, dst_pte, entry);
+ pte_marker marker = copy_pte_marker(
+ pte_to_swp_entry(entry), dst_vma);
+
+ if (marker)
+ set_huge_pte_at(dst, addr, dst_pte,
+ make_pte_marker(marker));
} else {
entry = huge_ptep_get(src_pte);
pte_folio = page_folio(pte_page(entry));
@@ -5308,9 +5306,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
}
if (shared_pmd)
- flush_tlb_range(vma, range.start, range.end);
+ flush_hugetlb_tlb_range(vma, range.start, range.end);
else
- flush_tlb_range(vma, old_end - len, old_end);
+ flush_hugetlb_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
hugetlb_vma_unlock_write(vma);
@@ -5717,7 +5715,6 @@ retry_avoidcopy:
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
- mmu_notifier_invalidate_range(mm, range.start, range.end);
page_remove_rmap(&old_folio->page, vma, true);
hugepage_add_new_anon_rmap(new_folio, vma, haddr);
if (huge_pte_uffd_wp(pte))
@@ -5748,7 +5745,6 @@ out_release_old:
/*
* Return whether there is a pagecache page to back given address within VMA.
- * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
*/
static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
@@ -6093,6 +6089,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
+ /* TODO: Handle faults under the VMA lock */
+ if (flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
@@ -6117,14 +6119,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
entry = huge_ptep_get(ptep);
- /* PTE markers should be handled the same way as none pte */
- if (huge_pte_none_mostly(entry))
+ if (huge_pte_none_mostly(entry)) {
+ if (is_pte_marker(entry)) {
+ pte_marker marker =
+ pte_marker_get(pte_to_swp_entry(entry));
+
+ if (marker & PTE_MARKER_POISONED) {
+ ret = VM_FAULT_HWPOISON_LARGE;
+ goto out_mutex;
+ }
+ }
+
/*
+ * Other PTE markers should be handled the same way as none PTE.
+ *
* hugetlb_no_page will drop vma lock and hugetlb fault
* mutex internally, which makes us return immediately.
*/
return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
entry, flags);
+ }
ret = 0;
@@ -6280,6 +6294,25 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
int writable;
bool folio_in_pagecache = false;
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
+ ptl = huge_pte_lock(h, dst_mm, dst_pte);
+
+ /* Don't overwrite any existing PTEs (even markers) */
+ if (!huge_pte_none(huge_ptep_get(dst_pte))) {
+ spin_unlock(ptl);
+ return -EEXIST;
+ }
+
+ _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
+ set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ spin_unlock(ptl);
+ return 0;
+ }
+
if (is_continue) {
ret = -EFAULT;
folio = filemap_lock_folio(mapping, idx);
@@ -6449,39 +6482,9 @@ out_release_nounlock:
}
#endif /* CONFIG_USERFAULTFD */
-static void record_subpages(struct page *page, struct vm_area_struct *vma,
- int refs, struct page **pages)
-{
- int nr;
-
- for (nr = 0; nr < refs; nr++) {
- if (likely(pages))
- pages[nr] = nth_page(page, nr);
- }
-}
-
-static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
- unsigned int flags, pte_t *pte,
- bool *unshare)
-{
- pte_t pteval = huge_ptep_get(pte);
-
- *unshare = false;
- if (is_swap_pte(pteval))
- return true;
- if (huge_pte_write(pteval))
- return false;
- if (flags & FOLL_WRITE)
- return true;
- if (gup_must_unshare(vma, flags, pte_page(pteval))) {
- *unshare = true;
- return true;
- }
- return false;
-}
-
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+ unsigned long address, unsigned int flags,
+ unsigned int *page_mask)
{
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
@@ -6489,13 +6492,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
struct page *page = NULL;
spinlock_t *ptl;
pte_t *pte, entry;
-
- /*
- * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
- * follow_hugetlb_page().
- */
- if (WARN_ON_ONCE(flags & FOLL_PIN))
- return NULL;
+ int ret;
hugetlb_vma_lock_read(vma);
pte = hugetlb_walk(vma, haddr, huge_page_size(h));
@@ -6505,8 +6502,23 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
ptl = huge_pte_lock(h, mm, pte);
entry = huge_ptep_get(pte);
if (pte_present(entry)) {
- page = pte_page(entry) +
- ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ page = pte_page(entry);
+
+ if (!huge_pte_write(entry)) {
+ if (flags & FOLL_WRITE) {
+ page = NULL;
+ goto out;
+ }
+
+ if (gup_must_unshare(vma, flags, page)) {
+ /* Tell the caller to do unsharing */
+ page = ERR_PTR(-EMLINK);
+ goto out;
+ }
+ }
+
+ page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+
/*
* Note that page may be a sub-page, and with vmemmap
* optimizations the page struct may be read only.
@@ -6516,208 +6528,29 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
* try_grab_page() should always be able to get the page here,
* because we hold the ptl lock and have verified pte_present().
*/
- if (try_grab_page(page, flags)) {
- page = NULL;
+ ret = try_grab_page(page, flags);
+
+ if (WARN_ON_ONCE(ret)) {
+ page = ERR_PTR(ret);
goto out;
}
+
+ *page_mask = (1U << huge_page_order(h)) - 1;
}
out:
spin_unlock(ptl);
out_unlock:
hugetlb_vma_unlock_read(vma);
- return page;
-}
-
-long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, unsigned long *position,
- unsigned long *nr_pages, long i, unsigned int flags,
- int *locked)
-{
- unsigned long pfn_offset;
- unsigned long vaddr = *position;
- unsigned long remainder = *nr_pages;
- struct hstate *h = hstate_vma(vma);
- int err = -EFAULT, refs;
-
- while (vaddr < vma->vm_end && remainder) {
- pte_t *pte;
- spinlock_t *ptl = NULL;
- bool unshare = false;
- int absent;
- struct page *page;
-
- /*
- * If we have a pending SIGKILL, don't keep faulting pages and
- * potentially allocating memory.
- */
- if (fatal_signal_pending(current)) {
- remainder = 0;
- break;
- }
-
- hugetlb_vma_lock_read(vma);
- /*
- * Some archs (sparc64, sh*) have multiple pte_ts to
- * each hugepage. We have to make sure we get the
- * first, for the page indexing below to work.
- *
- * Note that page table lock is not held when pte is null.
- */
- pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
- huge_page_size(h));
- if (pte)
- ptl = huge_pte_lock(h, mm, pte);
- absent = !pte || huge_pte_none(huge_ptep_get(pte));
-
- /*
- * When coredumping, it suits get_dump_page if we just return
- * an error where there's an empty slot with no huge pagecache
- * to back it. This way, we avoid allocating a hugepage, and
- * the sparse dumpfile avoids allocating disk blocks, but its
- * huge holes still show up with zeroes where they need to be.
- */
- if (absent && (flags & FOLL_DUMP) &&
- !hugetlbfs_pagecache_present(h, vma, vaddr)) {
- if (pte)
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- remainder = 0;
- break;
- }
-
- /*
- * We need call hugetlb_fault for both hugepages under migration
- * (in which case hugetlb_fault waits for the migration,) and
- * hwpoisoned hugepages (in which case we need to prevent the
- * caller from accessing to them.) In order to do this, we use
- * here is_swap_pte instead of is_hugetlb_entry_migration and
- * is_hugetlb_entry_hwpoisoned. This is because it simply covers
- * both cases, and because we can't follow correct pages
- * directly from any kind of swap entries.
- */
- if (absent ||
- __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
- vm_fault_t ret;
- unsigned int fault_flags = 0;
-
- if (pte)
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- if (flags & FOLL_WRITE)
- fault_flags |= FAULT_FLAG_WRITE;
- else if (unshare)
- fault_flags |= FAULT_FLAG_UNSHARE;
- if (locked) {
- fault_flags |= FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_KILLABLE;
- if (flags & FOLL_INTERRUPTIBLE)
- fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
- }
- if (flags & FOLL_NOWAIT)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_RETRY_NOWAIT;
- if (flags & FOLL_TRIED) {
- /*
- * Note: FAULT_FLAG_ALLOW_RETRY and
- * FAULT_FLAG_TRIED can co-exist
- */
- fault_flags |= FAULT_FLAG_TRIED;
- }
- ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
- if (ret & VM_FAULT_ERROR) {
- err = vm_fault_to_errno(ret, flags);
- remainder = 0;
- break;
- }
- if (ret & VM_FAULT_RETRY) {
- if (locked &&
- !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
- *locked = 0;
- *nr_pages = 0;
- /*
- * VM_FAULT_RETRY must not return an
- * error, it will return zero
- * instead.
- *
- * No need to update "position" as the
- * caller will not check it after
- * *nr_pages is set to 0.
- */
- return i;
- }
- continue;
- }
-
- pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
- page = pte_page(huge_ptep_get(pte));
-
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
-
- /*
- * If subpage information not requested, update counters
- * and skip the same_page loop below.
- */
- if (!pages && !pfn_offset &&
- (vaddr + huge_page_size(h) < vma->vm_end) &&
- (remainder >= pages_per_huge_page(h))) {
- vaddr += huge_page_size(h);
- remainder -= pages_per_huge_page(h);
- i += pages_per_huge_page(h);
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- continue;
- }
-
- /* vaddr may not be aligned to PAGE_SIZE */
- refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
- (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
-
- if (pages)
- record_subpages(nth_page(page, pfn_offset),
- vma, refs,
- likely(pages) ? pages + i : NULL);
-
- if (pages) {
- /*
- * try_grab_folio() should always succeed here,
- * because: a) we hold the ptl lock, and b) we've just
- * checked that the huge page is present in the page
- * tables. If the huge page is present, then the tail
- * pages must also be present. The ptl prevents the
- * head page and tail pages from being rearranged in
- * any way. As this is hugetlb, the pages will never
- * be p2pdma or not longterm pinnable. So this page
- * must be available at this point, unless the page
- * refcount overflowed:
- */
- if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
- flags))) {
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- remainder = 0;
- err = -ENOMEM;
- break;
- }
- }
-
- vaddr += (refs << PAGE_SHIFT);
- remainder -= refs;
- i += refs;
-
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- }
- *nr_pages = remainder;
/*
- * setting position is actually required only if remainder is
- * not zero but it's faster not to add a "if (remainder)"
- * branch.
+ * Fixup retval for dump requests: if pagecache doesn't exist,
+ * don't try to allocate a new page but just skip it.
*/
- *position = vaddr;
+ if (!page && (flags & FOLL_DUMP) &&
+ !hugetlbfs_pagecache_present(h, vma, address))
+ page = ERR_PTR(-EFAULT);
- return i ? i : err;
+ return page;
}
long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -6849,8 +6682,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
else
flush_hugetlb_tlb_range(vma, start, end);
/*
- * No need to call mmu_notifier_invalidate_range() we are downgrading
- * page table protection not changing it to point to a new page.
+ * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
+ * downgrading page table protection not changing it to point to a new
+ * page.
*
* See Documentation/mm/mmu_notifier.rst
*/
@@ -7494,7 +7328,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
/*
- * No need to call mmu_notifier_invalidate_range(), see
+ * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
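
hugetlb_follow_page_mask() now reports the mapping's span through *page_mask as (1U << huge_page_order(h)) - 1. A standalone illustration of what that evaluates to (assuming 4 KiB base pages):

#include <stdio.h>

int main(void)
{
	unsigned int orders[] = { 9, 18 };	/* 2 MiB and 1 GiB huge pages */

	for (int i = 0; i < 2; i++) {
		unsigned int mask = (1U << orders[i]) - 1;

		printf("order %2u -> page_mask %6u (%u base pages per entry)\n",
		       orders[i], mask, mask + 1);
	}
	return 0;
}
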
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index c2007ef5e9b0..4b9734777f69 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -36,14 +36,22 @@ struct vmemmap_remap_walk {
struct list_head *vmemmap_pages;
};
-static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
pmd_t __pmd;
int i;
unsigned long addr = start;
- struct page *page = pmd_page(*pmd);
- pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
+ struct page *head;
+ pte_t *pgtable;
+
+ spin_lock(&init_mm.page_table_lock);
+ head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
+ spin_unlock(&init_mm.page_table_lock);
+ if (!head)
+ return 0;
+
+ pgtable = pte_alloc_one_kernel(&init_mm);
if (!pgtable)
return -ENOMEM;
@@ -53,7 +61,7 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
pte_t entry, *pte;
pgprot_t pgprot = PAGE_KERNEL;
- entry = mk_pte(page + i, pgprot);
+ entry = mk_pte(head + i, pgprot);
pte = pte_offset_kernel(&__pmd, addr);
set_pte_at(&init_mm, addr, pte, entry);
}
@@ -65,8 +73,8 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
* be treated as independent small pages (as they can be freed
* individually).
*/
- if (!PageReserved(page))
- split_page(page, get_order(PMD_SIZE));
+ if (!PageReserved(head))
+ split_page(head, get_order(PMD_SIZE));
/* Make pte visible before pmd. See comment in pmd_install(). */
smp_wmb();
@@ -80,20 +88,6 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
return 0;
}
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
-{
- int leaf;
-
- spin_lock(&init_mm.page_table_lock);
- leaf = pmd_leaf(*pmd);
- spin_unlock(&init_mm.page_table_lock);
-
- if (!leaf)
- return 0;
-
- return __split_vmemmap_huge_pmd(pmd, start);
-}
-
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end,
struct vmemmap_remap_walk *walk)
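
The wrapper/worker split is gone: split_vmemmap_huge_pmd() itself samples the PMD under init_mm's page-table lock and bails out if it is no longer a leaf, so the decision is never made on a stale read taken outside the function. The guard, condensed from the hunk above (kernel-context fragment):

	struct page *head;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	spin_unlock(&init_mm.page_table_lock);
	if (!head)
		return 0;	/* already split; nothing left to do */
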
diff --git a/mm/init-mm.c b/mm/init-mm.c
index efa97b57acfd..cfd367822cdd 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -17,6 +17,8 @@
#define INIT_MM_CONTEXT(name)
#endif
+const struct vm_operations_struct vma_dummy_vm_ops;
+
/*
* For dynamically allocated mm_structs, there is a dynamically sized cpumask
* at the end of the structure, the size of which depends on the maximum CPU
diff --git a/mm/internal.h b/mm/internal.h
index 8ed127c1c808..d99ffb473f90 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,6 +62,12 @@ void page_writeback_init(void);
#define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1)
/*
+ * Flags passed to __show_mem() and show_free_areas() to suppress output in
+ * various contexts.
+ */
+#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
+
+/*
* How many individual pages have an elevated _mapcount. Excludes
* the folio's entire_mapcount.
*/
@@ -103,7 +109,7 @@ bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
void folio_activate(struct folio *folio);
-void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long floor,
unsigned long ceiling, bool mm_wr_locked);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
@@ -170,6 +176,17 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
+/*
+ * Return true if a folio needs ->release_folio() calling upon it.
+ */
+static inline bool folio_needs_release(struct folio *folio)
+{
+ struct address_space *mapping = folio_mapping(folio);
+
+ return folio_has_private(folio) ||
+ (mapping && mapping_release_always(mapping));
+}
+
extern unsigned long highest_memmap_pfn;
/*
@@ -689,7 +706,7 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
if (fault_flag_allow_retry_first(flags) &&
!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
fpin = get_file(vmf->vma->vm_file);
- mmap_read_unlock(vmf->vma->vm_mm);
+ release_fault_lock(vmf);
}
return fpin;
}
@@ -1022,6 +1039,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
}
extern bool mirrored_kernelcore;
+extern bool memblock_has_mirror(void);
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
{
@@ -1041,21 +1059,39 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
return !(vma->vm_flags & VM_SOFTDIRTY);
}
+static inline void vma_iter_config(struct vma_iterator *vmi,
+ unsigned long index, unsigned long last)
+{
+ MAS_BUG_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+ (vmi->mas.index > index || vmi->mas.last < index));
+ __mas_set_range(&vmi->mas, index, last - 1);
+}
+
/*
* VMA Iterator functions shared between nommu and mmap
*/
-static inline int vma_iter_prealloc(struct vma_iterator *vmi)
+static inline int vma_iter_prealloc(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
{
- return mas_preallocate(&vmi->mas, GFP_KERNEL);
+ return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
}
-static inline void vma_iter_clear(struct vma_iterator *vmi,
- unsigned long start, unsigned long end)
+static inline void vma_iter_clear(struct vma_iterator *vmi)
{
- mas_set_range(&vmi->mas, start, end - 1);
mas_store_prealloc(&vmi->mas, NULL);
}
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+ unsigned long start, unsigned long end, gfp_t gfp)
+{
+ __mas_set_range(&vmi->mas, start, end - 1);
+ mas_store_gfp(&vmi->mas, NULL, gfp);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
+
+ return 0;
+}
+
static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
{
return mas_walk(&vmi->mas);
@@ -1085,8 +1121,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
vma_iter_invalidate(vmi);
- vmi->mas.index = vma->vm_start;
- vmi->mas.last = vma->vm_end - 1;
+ __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
mas_store_prealloc(&vmi->mas, vma);
}
@@ -1097,8 +1132,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
vma_iter_invalidate(vmi);
- vmi->mas.index = vma->vm_start;
- vmi->mas.last = vma->vm_end - 1;
+ __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
mas_store_gfp(&vmi->mas, vma, gfp);
if (unlikely(mas_is_err(&vmi->mas)))
return -ENOMEM;
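
The VMA iterator helpers now take the range and the VMA up front, so a writer configures the range, preallocates against the VMA being stored, and only then stores. A hypothetical caller sketch (mm and vma assumed in scope; error handling trimmed):

	VMA_ITERATOR(vmi, mm, vma->vm_start);

	vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
	if (vma_iter_prealloc(&vmi, vma))
		return -ENOMEM;
	vma_iter_store(&vmi, vma);
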
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 8652426282cc..3e049dfb28bd 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -10,14 +10,19 @@
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/export.h>
+#include <linux/ioremap.h>
-void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
- unsigned long prot)
+void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size,
+ pgprot_t prot)
{
unsigned long offset, vaddr;
phys_addr_t last_addr;
struct vm_struct *area;
+ /* An early platform driver might end up here */
+ if (WARN_ON_ONCE(!slab_is_available()))
+ return NULL;
+
/* Disallow wrap-around or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
@@ -28,34 +33,42 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
phys_addr -= offset;
size = PAGE_ALIGN(size + offset);
- if (!ioremap_allowed(phys_addr, size, prot))
- return NULL;
-
- area = get_vm_area_caller(size, VM_IOREMAP,
- __builtin_return_address(0));
+ area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START,
+ IOREMAP_END, __builtin_return_address(0));
if (!area)
return NULL;
vaddr = (unsigned long)area->addr;
area->phys_addr = phys_addr;
- if (ioremap_page_range(vaddr, vaddr + size, phys_addr,
- __pgprot(prot))) {
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
free_vm_area(area);
return NULL;
}
return (void __iomem *)(vaddr + offset);
}
+
+#ifndef ioremap_prot
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
+{
+ return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
+}
EXPORT_SYMBOL(ioremap_prot);
+#endif
-void iounmap(volatile void __iomem *addr)
+void generic_iounmap(volatile void __iomem *addr)
{
void *vaddr = (void *)((unsigned long)addr & PAGE_MASK);
- if (!iounmap_allowed(vaddr))
- return;
-
- if (is_vmalloc_addr(vaddr))
+ if (is_ioremap_addr(vaddr))
vunmap(vaddr);
}
+
+#ifndef iounmap
+void iounmap(volatile void __iomem *addr)
+{
+ generic_iounmap(addr);
+}
EXPORT_SYMBOL(iounmap);
+#endif
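
Because the generic implementations are now ifndef-guarded, an architecture can keep its own entry points while reusing the common code. A sketch of how an arch might hook in (the #define would normally live in its asm/io.h so it is visible when mm/ioremap.c is built):

/* in the arch's <asm/io.h>:   #define ioremap_prot ioremap_prot   */

void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
			   unsigned long prot)
{
	/* arch-specific checks (e.g. refusing RAM ranges) could go here */
	return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
}
EXPORT_SYMBOL(ioremap_prot);
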
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index dad3c0eb70a0..96fd0411f5c5 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -116,7 +116,15 @@ EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
* backing pages (in __kfence_pool).
*/
static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
-struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+struct kfence_metadata *kfence_metadata __read_mostly;
+
+/*
+ * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache().
+ * So introduce kfence_metadata_init to initialize metadata, and then make
+ * kfence_metadata visible after initialization is successful. This prevents
+ * potential UAF or access to uninitialized metadata.
+ */
+static struct kfence_metadata *kfence_metadata_init __read_mostly;
/* Freelist with available objects. */
static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
@@ -591,7 +599,7 @@ static unsigned long kfence_init_pool(void)
__folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG
- slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
+ slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg |
MEMCG_DATA_OBJCGS;
#endif
}
@@ -610,7 +618,7 @@ static unsigned long kfence_init_pool(void)
}
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
- struct kfence_metadata *meta = &kfence_metadata[i];
+ struct kfence_metadata *meta = &kfence_metadata_init[i];
/* Initialize metadata. */
INIT_LIST_HEAD(&meta->list);
@@ -626,6 +634,12 @@ static unsigned long kfence_init_pool(void)
addr += 2 * PAGE_SIZE;
}
+ /*
+ * Make kfence_metadata visible only when initialization is successful.
+ * Otherwise, if the initialization fails and kfence_metadata is freed,
+ * it may cause UAF in kfence_shutdown_cache().
+ */
+ smp_store_release(&kfence_metadata, kfence_metadata_init);
return 0;
reset_slab:
@@ -672,26 +686,10 @@ static bool __init kfence_init_pool_early(void)
*/
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
- return false;
-}
-
-static bool kfence_init_pool_late(void)
-{
- unsigned long addr, free_size;
- addr = kfence_init_pool();
-
- if (!addr)
- return true;
+ memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE);
+ kfence_metadata_init = NULL;
- /* Same as above. */
- free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
-#ifdef CONFIG_CONTIG_ALLOC
- free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE);
-#else
- free_pages_exact((void *)addr, free_size);
-#endif
- __kfence_pool = NULL;
return false;
}
@@ -841,19 +839,30 @@ static void toggle_allocation_gate(struct work_struct *work)
/* === Public interface ===================================================== */
-void __init kfence_alloc_pool(void)
+void __init kfence_alloc_pool_and_metadata(void)
{
if (!kfence_sample_interval)
return;
- /* if the pool has already been initialized by arch, skip the below. */
- if (__kfence_pool)
- return;
-
- __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
-
+ /*
+ * If the pool has already been initialized by arch, there is no need to
+ * re-allocate the memory pool.
+ */
if (!__kfence_pool)
+ __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+
+ if (!__kfence_pool) {
pr_err("failed to allocate pool\n");
+ return;
+ }
+
+ /* The memory allocated by memblock has been zeroed out. */
+ kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE);
+ if (!kfence_metadata_init) {
+ pr_err("failed to allocate metadata\n");
+ memblock_free(__kfence_pool, KFENCE_POOL_SIZE);
+ __kfence_pool = NULL;
+ }
}
static void kfence_init_enable(void)
@@ -895,33 +904,69 @@ void __init kfence_init(void)
static int kfence_init_late(void)
{
- const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE;
+ const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE;
+ const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE;
+ unsigned long addr = (unsigned long)__kfence_pool;
+ unsigned long free_size = KFENCE_POOL_SIZE;
+ int err = -ENOMEM;
+
#ifdef CONFIG_CONTIG_ALLOC
struct page *pages;
- pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL);
+ pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node,
+ NULL);
if (!pages)
return -ENOMEM;
+
__kfence_pool = page_to_virt(pages);
+ pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node,
+ NULL);
+ if (pages)
+ kfence_metadata_init = page_to_virt(pages);
#else
- if (nr_pages > MAX_ORDER_NR_PAGES) {
+ if (nr_pages_pool > MAX_ORDER_NR_PAGES ||
+ nr_pages_meta > MAX_ORDER_NR_PAGES) {
pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
return -EINVAL;
}
+
__kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL);
if (!__kfence_pool)
return -ENOMEM;
+
+ kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL);
#endif
- if (!kfence_init_pool_late()) {
- pr_err("%s failed\n", __func__);
- return -EBUSY;
+ if (!kfence_metadata_init)
+ goto free_pool;
+
+ memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE);
+ addr = kfence_init_pool();
+ if (!addr) {
+ kfence_init_enable();
+ kfence_debugfs_init();
+ return 0;
}
- kfence_init_enable();
- kfence_debugfs_init();
+ pr_err("%s failed\n", __func__);
+ free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
+ err = -EBUSY;
- return 0;
+#ifdef CONFIG_CONTIG_ALLOC
+ free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)),
+ nr_pages_meta);
+free_pool:
+ free_contig_range(page_to_pfn(virt_to_page((void *)addr)),
+ free_size / PAGE_SIZE);
+#else
+ free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE);
+free_pool:
+ free_pages_exact((void *)addr, free_size);
+#endif
+
+ kfence_metadata_init = NULL;
+ __kfence_pool = NULL;
+ return err;
}
static int kfence_enable_late(void)
@@ -941,6 +986,10 @@ void kfence_shutdown_cache(struct kmem_cache *s)
struct kfence_metadata *meta;
int i;
+ /* Pairs with release in kfence_init_pool(). */
+ if (!smp_load_acquire(&kfence_metadata))
+ return;
+
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
bool in_use;
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 392fb273e7bd..f46fbb03062b 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -102,7 +102,10 @@ struct kfence_metadata {
#endif
};
-extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+#define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \
+ CONFIG_KFENCE_NUM_OBJECTS)
+
+extern struct kfence_metadata *kfence_metadata;
static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
{
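
Because the metadata array is now allocated dynamically, its pointer is published with a release store and consumed with an acquire load, so a reader that observes the pointer also observes the fully initialised slots behind it. Condensed from the core.c hunks above:

	/* writer, once every metadata slot has been initialised: */
	smp_store_release(&kfence_metadata, kfence_metadata_init);

	/* reader, e.g. kfence_shutdown_cache(): */
	if (!smp_load_acquire(&kfence_metadata))
		return;		/* KFENCE never finished initialising */
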
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 78c8d5d8b628..9a6e0d507759 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -19,6 +19,7 @@
#include <linux/page_table_check.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
+#include <linux/ksm.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -92,8 +93,6 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __read_mostly;
-#define MAX_PTE_MAPPED_THP 8
-
struct collapse_control {
bool is_khugepaged;
@@ -107,15 +106,9 @@ struct collapse_control {
/**
* struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
* @slot: hash lookup from mm to mm_slot
- * @nr_pte_mapped_thp: number of pte mapped THP
- * @pte_mapped_thp: address array corresponding pte mapped THP
*/
struct khugepaged_mm_slot {
struct mm_slot slot;
-
- /* pte-mapped THP in this mm */
- int nr_pte_mapped_thp;
- unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};
/**
@@ -709,6 +702,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte);
spin_unlock(ptl);
+ ksm_might_unmap_zero_page(vma->vm_mm, pteval);
}
} else {
src_page = pte_page(pteval);
@@ -1439,51 +1433,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
}
#ifdef CONFIG_SHMEM
-/*
- * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
- * khugepaged should try to collapse the page table.
- *
- * Note that following race exists:
- * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A,
- * emptying the A's ->pte_mapped_thp[] array.
- * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and
- * retract_page_tables() finds a VMA in mm_struct A mapping the same extent
- * (at virtual address X) and adds an entry (for X) into mm_struct A's
- * ->pte-mapped_thp[] array.
- * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X,
- * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry
- * (for X) into mm_struct A's ->pte-mapped_thp[] array.
- * Thus, it's possible the same address is added multiple times for the same
- * mm_struct. Should this happen, we'll simply attempt
- * collapse_pte_mapped_thp() multiple times for the same address, under the same
- * exclusive mmap_lock, and assuming the first call is successful, subsequent
- * attempts will return quickly (without grabbing any additional locks) when
- * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap
- * check, and since this is a rare occurrence, the cost of preventing this
- * "multiple-add" is thought to be more expensive than just handling it, should
- * it occur.
- */
-static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
-{
- struct khugepaged_mm_slot *mm_slot;
- struct mm_slot *slot;
- bool ret = false;
-
- VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
-
- spin_lock(&khugepaged_mm_lock);
- slot = mm_slot_lookup(mm_slots_hash, mm);
- mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
- if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) {
- mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
- ret = true;
- }
- spin_unlock(&khugepaged_mm_lock);
- return ret;
-}
-
-/* hpage must be locked, and mmap_lock must be held in write */
+/* hpage must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct page *hpage)
{
@@ -1495,7 +1445,7 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
};
VM_BUG_ON(!PageTransHuge(hpage));
- mmap_assert_write_locked(vma->vm_mm);
+ mmap_assert_locked(vma->vm_mm);
if (do_set_pmd(&vmf, hpage))
return SCAN_FAIL;
@@ -1504,48 +1454,6 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
return SCAN_SUCCEED;
}
-/*
- * A note about locking:
- * Trying to take the page table spinlocks would be useless here because those
- * are only used to synchronize:
- *
- * - modifying terminal entries (ones that point to a data page, not to another
- * page table)
- * - installing *new* non-terminal entries
- *
- * Instead, we need roughly the same kind of protection as free_pgtables() or
- * mm_take_all_locks() (but only for a single VMA):
- * The mmap lock together with this VMA's rmap locks covers all paths towards
- * the page table entries we're messing with here, except for hardware page
- * table walks and lockless_pages_from_mm().
- */
-static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp)
-{
- pmd_t pmd;
- struct mmu_notifier_range range;
-
- mmap_assert_write_locked(mm);
- if (vma->vm_file)
- lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
- /*
- * All anon_vmas attached to the VMA have the same root and are
- * therefore locked by the same lock.
- */
- if (vma->anon_vma)
- lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
- addr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
- pmd = pmdp_collapse_flush(vma, addr, pmdp);
- tlb_remove_table_sync_one();
- mmu_notifier_invalidate_range_end(&range);
- mm_dec_nr_ptes(mm);
- page_table_check_pte_clear_range(mm, addr, pmd);
- pte_free(mm, pmd_pgtable(pmd));
-}
-
/**
* collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
* address haddr.
@@ -1561,26 +1469,29 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
{
+ struct mmu_notifier_range range;
+ bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
- pmd_t *pmd;
- spinlock_t *ptl;
- int count = 0, result = SCAN_FAIL;
+ pmd_t *pmd, pgt_pmd;
+ spinlock_t *pml, *ptl;
+ int nr_ptes = 0, result = SCAN_FAIL;
int i;
- mmap_assert_write_locked(mm);
+ mmap_assert_locked(mm);
+
+ /* First check VMA found, in case page tables are being torn down */
+ if (!vma || !vma->vm_file ||
+ !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
+ return SCAN_VMA_CHECK;
/* Fast check before locking page if already PMD-mapped */
result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
if (result == SCAN_PMD_MAPPED)
return result;
- if (!vma || !vma->vm_file ||
- !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
- return SCAN_VMA_CHECK;
-
/*
* If we are here, we've succeeded in replacing all the native pages
* in the page cache with a single hugepage. If a mm were to fault-in
@@ -1610,41 +1521,24 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto drop_hpage;
}
+ result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
switch (result) {
case SCAN_SUCCEED:
break;
case SCAN_PMD_NONE:
/*
- * In MADV_COLLAPSE path, possible race with khugepaged where
- * all pte entries have been removed and pmd cleared. If so,
- * skip all the pte checks and just update the pmd mapping.
+ * All pte entries have been removed and pmd cleared.
+ * Skip all the pte checks and just update the pmd mapping.
*/
goto maybe_install_pmd;
default:
goto drop_hpage;
}
- /* Lock the vma before taking i_mmap and page table locks */
- vma_start_write(vma);
-
- /*
- * We need to lock the mapping so that from here on, only GUP-fast and
- * hardware page walks can access the parts of the page tables that
- * we're operating on.
- * See collapse_and_free_pmd().
- */
- i_mmap_lock_write(vma->vm_file->f_mapping);
-
- /*
- * This spinlock should be unnecessary: Nobody else should be accessing
- * the page tables under spinlock protection here, only
- * lockless_pages_from_mm() and the hardware page walker can access page
- * tables while all the high-level locks are held in write mode.
- */
result = SCAN_FAIL;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
- if (!start_pte)
- goto drop_immap;
+ if (!start_pte) /* mmap_lock + page lock should prevent this */
+ goto drop_hpage;
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
@@ -1671,10 +1565,18 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
*/
if (hpage + i != page)
goto abort;
- count++;
}
- /* step 2: adjust rmap */
+ pte_unmap_unlock(start_pte, ptl);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ notified = true;
+ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+ if (!start_pte) /* mmap_lock + page lock should prevent this */
+ goto abort;
+
+ /* step 2: clear page table and adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
struct page *page;
@@ -1682,189 +1584,160 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (pte_none(ptent))
continue;
+ /*
+ * We dropped ptl after the first scan, to do the mmu_notifier:
+ * page lock stops more PTEs of the hpage being faulted in, but
+ * does not stop write faults COWing anon copies from existing
+ * PTEs; and does not stop those being swapped out or migrated.
+ */
+ if (!pte_present(ptent)) {
+ result = SCAN_PTE_NON_PRESENT;
+ goto abort;
+ }
page = vm_normal_page(vma, addr, ptent);
- if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+ if (hpage + i != page)
goto abort;
+
+ /*
+ * Must clear entry, or a racing truncate may re-remove it.
+ * TLB flush can be left until pmdp_collapse_flush() does it.
+ * PTE dirty? Shmem page is already dirty; file is read-only.
+ */
+ ptep_clear(mm, addr, pte);
page_remove_rmap(page, vma, false);
+ nr_ptes++;
}
pte_unmap_unlock(start_pte, ptl);
/* step 3: set proper refcount and mm_counters. */
- if (count) {
- page_ref_sub(hpage, count);
- add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+ if (nr_ptes) {
+ page_ref_sub(hpage, nr_ptes);
+ add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
}
- /* step 4: remove pte entries */
- /* we make no change to anon, but protect concurrent anon page lookup */
- if (vma->anon_vma)
- anon_vma_lock_write(vma->anon_vma);
+ /* step 4: remove page table */
- collapse_and_free_pmd(mm, vma, haddr, pmd);
+ /* Huge page lock is still held, so page table must remain empty */
+ pml = pmd_lock(mm, pmd);
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
+ pmdp_get_lockless_sync();
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
- if (vma->anon_vma)
- anon_vma_unlock_write(vma->anon_vma);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ mmu_notifier_invalidate_range_end(&range);
+
+ mm_dec_nr_ptes(mm);
+ page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
+ pte_free_defer(mm, pmd_pgtable(pgt_pmd));
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
? set_huge_pmd(vma, haddr, pmd, hpage)
: SCAN_SUCCEED;
-
+ goto drop_hpage;
+abort:
+ if (nr_ptes) {
+ flush_tlb_mm(mm);
+ page_ref_sub(hpage, nr_ptes);
+ add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+ }
+ if (start_pte)
+ pte_unmap_unlock(start_pte, ptl);
+ if (notified)
+ mmu_notifier_invalidate_range_end(&range);
drop_hpage:
unlock_page(hpage);
put_page(hpage);
return result;
-
-abort:
- pte_unmap_unlock(start_pte, ptl);
-drop_immap:
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- goto drop_hpage;
-}
-
-static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
-{
- struct mm_slot *slot = &mm_slot->slot;
- struct mm_struct *mm = slot->mm;
- int i;
-
- if (likely(mm_slot->nr_pte_mapped_thp == 0))
- return;
-
- if (!mmap_write_trylock(mm))
- return;
-
- if (unlikely(hpage_collapse_test_exit(mm)))
- goto out;
-
- for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
- collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false);
-
-out:
- mm_slot->nr_pte_mapped_thp = 0;
- mmap_write_unlock(mm);
}
-static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
- struct mm_struct *target_mm,
- unsigned long target_addr, struct page *hpage,
- struct collapse_control *cc)
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
- int target_result = SCAN_FAIL;
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- int result = SCAN_FAIL;
- struct mm_struct *mm = NULL;
- unsigned long addr = 0;
- pmd_t *pmd;
- bool is_target = false;
+ struct mmu_notifier_range range;
+ struct mm_struct *mm;
+ unsigned long addr;
+ pmd_t *pmd, pgt_pmd;
+ spinlock_t *pml;
+ spinlock_t *ptl;
+ bool skipped_uffd = false;
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
- * got written to. These VMAs are likely not worth investing
- * mmap_write_lock(mm) as PMD-mapping is likely to be split
- * later.
- *
- * Note that vma->anon_vma check is racy: it can be set up after
- * the check but before we took mmap_lock by the fault path.
- * But page lock would prevent establishing any new ptes of the
- * page, so we are safe.
- *
- * An alternative would be drop the check, but check that page
- * table is clear before calling pmdp_collapse_flush() under
- * ptl. It has higher chance to recover THP for the VMA, but
- * has higher cost too. It would also probably require locking
- * the anon_vma.
+ * got written to. These VMAs are likely not worth removing
+ * page tables from, as PMD-mapping is likely to be split later.
*/
- if (READ_ONCE(vma->anon_vma)) {
- result = SCAN_PAGE_ANON;
- goto next;
- }
+ if (READ_ONCE(vma->anon_vma))
+ continue;
+
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (addr & ~HPAGE_PMD_MASK ||
- vma->vm_end < addr + HPAGE_PMD_SIZE) {
- result = SCAN_VMA_CHECK;
- goto next;
- }
+ vma->vm_end < addr + HPAGE_PMD_SIZE)
+ continue;
+
mm = vma->vm_mm;
- is_target = mm == target_mm && addr == target_addr;
- result = find_pmd_or_thp_or_none(mm, addr, &pmd);
- if (result != SCAN_SUCCEED)
- goto next;
+ if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
+ continue;
+
+ if (hpage_collapse_test_exit(mm))
+ continue;
/*
- * We need exclusive mmap_lock to retract page table.
- *
- * We use trylock due to lock inversion: we need to acquire
- * mmap_lock while holding page lock. Fault path does it in
- * reverse order. Trylock is a way to avoid deadlock.
- *
- * Also, it's not MADV_COLLAPSE's job to collapse other
- * mappings - let khugepaged take care of them later.
+ * When a vma is registered with uffd-wp, we cannot recycle
+ * the page table because there may be pte markers installed.
+ * Other vmas can still have the same file mapped hugely, but
+ * skip this one: it will always be mapped in small page size
+ * for uffd-wp registered ranges.
*/
- result = SCAN_PTE_MAPPED_HUGEPAGE;
- if ((cc->is_khugepaged || is_target) &&
- mmap_write_trylock(mm)) {
- /* trylock for the same lock inversion as above */
- if (!vma_try_start_write(vma))
- goto unlock_next;
+ if (userfaultfd_wp(vma))
+ continue;
+
+ /* PTEs were notified when unmapped; but now for the PMD? */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ addr, addr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ pml = pmd_lock(mm, pmd);
+ ptl = pte_lockptr(mm, pmd);
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
- /*
- * Re-check whether we have an ->anon_vma, because
- * collapse_and_free_pmd() requires that either no
- * ->anon_vma exists or the anon_vma is locked.
- * We already checked ->anon_vma above, but that check
- * is racy because ->anon_vma can be populated under the
- * mmap lock in read mode.
- */
- if (vma->anon_vma) {
- result = SCAN_PAGE_ANON;
- goto unlock_next;
- }
- /*
- * When a vma is registered with uffd-wp, we can't
- * recycle the pmd pgtable because there can be pte
- * markers installed. Skip it only, so the rest mm/vma
- * can still have the same file mapped hugely, however
- * it'll always mapped in small page size for uffd-wp
- * registered ranges.
- */
- if (hpage_collapse_test_exit(mm)) {
- result = SCAN_ANY_PROCESS;
- goto unlock_next;
- }
- if (userfaultfd_wp(vma)) {
- result = SCAN_PTE_UFFD_WP;
- goto unlock_next;
- }
- collapse_and_free_pmd(mm, vma, addr, pmd);
- if (!cc->is_khugepaged && is_target)
- result = set_huge_pmd(vma, addr, pmd, hpage);
- else
- result = SCAN_SUCCEED;
-
-unlock_next:
- mmap_write_unlock(mm);
- goto next;
- }
/*
- * Calling context will handle target mm/addr. Otherwise, let
- * khugepaged try again later.
+ * Huge page lock is still held, so normally the page table
+ * must remain empty; and we have already skipped anon_vma
+ * and userfaultfd_wp() vmas. But since the mmap_lock is not
+ * held, it is still possible for a racing userfaultfd_ioctl()
+ * to have inserted ptes or markers. Now that we hold ptlock,
+ * repeating the anon_vma check protects from one category,
+ * and repeating the userfaultfd_wp() check from another.
*/
- if (!is_target) {
- khugepaged_add_pte_mapped_thp(mm, addr);
- continue;
+ if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
+ skipped_uffd = true;
+ } else {
+ pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
+ pmdp_get_lockless_sync();
+ }
+
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ if (!skipped_uffd) {
+ mm_dec_nr_ptes(mm);
+ page_table_check_pte_clear_range(mm, addr, pgt_pmd);
+ pte_free_defer(mm, pmd_pgtable(pgt_pmd));
}
-next:
- if (is_target)
- target_result = result;
}
- i_mmap_unlock_write(mapping);
- return target_result;
+ i_mmap_unlock_read(mapping);
}
/**
@@ -2076,8 +1949,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
goto out_unlock;
}
- if (folio_has_private(folio) &&
- !filemap_release_folio(folio, GFP_KERNEL)) {
+ if (!filemap_release_folio(folio, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
folio_putback_lru(folio);
goto out_unlock;
@@ -2259,9 +2131,11 @@ immap_locked:
/*
* Remove pte page tables, so we can re-fault the page as huge.
+ * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
*/
- result = retract_page_tables(mapping, start, mm, addr, hpage,
- cc);
+ retract_page_tables(mapping, start);
+ if (cc && !cc->is_khugepaged)
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
unlock_page(hpage);
/*
@@ -2422,16 +2296,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
{
BUILD_BUG();
}
-
-static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
-{
-}
-
-static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
-{
- return false;
-}
#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
@@ -2461,7 +2325,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
- khugepaged_collapse_pte_mapped_thps(mm_slot);
mm = slot->mm;
/*
@@ -2514,36 +2377,27 @@ skip:
khugepaged_scan.address);
mmap_read_unlock(mm);
- *result = hpage_collapse_scan_file(mm,
- khugepaged_scan.address,
- file, pgoff, cc);
mmap_locked = false;
+ *result = hpage_collapse_scan_file(mm,
+ khugepaged_scan.address, file, pgoff, cc);
fput(file);
+ if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
+ mmap_read_lock(mm);
+ if (hpage_collapse_test_exit(mm))
+ goto breakouterloop;
+ *result = collapse_pte_mapped_thp(mm,
+ khugepaged_scan.address, false);
+ if (*result == SCAN_PMD_MAPPED)
+ *result = SCAN_SUCCEED;
+ mmap_read_unlock(mm);
+ }
} else {
*result = hpage_collapse_scan_pmd(mm, vma,
- khugepaged_scan.address,
- &mmap_locked,
- cc);
+ khugepaged_scan.address, &mmap_locked, cc);
}
- switch (*result) {
- case SCAN_PTE_MAPPED_HUGEPAGE: {
- pmd_t *pmd;
-
- *result = find_pmd_or_thp_or_none(mm,
- khugepaged_scan.address,
- &pmd);
- if (*result != SCAN_SUCCEED)
- break;
- if (!khugepaged_add_pte_mapped_thp(mm,
- khugepaged_scan.address))
- break;
- } fallthrough;
- case SCAN_SUCCEED:
+
+ if (*result == SCAN_SUCCEED)
++khugepaged_pages_collapsed;
- break;
- default:
- break;
- }
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
@@ -2889,9 +2743,9 @@ handle_result:
case SCAN_PTE_MAPPED_HUGEPAGE:
BUG_ON(mmap_locked);
BUG_ON(*prev);
- mmap_write_lock(mm);
+ mmap_read_lock(mm);
result = collapse_pte_mapped_thp(mm, addr, true);
- mmap_write_unlock(mm);
+ mmap_read_unlock(mm);
goto handle_result;
/* Whitelisted set of results where continuing OK */
case SCAN_PMD_NULL:
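As an aside on the hunk above: the MADV_COLLAPSE path now calls collapse_pte_mapped_thp() under the mmap read lock rather than the write lock. A minimal userspace sketch of driving that path is below; the file path, mapping size and the fallback MADV_COLLAPSE value are assumptions for illustration, not anything fixed by this patch.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* fallback; uapi asm-generic/mman-common.h */
#endif

int main(void)
{
	/* Placeholder file: any readable file at least 2 MiB long. */
	int fd = open("/usr/bin/bash", O_RDONLY);
	size_t len = 2UL << 20;		/* one PMD-sized extent on x86-64 */
	void *addr;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	addr = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Ask the kernel to back this range with a huge page if it can. */
	if (madvise(addr, len, MADV_COLLAPSE) != 0)
		perror("madvise(MADV_COLLAPSE)");	/* EINVAL/EAGAIN are possible */
	munmap(addr, len);
	close(fd);
	return 0;
}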
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a2d34226e3c8..2918150e31bd 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -218,7 +218,7 @@ static int kmemleak_enabled = 1;
/* same as above but only for the kmemleak_free() callback */
static int kmemleak_free_enabled = 1;
/* set in the late_initcall if there were no errors */
-static int kmemleak_initialized;
+static int kmemleak_late_initialized;
/* set if a kmemleak warning was issued */
static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
@@ -610,7 +610,12 @@ static noinline depot_stack_handle_t set_track_prepare(void)
unsigned long entries[MAX_TRACE];
unsigned int nr_entries;
- if (!kmemleak_initialized)
+ /*
+ * Use object_cache to determine whether kmemleak_init() has
+ * been invoked. stack_depot_early_init() is called before
+ * kmemleak_init() in mm_core_init().
+ */
+ if (!object_cache)
return 0;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
trace_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
@@ -2052,7 +2057,7 @@ static void kmemleak_disable(void)
kmemleak_enabled = 0;
/* check whether it is too early for a kernel thread */
- if (kmemleak_initialized)
+ if (kmemleak_late_initialized)
schedule_work(&cleanup_work);
else
kmemleak_free_enabled = 0;
@@ -2117,7 +2122,7 @@ void __init kmemleak_init(void)
*/
static int __init kmemleak_late_init(void)
{
- kmemleak_initialized = 1;
+ kmemleak_late_initialized = 1;
debugfs_create_file("kmemleak", 0644, NULL, NULL, &kmemleak_fops);
@@ -2125,7 +2130,7 @@ static int __init kmemleak_late_init(void)
/*
* Some error occurred and kmemleak was disabled. There is a
* small chance that kmemleak_disable() was called immediately
- * after setting kmemleak_initialized and we may end up with
+ * after setting kmemleak_late_initialized and we may end up with
* two clean-up threads but serialized by scan_mutex.
*/
schedule_work(&cleanup_work);
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index ec0da72e65aa..5d6e2dee5692 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -117,7 +117,7 @@ void kmsan_kfree_large(const void *ptr)
page = virt_to_head_page((void *)ptr);
KMSAN_WARN_ON(ptr != page_address(page));
kmsan_internal_poison_memory((void *)ptr,
- PAGE_SIZE << compound_order(page),
+ page_size(page),
GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
@@ -339,7 +339,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
* internal KMSAN checks.
*/
while (size > 0) {
- page_offset = addr % PAGE_SIZE;
+ page_offset = offset_in_page(addr);
to_go = min(PAGE_SIZE - page_offset, (u64)size);
kmsan_handle_dma_page((void *)addr, to_go, dir);
addr += to_go;
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index b8bb95eea5e3..87318f9170f1 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -145,7 +145,7 @@ void *kmsan_get_metadata(void *address, bool is_origin)
return NULL;
if (!page_has_metadata(page))
return NULL;
- off = addr % PAGE_SIZE;
+ off = offset_in_page(addr);
return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off;
}
@@ -210,7 +210,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
return;
kmsan_enter_runtime();
kmsan_internal_poison_memory(page_address(page),
- PAGE_SIZE << compound_order(page),
+ page_size(page),
GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
@@ -281,8 +281,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end)
struct page *page;
u64 size;
- start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE);
- size = ALIGN((u64)end - (u64)start, PAGE_SIZE);
+ start = (void *)PAGE_ALIGN_DOWN((u64)start);
+ size = PAGE_ALIGN((u64)end - (u64)start);
shadow = memblock_alloc(size, PAGE_SIZE);
origin = memblock_alloc(size, PAGE_SIZE);
for (u64 addr = 0; addr < size; addr += PAGE_SIZE) {
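The kmsan conversions above are cosmetic: page_size(page) is PAGE_SIZE << compound_order(page), and offset_in_page()/PAGE_ALIGN()/PAGE_ALIGN_DOWN() expand to the same mask arithmetic as the open-coded versions. A userspace re-implementation (assuming a 4096-byte page, which is an assumption, not something the patch dictates) makes the equivalence easy to check:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL	/* assumed for the sketch; the kernel value is per-arch */
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define offset_in_page(p)	((uint64_t)(p) & ~PAGE_MASK)
#define PAGE_ALIGN(x)		(((uint64_t)(x) + PAGE_SIZE - 1) & PAGE_MASK)
#define PAGE_ALIGN_DOWN(x)	((uint64_t)(x) & PAGE_MASK)

int main(void)
{
	uint64_t addr = 0x123456789abcULL;

	/* Same results as the open-coded modulo/ALIGN arithmetic being replaced. */
	assert(offset_in_page(addr) == addr % PAGE_SIZE);
	assert(PAGE_ALIGN_DOWN(addr) == addr - addr % PAGE_SIZE);
	assert(PAGE_ALIGN(addr) == ((addr + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE);
	printf("offset=%#llx down=%#llx up=%#llx\n",
	       (unsigned long long)offset_in_page(addr),
	       (unsigned long long)PAGE_ALIGN_DOWN(addr),
	       (unsigned long long)PAGE_ALIGN(addr));
	return 0;
}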
diff --git a/mm/ksm.c b/mm/ksm.c
index d7b5b95e936e..8d6aee05421d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -242,6 +242,9 @@ static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;
+/* The number of pages scanned */
+static unsigned long ksm_pages_scanned;
+
/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;
@@ -278,6 +281,9 @@ static unsigned int zero_checksum __read_mostly;
/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;
+/* The number of zero pages which is placed by KSM */
+unsigned long ksm_zero_pages;
+
#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
@@ -448,7 +454,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
if (is_migration_entry(entry))
page = pfn_swap_entry_to_page(entry);
}
- ret = page && PageKsm(page);
+ /* return 1 if the page is a normal ksm page or KSM-placed zero page */
+ ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte);
pte_unmap_unlock(pte, ptl);
return ret;
}
@@ -1229,8 +1236,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
page_add_anon_rmap(kpage, vma, addr, RMAP_NONE);
newpte = mk_pte(kpage, vma->vm_page_prot);
} else {
- newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
- vma->vm_page_prot));
+ /*
+ * Use pte_mkdirty to mark the zero page mapped by KSM, and then
+ * we can easily track all KSM-placed zero pages by checking if
+ * the dirty bit in the zero page's PTE is set.
+ */
+ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
+ ksm_zero_pages++;
+ mm->ksm_zero_pages++;
/*
* We're replacing an anonymous page with a zero page, which is
* not anonymous. We need to do proper accounting otherwise we
@@ -2473,8 +2486,9 @@ static void ksm_do_scan(unsigned int scan_npages)
{
struct ksm_rmap_item *rmap_item;
struct page *page;
+ unsigned int npages = scan_npages;
- while (scan_npages-- && likely(!freezing(current))) {
+ while (npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
@@ -2482,6 +2496,8 @@ static void ksm_do_scan(unsigned int scan_npages)
cmp_and_merge_page(page, rmap_item);
put_page(page);
}
+
+ ksm_pages_scanned += scan_npages - npages;
}
static int ksmd_should_run(void)
@@ -3091,7 +3107,7 @@ static void wait_while_offlining(void)
#ifdef CONFIG_PROC_FS
long ksm_process_profit(struct mm_struct *mm)
{
- return mm->ksm_merging_pages * PAGE_SIZE -
+ return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE -
mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
}
#endif /* CONFIG_PROC_FS */
@@ -3322,6 +3338,13 @@ static ssize_t max_page_sharing_store(struct kobject *kobj,
}
KSM_ATTR(max_page_sharing);
+static ssize_t pages_scanned_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", ksm_pages_scanned);
+}
+KSM_ATTR_RO(pages_scanned);
+
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -3360,12 +3383,19 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
}
KSM_ATTR_RO(pages_volatile);
+static ssize_t ksm_zero_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%ld\n", ksm_zero_pages);
+}
+KSM_ATTR_RO(ksm_zero_pages);
+
static ssize_t general_profit_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
long general_profit;
- general_profit = ksm_pages_sharing * PAGE_SIZE -
+ general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE -
ksm_rmap_items * sizeof(struct ksm_rmap_item);
return sysfs_emit(buf, "%ld\n", general_profit);
@@ -3423,10 +3453,12 @@ static struct attribute *ksm_attrs[] = {
&sleep_millisecs_attr.attr,
&pages_to_scan_attr.attr,
&run_attr.attr,
+ &pages_scanned_attr.attr,
&pages_shared_attr.attr,
&pages_sharing_attr.attr,
&pages_unshared_attr.attr,
&pages_volatile_attr.attr,
+ &ksm_zero_pages_attr.attr,
&full_scans_attr.attr,
#ifdef CONFIG_NUMA
&merge_across_nodes_attr.attr,
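For reference, the two counters added above (pages_scanned and ksm_zero_pages) land under /sys/kernel/mm/ksm/ next to the existing attributes, and general_profit now folds the zero pages into (pages_sharing + ksm_zero_pages) * PAGE_SIZE - rmap_items * sizeof(struct ksm_rmap_item). A sketch of reading them from userspace, assuming sysfs is mounted at /sys:

#include <stdio.h>

/* Read one counter from /sys/kernel/mm/ksm/<name>; returns -1 on error. */
static long long read_ksm_counter(const char *name)
{
	char path[256];
	long long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%lld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	/* pages_scanned and ksm_zero_pages are the attributes added by this patch. */
	const char *names[] = {
		"pages_scanned", "ksm_zero_pages",
		"pages_sharing", "general_profit",
	};
	for (unsigned i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%-16s %lld\n", names[i], read_ksm_counter(names[i]));
	return 0;
}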
diff --git a/mm/madvise.c b/mm/madvise.c
index bfe0e06427bd..1fb2a11d77d9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -173,9 +173,8 @@ static int madvise_update_vma(struct vm_area_struct *vma,
}
success:
- /*
- * vm_flags is protected by the mmap_lock held in write mode.
- */
+ /* vm_flags is protected by the mmap_lock held in write mode. */
+ vma_start_write(vma);
vm_flags_reset(vma, new_flags);
if (!vma->vm_file || vma_is_anon_shmem(vma)) {
error = replace_anon_vma_name(vma, anon_name);
@@ -414,6 +413,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio_clear_referenced(folio);
folio_test_clear_young(folio);
+ if (folio_test_active(folio))
+ folio_set_workingset(folio);
if (pageout) {
if (folio_isolate_lru(folio)) {
if (folio_test_unevictable(folio))
@@ -511,6 +512,8 @@ regular_folio:
*/
folio_clear_referenced(folio);
folio_test_clear_young(folio);
+ if (folio_test_active(folio))
+ folio_set_workingset(folio);
if (pageout) {
if (folio_isolate_lru(folio)) {
if (folio_test_unevictable(folio))
@@ -662,7 +665,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
free_swap_and_cache(entry);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} else if (is_hwpoison_entry(entry) ||
- is_swapin_error_entry(entry)) {
+ is_poisoned_swp_entry(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
}
continue;
diff --git a/mm/memblock.c b/mm/memblock.c
index f9e61e565a53..913b2520a9a0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -161,6 +161,11 @@ static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock;
static int memblock_reserved_in_slab __initdata_memblock;
+bool __init_memblock memblock_has_mirror(void)
+{
+ return system_has_some_mirror;
+}
+
static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 315fd5f45e3c..8e125aa5a18d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -197,7 +197,7 @@ static struct move_charge_struct {
};
/*
- * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
@@ -1629,7 +1629,6 @@ static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
WARN_ON_ONCE(seq_buf_has_overflowed(s));
}
-#define K(x) ((x) << (PAGE_SHIFT-10))
/**
* mem_cgroup_print_oom_context: Print OOM information relevant to
* memory controller.
@@ -3036,21 +3035,21 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
return objcg;
}
-struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
+struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
struct obj_cgroup *objcg;
if (!memcg_kmem_online())
return NULL;
- if (PageMemcgKmem(page)) {
- objcg = __folio_objcg(page_folio(page));
+ if (folio_memcg_kmem(folio)) {
+ objcg = __folio_objcg(folio);
obj_cgroup_get(objcg);
} else {
struct mem_cgroup *memcg;
rcu_read_lock();
- memcg = __folio_memcg(page_folio(page));
+ memcg = __folio_memcg(folio);
if (memcg)
objcg = __get_obj_cgroup_from_memcg(memcg);
else
@@ -3871,10 +3870,6 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
case _MEMSWAP:
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
- case _KMEM:
- /* kmem.limit_in_bytes is deprecated. */
- ret = -EOPNOTSUPP;
- break;
case _TCP:
ret = memcg_update_tcp_max(memcg, nr_pages);
break;
@@ -5086,12 +5081,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
},
#endif
{
- .name = "kmem.limit_in_bytes",
- .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
- .write = mem_cgroup_write,
- .read_u64 = mem_cgroup_read_u64,
- },
- {
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
@@ -5165,6 +5154,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
* those references are manageable from userspace.
*/
+#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
static DEFINE_IDR(mem_cgroup_idr);
static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
@@ -5648,7 +5638,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
{
struct page *page = vm_normal_page(vma, addr, ptent);
- if (!page || !page_mapped(page))
+ if (!page)
return NULL;
if (PageAnon(page)) {
if (!(mc.flags & MOVE_ANON))
@@ -5657,8 +5647,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
if (!(mc.flags & MOVE_FILE))
return NULL;
}
- if (!get_page_unless_zero(page))
- return NULL;
+ get_page(page);
return page;
}
@@ -6698,8 +6687,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
lru_add_drain_all();
reclaimed = try_to_free_mem_cgroup_pages(memcg,
- nr_to_reclaim - nr_reclaimed,
- GFP_KERNEL, reclaim_options);
+ min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
+ GFP_KERNEL, reclaim_options);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
@@ -7789,7 +7778,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
* @objcg: the object cgroup
* @size: size of compressed object
*
- * This forces the charge after obj_cgroup_may_swap() allowed
+ * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
*/
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
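The memory_reclaim() change above only caps how much each try_to_free_mem_cgroup_pages() call bites off (SWAP_CLUSTER_MAX); the userspace interface is unchanged: write a byte count to the cgroup's memory.reclaim file. A minimal sketch, assuming a cgroup v2 hierarchy mounted at /sys/fs/cgroup and a cgroup named "test" (both assumptions):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path; point this at a cgroup you actually control. */
	const char *path = "/sys/fs/cgroup/test/memory.reclaim";
	const char *request = "67108864";	/* ask for 64 MiB to be reclaimed */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open memory.reclaim");
		return 1;
	}
	if (write(fd, request, strlen(request)) < 0) {
		/* EAGAIN means the kernel gave up before reclaiming the full amount. */
		fprintf(stderr, "reclaim: %s\n", strerror(errno));
	}
	close(fd);
	return 0;
}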
diff --git a/mm/memfd.c b/mm/memfd.c
index e763e76f1106..1cad1904fc26 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -268,11 +268,33 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
+static int check_sysctl_memfd_noexec(unsigned int *flags)
+{
+#ifdef CONFIG_SYSCTL
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ int sysctl = pidns_memfd_noexec_scope(ns);
+
+ if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
+ if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
+ *flags |= MFD_NOEXEC_SEAL;
+ else
+ *flags |= MFD_EXEC;
+ }
+
+ if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) {
+ pr_err_ratelimited(
+ "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
+ current->comm, task_pid_nr(current), sysctl);
+ return -EACCES;
+ }
+#endif
+ return 0;
+}
+
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
- char comm[TASK_COMM_LEN];
unsigned int *file_seals;
struct file *file;
int fd, error;
@@ -294,35 +316,15 @@ SYSCALL_DEFINE2(memfd_create,
return -EINVAL;
if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
-#ifdef CONFIG_SYSCTL
- int sysctl = MEMFD_NOEXEC_SCOPE_EXEC;
- struct pid_namespace *ns;
-
- ns = task_active_pid_ns(current);
- if (ns)
- sysctl = ns->memfd_noexec_scope;
-
- switch (sysctl) {
- case MEMFD_NOEXEC_SCOPE_EXEC:
- flags |= MFD_EXEC;
- break;
- case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL:
- flags |= MFD_NOEXEC_SEAL;
- break;
- default:
- pr_warn_once(
- "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n",
- task_pid_nr(current), get_task_comm(comm, current));
- return -EINVAL;
- }
-#else
- flags |= MFD_EXEC;
-#endif
- pr_warn_once(
- "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n",
- task_pid_nr(current), get_task_comm(comm, current));
+ pr_info_ratelimited(
+ "%s[%d]: memfd_create() called without MFD_EXEC or MFD_NOEXEC_SEAL set\n",
+ current->comm, task_pid_nr(current));
}
+ error = check_sysctl_memfd_noexec(&flags);
+ if (error < 0)
+ return error;
+
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
if (len <= 0)
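With the rework above, callers that pass neither MFD_EXEC nor MFD_NOEXEC_SEAL only trigger a rate-limited pr_info(), while vm.memfd_noexec=2 makes the call fail with EACCES unless MFD_NOEXEC_SEAL is given. A sketch of the explicit form; the fallback flag value is copied here in case the installed <sys/mman.h> predates it:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MFD_CLOEXEC
#define MFD_CLOEXEC	0x0001U
#endif
#ifndef MFD_NOEXEC_SEAL
#define MFD_NOEXEC_SEAL	0x0008U	/* from uapi linux/memfd.h */
#endif

int main(void)
{
	/* Explicitly ask for a non-executable, seal-on-create memfd. */
	int fd = memfd_create("scratch", MFD_CLOEXEC | MFD_NOEXEC_SEAL);

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	if (ftruncate(fd, 1 << 20) != 0)
		perror("ftruncate");
	close(fd);
	return 0;
}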
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fe121fdb05f7..55dfe8a7bf4b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -39,7 +39,6 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
-#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/dax.h>
@@ -50,7 +49,6 @@
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
-#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
@@ -59,7 +57,6 @@
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
-#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/sysctl.h>
@@ -75,13 +72,15 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
static bool hw_memory_failure __read_mostly = false;
-inline void num_poisoned_pages_inc(unsigned long pfn)
+static DEFINE_MUTEX(mf_mutex);
+
+void num_poisoned_pages_inc(unsigned long pfn)
{
atomic_long_inc(&num_poisoned_pages);
memblk_nr_poison_inc(pfn);
}
-inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+void num_poisoned_pages_sub(unsigned long pfn, long i)
{
atomic_long_sub(i, &num_poisoned_pages);
if (pfn != -1UL)
@@ -363,17 +362,14 @@ void shake_page(struct page *p)
{
if (PageHuge(p))
return;
-
- if (!PageSlab(p)) {
- lru_add_drain_all();
- if (PageLRU(p) || is_free_buddy_page(p))
- return;
- }
-
/*
* TODO: Could shrink slab caches here if a lightweight range-based
* shrinker will be available.
*/
+ if (PageSlab(p))
+ return;
+
+ lru_add_drain_all();
}
EXPORT_SYMBOL_GPL(shake_page);
@@ -614,7 +610,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
pgoff = page_to_pgoff(page);
read_lock(&tasklist_lock);
- for_each_process (tsk) {
+ for_each_process(tsk) {
struct anon_vma_chain *vmac;
struct task_struct *t = task_early_kill(tsk, force_early);
@@ -658,7 +654,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
/*
* Send early kill signal to tasks where a vma covers
* the page but the corrupted page is not necessarily
- * mapped it in its pte.
+ * mapped in its pte.
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
@@ -940,14 +936,12 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
struct folio *folio = page_folio(p);
int err = mapping->a_ops->error_remove_page(mapping, p);
- if (err != 0) {
+ if (err != 0)
pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
- } else if (folio_has_private(folio) &&
- !filemap_release_folio(folio, GFP_NOIO)) {
+ else if (!filemap_release_folio(folio, GFP_NOIO))
pr_info("%#lx: failed to release buffers\n", pfn);
- } else {
+ else
ret = MF_RECOVERED;
- }
} else {
/*
* If the file system doesn't support it just invalidate
@@ -1193,9 +1187,6 @@ static int me_huge_page(struct page_state *ps, struct page *p)
struct address_space *mapping;
bool extra_pins = false;
- if (!PageHuge(hpage))
- return MF_DELAYED;
-
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
@@ -1395,8 +1386,15 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
bool hugetlb = false;
ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
- if (hugetlb)
- return ret;
+ if (hugetlb) {
+ /* Make sure hugetlb demotion did not happen from under us. */
+ if (folio == page_folio(page))
+ return ret;
+ if (ret > 0) {
+ folio_put(folio);
+ folio = page_folio(page);
+ }
+ }
/*
* This check prevents from calling folio_try_get() for any
@@ -1485,8 +1483,13 @@ static int __get_unpoison_page(struct page *page)
bool hugetlb = false;
ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
- if (hugetlb)
- return ret;
+ if (hugetlb) {
+ /* Make sure hugetlb demotion did not happen from under us. */
+ if (folio == page_folio(page))
+ return ret;
+ if (ret > 0)
+ folio_put(folio);
+ }
/*
* PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
@@ -1814,6 +1817,7 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
#endif /* CONFIG_FS_DAX */
#ifdef CONFIG_HUGETLB_PAGE
+
/*
* Struct raw_hwp_page represents information about "raw error page",
* constructing singly linked list from ->_hugetlb_hwpoison field of folio.
@@ -1828,16 +1832,49 @@ static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
return (struct llist_head *)&folio->_hugetlb_hwpoison;
}
+bool is_raw_hwpoison_page_in_hugepage(struct page *page)
+{
+ struct llist_head *raw_hwp_head;
+ struct raw_hwp_page *p;
+ struct folio *folio = page_folio(page);
+ bool ret = false;
+
+ if (!folio_test_hwpoison(folio))
+ return false;
+
+ if (!folio_test_hugetlb(folio))
+ return PageHWPoison(page);
+
+ /*
+ * When RawHwpUnreliable is set, the kernel lost track of which subpages
+ * are HWPOISON. So return as if ALL subpages are HWPOISONed.
+ */
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
+ return true;
+
+ mutex_lock(&mf_mutex);
+
+ raw_hwp_head = raw_hwp_list_head(folio);
+ llist_for_each_entry(p, raw_hwp_head->first, node) {
+ if (page == p->page) {
+ ret = true;
+ break;
+ }
+ }
+
+ mutex_unlock(&mf_mutex);
+
+ return ret;
+}
+
static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
{
- struct llist_head *head;
- struct llist_node *t, *tnode;
+ struct llist_node *head;
+ struct raw_hwp_page *p, *next;
unsigned long count = 0;
- head = raw_hwp_list_head(folio);
- llist_for_each_safe(tnode, t, head->first) {
- struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
-
+ head = llist_del_all(raw_hwp_list_head(folio));
+ llist_for_each_entry_safe(p, next, head, node) {
if (move_flag)
SetPageHWPoison(p->page);
else
@@ -1845,7 +1882,6 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
kfree(p);
count++;
}
- llist_del_all(head);
return count;
}
@@ -1853,7 +1889,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
{
struct llist_head *head;
struct raw_hwp_page *raw_hwp;
- struct llist_node *t, *tnode;
+ struct raw_hwp_page *p, *next;
int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
/*
@@ -1864,9 +1900,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return -EHWPOISON;
head = raw_hwp_list_head(folio);
- llist_for_each_safe(tnode, t, head->first) {
- struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
-
+ llist_for_each_entry_safe(p, next, head->first, node) {
if (p->page == page)
return -EHWPOISON;
}
@@ -1917,6 +1951,8 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
{
if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return;
+ if (folio_test_hugetlb_vmemmap_optimized(folio))
+ return;
folio_clear_hwpoison(folio);
folio_free_raw_hwp(folio, true);
}
@@ -2105,12 +2141,11 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
- action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
+ if (rc != -EOPNOTSUPP)
+ action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
return rc;
}
-static DEFINE_MUTEX(mf_mutex);
-
/**
* memory_failure - Handle memory failure of a page.
* @pfn: Page Number of the corrupted page
@@ -2126,7 +2161,7 @@ static DEFINE_MUTEX(mf_mutex);
* detected by a background scrubber)
*
* Must run in process context (e.g. a work queue) with interrupts
- * enabled and no spinlocks hold.
+ * enabled and no spinlocks held.
*
* Return: 0 for successfully handled the memory error,
* -EOPNOTSUPP for hwpoison_filter() filtered the error event,
@@ -2184,8 +2219,6 @@ try_again:
goto unlock_mutex;
}
- hpage = compound_head(p);
-
/*
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
@@ -2224,13 +2257,14 @@ try_again:
}
}
+ hpage = compound_head(p);
if (PageTransHuge(hpage)) {
/*
* The flag must be set after the refcount is bumped
* otherwise it may race with THP split.
* And the flag can't be set in get_hwpoison_page() since
* it is called by soft offline too and it is just called
- * for !MF_COUNT_INCREASE. So here seems to be the best
+ * for !MF_COUNT_INCREASED. So here seems to be the best
* place.
*
* Don't need care about the above error handling paths for
@@ -2590,10 +2624,10 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
/*
* If we succeed to isolate the page, we grabbed another refcount on
- * the page, so we can safely drop the one we got from get_any_pages().
+ * the page, so we can safely drop the one we got from get_any_page().
* If we failed to isolate the page, it means that we cannot go further
* and we will return an error, so drop the reference we got from
- * get_any_pages() as well.
+ * get_any_page() as well.
*/
put_page(page);
return isolated;
@@ -2626,7 +2660,7 @@ static int soft_offline_in_use_page(struct page *page)
}
lock_page(page);
- if (!PageHuge(page))
+ if (!huge)
wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
@@ -2635,7 +2669,7 @@ static int soft_offline_in_use_page(struct page *page)
return 0;
}
- if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page))
+ if (!huge && PageLRU(page) && !PageSwapCache(page))
/*
* Try to invalidate first. This should work for
* non dirty unmapped page cache pages.
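The raw_hwp list walks above move from llist_for_each_safe() plus an explicit container_of() to llist_for_each_entry_safe(), which folds the container_of() into the iterator. A self-contained userspace approximation of that pattern follows; the struct names and the simplified iterator macro are invented for illustration, not the kernel's llist API:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Entry-based safe iterator: 'pos' is the containing struct, not the link. */
#define list_for_each_entry_safe(pos, n, head, member)				\
	for (pos = (head) ? container_of((head), __typeof__(*pos), member) : NULL; \
	     pos && (n = pos->member.next ?					\
		     container_of(pos->member.next, __typeof__(*pos), member) : NULL, 1); \
	     pos = n)

struct raw_err {			/* invented stand-in for struct raw_hwp_page */
	struct node link;
	unsigned long pfn;
};

int main(void)
{
	struct node *head = NULL;
	struct raw_err *p, *next;

	for (unsigned long pfn = 1; pfn <= 3; pfn++) {
		struct raw_err *e = malloc(sizeof(*e));
		e->pfn = pfn;
		e->link.next = head;
		head = &e->link;
	}
	/* Frees every element while walking, like __folio_free_raw_hwp(). */
	list_for_each_entry_safe(p, next, head, link) {
		printf("pfn %lu\n", p->pfn);
		free(p);
	}
	return 0;
}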
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index a516e303e304..37a4f59d9585 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -560,11 +560,11 @@ struct memory_dev_type *alloc_memory_type(int adistance)
}
EXPORT_SYMBOL_GPL(alloc_memory_type);
-void destroy_memory_type(struct memory_dev_type *memtype)
+void put_memory_type(struct memory_dev_type *memtype)
{
kref_put(&memtype->kref, release_memtype);
}
-EXPORT_SYMBOL_GPL(destroy_memory_type);
+EXPORT_SYMBOL_GPL(put_memory_type);
void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
@@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
*/
if (!node_memory_types[node].map_count) {
node_memory_types[node].memtype = NULL;
- kref_put(&memtype->kref, release_memtype);
+ put_memory_type(memtype);
}
mutex_unlock(&memory_tier_lock);
}
@@ -672,16 +672,16 @@ bool numa_demotion_enabled = false;
#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
-static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t demotion_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%s\n",
numa_demotion_enabled ? "true" : "false");
}
-static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t demotion_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
ssize_t ret;
@@ -693,8 +693,7 @@ static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
}
static struct kobj_attribute numa_demotion_enabled_attr =
- __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
- numa_demotion_enabled_store);
+ __ATTR_RW(demotion_enabled);
static struct attribute *numa_attrs[] = {
&numa_demotion_enabled_attr.attr,
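The __ATTR_RW() conversion above derives the attribute name from the demotion_enabled_show/_store prefixes, so the user-visible file stays /sys/kernel/mm/numa/demotion_enabled. A hedged sketch of toggling it (path assumed, root privileges required):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/mm/numa/demotion_enabled";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open demotion_enabled");
		return 1;
	}
	/* The store handler uses kstrtobool(), so "1"/"true"/"y" all work. */
	if (write(fd, "true", 4) != 4)
		perror("write");
	close(fd);
	return 0;
}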
diff --git a/mm/memory.c b/mm/memory.c
index 603b2f419948..4a7c8be9fe71 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,7 +77,6 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
-#include <linux/net_mm.h>
#include <trace/events/kmem.h>
@@ -361,12 +360,10 @@ void free_pgd_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
}
-void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long floor,
unsigned long ceiling, bool mm_wr_locked)
{
- MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
-
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@@ -375,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
* Note: USER_PGTABLES_CEILING may be passed as ceiling and may
* be 0. This will underflow and is okay.
*/
- next = mas_find(&mas, ceiling - 1);
+ next = mas_find(mas, ceiling - 1);
/*
* Hide vma from rmap and truncate_pagecache before freeing
@@ -396,7 +393,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
- next = mas_find(&mas, ceiling - 1);
+ next = mas_find(mas, ceiling - 1);
if (mm_wr_locked)
vma_start_write(vma);
unlink_anon_vmas(vma);
@@ -860,8 +857,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return -EBUSY;
return -ENOENT;
} else if (is_pte_marker_entry(entry)) {
- if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
- set_pte_at(dst_mm, addr, dst_pte, pte);
+ pte_marker marker = copy_pte_marker(entry, dst_vma);
+
+ if (marker)
+ set_pte_at(dst_mm, addr, dst_pte,
+ make_pte_marker(marker));
return 0;
}
if (!userfaultfd_wp(dst_vma))
@@ -1312,7 +1312,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
* Use the raw variant of the seqcount_t write API to avoid
* lockdep complaining about preemptibility.
*/
- mmap_assert_write_locked(src_mm);
+ vma_assert_write_locked(src_vma);
raw_write_seqcount_begin(&src_mm->write_protect_seq);
}
@@ -1433,8 +1433,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
tlb_remove_tlb_entry(tlb, pte, addr);
zap_install_uffd_wp_if_needed(vma, addr, pte, details,
ptent);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ ksm_might_unmap_zero_page(mm, ptent);
continue;
+ }
delay_rmap = 0;
if (!PageAnon(page)) {
@@ -1500,7 +1502,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
!zap_drop_file_uffd_wp(details))
continue;
} else if (is_hwpoison_entry(entry) ||
- is_swapin_error_entry(entry)) {
+ is_poisoned_swp_entry(entry)) {
if (!should_zap_cows(details))
continue;
} else {
@@ -1691,10 +1693,12 @@ static void unmap_single_vma(struct mmu_gather *tlb,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
- * @mt: the maple tree
+ * @mas: the maple state
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
+ * @tree_end: The maximum index to check
+ * @mm_wr_locked: lock flag
*
* Unmap all pages in the vma list.
*
@@ -1707,9 +1711,10 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr, bool mm_wr_locked)
+ unsigned long end_addr, unsigned long tree_end,
+ bool mm_wr_locked)
{
struct mmu_notifier_range range;
struct zap_details details = {
@@ -1717,7 +1722,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
/* Careful - we need to zap private pages too! */
.even_cows = true,
};
- MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
start_addr, end_addr);
@@ -1725,7 +1729,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
do {
unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
mm_wr_locked);
- } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
+ } while ((vma = mas_find(mas, tree_end - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
}
@@ -2927,10 +2931,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
{
vm_fault_t ret;
- struct page *page = vmf->page;
unsigned int old_flags = vmf->flags;
vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
@@ -2945,14 +2948,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
- lock_page(page);
- if (!page->mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ if (!folio->mapping) {
+ folio_unlock(folio);
return 0; /* retry */
}
ret |= VM_FAULT_LOCKED;
} else
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
return ret;
}
@@ -2965,20 +2968,20 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
bool dirtied;
bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
- dirtied = set_page_dirty(page);
- VM_BUG_ON_PAGE(PageAnon(page), page);
+ dirtied = folio_mark_dirty(folio);
+ VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
/*
- * Take a local copy of the address_space - page.mapping may be zeroed
- * by truncate after unlock_page(). The address_space itself remains
- * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * Take a local copy of the address_space - folio.mapping may be zeroed
+ * by truncate after folio_unlock(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on folio_unlock()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = page_rmapping(page);
- unlock_page(page);
+ mapping = folio_raw_mapping(folio);
+ folio_unlock(folio);
if (!page_mkwrite)
file_update_time(vma->vm_file);
@@ -3128,6 +3131,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
+ ksm_might_unmap_zero_page(mm, vmf->orig_pte);
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
@@ -3149,7 +3153,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* that left a window where the new PTE could be loaded into
* some TLBs while the old PTE remains in others.
*/
- ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ ptep_clear_flush(vma, vmf->address, vmf->pte);
folio_add_new_anon_rmap(new_folio, vma, vmf->address);
folio_add_lru_vma(new_folio, vma);
/*
@@ -3195,11 +3199,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
}
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above ptep_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
+ mmu_notifier_invalidate_range_end(&range);
if (new_folio)
folio_put(new_folio);
@@ -3269,6 +3269,11 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
vm_fault_t ret;
pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vmf);
if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
@@ -3279,36 +3284,42 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
return 0;
}
-static vm_fault_t wp_page_shared(struct vm_fault *vmf)
+static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = 0;
- get_page(vmf->page);
+ folio_get(folio);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
vm_fault_t tmp;
pte_unmap_unlock(vmf->pte, vmf->ptl);
- tmp = do_page_mkwrite(vmf);
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ folio_put(folio);
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
+ tmp = do_page_mkwrite(vmf, folio);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(vmf->page);
+ folio_put(folio);
return tmp;
}
tmp = finish_mkwrite_fault(vmf);
if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
- unlock_page(vmf->page);
- put_page(vmf->page);
+ folio_unlock(folio);
+ folio_put(folio);
return tmp;
}
} else {
wp_page_reuse(vmf);
- lock_page(vmf->page);
+ folio_lock(folio);
}
ret |= fault_dirty_shared_page(vmf);
- put_page(vmf->page);
+ folio_put(folio);
return ret;
}
@@ -3359,6 +3370,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+ if (vmf->page)
+ folio = page_folio(vmf->page);
+
/*
* Shared mapping: we are guaranteed to have VM_WRITE and
* FAULT_FLAG_WRITE set at this point.
@@ -3373,12 +3387,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
*/
if (!vmf->page)
return wp_pfn_shared(vmf);
- return wp_page_shared(vmf);
+ return wp_page_shared(vmf, folio);
}
- if (vmf->page)
- folio = page_folio(vmf->page);
-
/*
* Private mapping: create an exclusive anonymous page copy if reuse
* is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
@@ -3432,6 +3443,12 @@ reuse:
return 0;
}
copy:
+ if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
/*
* Ok, we need to copy. Oh, well..
*/
@@ -3495,7 +3512,7 @@ void unmap_mapping_folio(struct folio *folio)
VM_BUG_ON(!folio_test_locked(folio));
first_index = folio->index;
- last_index = folio->index + folio_nr_pages(folio) - 1;
+ last_index = folio_next_index(folio) - 1;
details.even_cows = false;
details.single_folio = folio;
@@ -3647,7 +3664,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
* none pte. Otherwise it means the pte could have changed, so retry.
*
* This should also cover the case where e.g. the pte changed
- * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
+ * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
* So is_pte_marker() check is not enough to safely drop the pte.
*/
if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
@@ -3693,8 +3710,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
/* Higher priority than uffd-wp when data corrupted */
- if (marker & PTE_MARKER_SWAPIN_ERROR)
- return VM_FAULT_SIGBUS;
+ if (marker & PTE_MARKER_POISONED)
+ return VM_FAULT_HWPOISON;
if (pte_marker_entry_uffd_wp(entry))
return pte_marker_handle_uffd_wp(vmf);
@@ -4532,6 +4549,7 @@ static inline bool should_fault_around(struct vm_fault *vmf)
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
vm_fault_t ret = 0;
+ struct folio *folio;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
@@ -4544,14 +4562,20 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
return ret;
}
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
ret |= finish_fault(vmf);
- unlock_page(vmf->page);
+ folio = page_folio(vmf->page);
+ folio_unlock(folio);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- put_page(vmf->page);
+ folio_put(folio);
return ret;
}
@@ -4560,6 +4584,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
@@ -4598,21 +4627,29 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret, tmp;
+ struct folio *folio;
+
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
+ folio = page_folio(vmf->page);
+
/*
* Check if the backing address space wants to know that the page is
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
- unlock_page(vmf->page);
- tmp = do_page_mkwrite(vmf);
+ folio_unlock(folio);
+ tmp = do_page_mkwrite(vmf, folio);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(vmf->page);
+ folio_put(folio);
return tmp;
}
}
@@ -4620,8 +4657,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(vmf->page);
- put_page(vmf->page);
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
@@ -4817,36 +4854,47 @@ out_map:
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
- if (vma_is_anonymous(vmf->vma))
+ struct vm_area_struct *vma = vmf->vma;
+ if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ if (vma->vm_ops->huge_fault) {
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+ return vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ }
return VM_FAULT_FALLBACK;
}
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
vm_fault_t ret;
- if (vma_is_anonymous(vmf->vma)) {
+ if (vma_is_anonymous(vma)) {
if (likely(!unshare) &&
- userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
+ userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf);
}
- if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- if (vmf->vma->vm_ops->huge_fault) {
- ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ if (vma->vm_ops->huge_fault) {
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+ ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
}
/* COW or write-notify handled on pte level: split pmd. */
- __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
@@ -4855,11 +4903,17 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ struct vm_area_struct *vma = vmf->vma;
/* No support for anonymous transparent PUD pages yet */
- if (vma_is_anonymous(vmf->vma))
+ if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+ if (vma->vm_ops->huge_fault) {
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+ return vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+ }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
@@ -4868,21 +4922,26 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
/* No support for anonymous transparent PUD pages yet */
- if (vma_is_anonymous(vmf->vma))
+ if (vma_is_anonymous(vma))
goto split;
- if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- if (vmf->vma->vm_ops->huge_fault) {
- ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ if (vma->vm_ops->huge_fault) {
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+ ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
}
split:
/* COW or write-notify not handled on PUD level: split pud.*/
- __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
+ __split_huge_pud(vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
return VM_FAULT_FALLBACK;
}
@@ -4980,10 +5039,10 @@ unlock:
}
/*
- * By the time we get here, we already hold the mm semaphore
- *
- * The mmap_lock may have been released depending on flags and our
- * return value. See filemap_fault() and __folio_lock_or_retry().
+ * On entry, we hold either the VMA lock or the mmap_lock
+ * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in
+ * the result, the mmap_lock is not held on exit. See filemap_fault()
+ * and __folio_lock_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
@@ -5081,7 +5140,7 @@ retry_pud:
/**
* mm_account_fault - Do page fault accounting
- *
+ * @mm: mm from which memcg should be extracted. It can be NULL.
* @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
* of perf event counters, but we'll still do the per-task accounting to
* the task who triggered this page fault.
@@ -5389,10 +5448,6 @@ retry:
if (!vma)
goto inval;
- /* Only anonymous and tcp vmas are supported for now */
- if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
- goto inval;
-
if (!vma_start_read(vma))
goto inval;
@@ -6059,19 +6114,19 @@ void __init ptlock_cache_init(void)
SLAB_PANIC, NULL);
}
-bool ptlock_alloc(struct page *page)
+bool ptlock_alloc(struct ptdesc *ptdesc)
{
spinlock_t *ptl;
ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
if (!ptl)
return false;
- page->ptl = ptl;
+ ptdesc->ptl = ptl;
return true;
}
-void ptlock_free(struct page *page)
+void ptlock_free(struct ptdesc *ptdesc)
{
- kmem_cache_free(page_ptl_cachep, page->ptl);
+ kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif
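
Editorial note: the ptlock_alloc()/ptlock_free() hunk above moves the split page-table lock from struct page to struct ptdesc. Below is a minimal userspace analog of that ownership change; the pthread mutex, the malloc'd allocation, and the struct layout are stand-ins for illustration only, not the kernel's types.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <pthread.h>

/* Userspace analog: the lock hangs off a page-table descriptor ("ptdesc"),
 * not off the page itself. */
struct ptdesc {
	pthread_mutex_t *ptl;
};

static bool ptlock_alloc(struct ptdesc *ptdesc)
{
	ptdesc->ptl = malloc(sizeof(*ptdesc->ptl));
	if (!ptdesc->ptl)
		return false;
	pthread_mutex_init(ptdesc->ptl, NULL);
	return true;
}

static void ptlock_free(struct ptdesc *ptdesc)
{
	pthread_mutex_destroy(ptdesc->ptl);
	free(ptdesc->ptl);
}

int main(void)
{
	struct ptdesc pt;

	if (!ptlock_alloc(&pt))
		return 1;
	pthread_mutex_lock(pt.ptl);
	puts("page table locked via its descriptor");
	pthread_mutex_unlock(pt.ptl);
	ptlock_free(&pt);
	return 0;
}
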
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3f231cf1b410..1b03f4ec6fd2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -41,17 +41,83 @@
#include "internal.h"
#include "shuffle.h"
+enum {
+ MEMMAP_ON_MEMORY_DISABLE = 0,
+ MEMMAP_ON_MEMORY_ENABLE,
+ MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_memmap_size(void)
+{
+ return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+static inline unsigned long memory_block_memmap_on_memory_pages(void)
+{
+ unsigned long nr_pages = PFN_UP(memory_block_memmap_size());
+
+ /*
+ * In "forced" memmap_on_memory mode, we add extra pages to align the
+ * vmemmap size to cover full pageblocks. That way, we can add memory
+ * even if the vmemmap size is not properly aligned, however, we might waste
+ * memory.
+ */
+ if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
+ return pageblock_align(nr_pages);
+ return nr_pages;
+}
+
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
/*
* memory_hotplug.memmap_on_memory parameter
*/
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+ int ret, mode;
+ bool enabled;
+
+ if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) {
+ mode = MEMMAP_ON_MEMORY_FORCE;
+ } else {
+ ret = kstrtobool(val, &enabled);
+ if (ret < 0)
+ return ret;
+ if (enabled)
+ mode = MEMMAP_ON_MEMORY_ENABLE;
+ else
+ mode = MEMMAP_ON_MEMORY_DISABLE;
+ }
+ *((int *)kp->arg) = mode;
+ if (mode == MEMMAP_ON_MEMORY_FORCE) {
+ unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
+
+ pr_info_once("Memory hotplug will waste %ld pages in each memory block\n",
+ memmap_pages - PFN_UP(memory_block_memmap_size()));
+ }
+ return 0;
+}
+
+static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
+{
+ if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE)
+ return sprintf(buffer, "force\n");
+ return param_get_bool(buffer, kp);
+}
+
+static const struct kernel_param_ops memmap_mode_ops = {
+ .set = set_memmap_mode,
+ .get = get_memmap_mode,
+};
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n"
+ "With value \"force\" it could result in memory wastage due "
+ "to memmap size limitations (Y/N/force)");
static inline bool mhp_memmap_on_memory(void)
{
- return memmap_on_memory;
+ return memmap_mode != MEMMAP_ON_MEMORY_DISABLE;
}
#else
static inline bool mhp_memmap_on_memory(void)
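
Editorial note: the set_memmap_mode()/get_memmap_mode() pair above turns a boolean module parameter into a tristate one that also accepts "force". The small userspace sketch below shows the same parsing decision; strcasecmp() is used as a stand-in for the kernel's sysfs_streq()/kstrtobool(), so this is an illustration, not the kernel implementation.

#include <stdio.h>
#include <strings.h>	/* strcasecmp() */

enum { MODE_DISABLE, MODE_ENABLE, MODE_FORCE };

static int parse_memmap_mode(const char *val, int *mode)
{
	if (!strcasecmp(val, "force"))
		*mode = MODE_FORCE;
	else if (!strcasecmp(val, "y") || !strcasecmp(val, "1") || !strcasecmp(val, "on"))
		*mode = MODE_ENABLE;
	else if (!strcasecmp(val, "n") || !strcasecmp(val, "0") || !strcasecmp(val, "off"))
		*mode = MODE_DISABLE;
	else
		return -1;	/* reject unknown strings, as kstrtobool() would */
	return 0;
}

int main(void)
{
	const char *inputs[] = { "force", "Y", "off", "bogus" };

	for (unsigned int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
		int mode = -1;
		int rc = parse_memmap_mode(inputs[i], &mode);

		printf("%-6s -> rc=%d mode=%d\n", inputs[i], rc, mode);
	}
	return 0;
}
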
@@ -1247,11 +1313,22 @@ static int online_memory_block(struct memory_block *mem, void *arg)
return device_online(&mem->dev);
}
-bool mhp_supports_memmap_on_memory(unsigned long size)
+#ifndef arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
+{
+ /*
+ * As default, we want the vmemmap to span a complete PMD such that we
+ * can map the vmemmap using a single PMD if supported by the
+ * architecture.
+ */
+ return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+}
+#endif
+
+static bool mhp_supports_memmap_on_memory(unsigned long size)
{
- unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
- unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
- unsigned long remaining_size = size - vmemmap_size;
+ unsigned long vmemmap_size = memory_block_memmap_size();
+ unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
/*
* Besides having arch support and the feature enabled at runtime, we
@@ -1279,10 +1356,28 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
* altmap as an alternative source of memory, and we do not exactly
* populate a single PMD.
*/
- return mhp_memmap_on_memory() &&
- size == memory_block_size_bytes() &&
- IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
- IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+ if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+ return false;
+
+ /*
+ * Make sure the vmemmap allocation is fully contained
+ * so that we always allocate vmemmap memory from altmap area.
+ */
+ if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE))
+ return false;
+
+ /*
+ * start pfn should be pageblock_nr_pages aligned for correctly
+ * setting migrate types
+ */
+ if (!pageblock_aligned(memmap_pages))
+ return false;
+
+ if (memmap_pages == PHYS_PFN(memory_block_size_bytes()))
+		/* Refuse blocks whose memmap would consume all of the hotplugged memory. */
+ return false;
+
+ return arch_supports_memmap_on_memory(vmemmap_size);
}
/*
@@ -1295,7 +1390,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
enum memblock_flags memblock_flags = MEMBLOCK_NONE;
- struct vmem_altmap mhp_altmap = {};
+ struct vmem_altmap mhp_altmap = {
+ .base_pfn = PHYS_PFN(res->start),
+ .end_pfn = PHYS_PFN(res->end),
+ };
struct memory_group *group = NULL;
u64 start, size;
bool new_node = false;
@@ -1339,26 +1437,29 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
* Self hosted memmap array
*/
if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
- if (!mhp_supports_memmap_on_memory(size)) {
- ret = -EINVAL;
- goto error;
+ if (mhp_supports_memmap_on_memory(size)) {
+ mhp_altmap.free = memory_block_memmap_on_memory_pages();
+ params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
+ if (!params.altmap) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap));
}
- mhp_altmap.free = PHYS_PFN(size);
- mhp_altmap.base_pfn = PHYS_PFN(start);
- params.altmap = &mhp_altmap;
+ /* fallback to not using altmap */
}
/* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size, &params);
if (ret < 0)
- goto error;
+ goto error_free;
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
- group);
+ ret = create_memory_block_devices(start, size, params.altmap, group);
if (ret) {
arch_remove_memory(start, size, NULL);
- goto error;
+ goto error_free;
}
if (new_node) {
@@ -1395,6 +1496,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
+error_free:
+ kfree(params.altmap);
error:
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
@@ -1843,6 +1946,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
do {
pfn = start_pfn;
do {
+ /*
+ * Historically we always checked for any signal and
+ * can't limit it to fatal signals without eventually
+ * breaking user space.
+ */
if (signal_pending(current)) {
ret = -EINTR;
reason = "signal backoff";
@@ -1956,12 +2064,18 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
return 0;
}
-static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+static int test_has_altmap_cb(struct memory_block *mem, void *arg)
{
+ struct memory_block **mem_ptr = (struct memory_block **)arg;
/*
- * If not set, continue with the next block.
+	 * Return the memory block if it has an altmap; a non-zero
+	 * return value stops the walk.
*/
- return mem->nr_vmemmap_pages;
+ if (mem->altmap) {
+ *mem_ptr = mem;
+ return 1;
+ }
+ return 0;
}
static int check_cpu_on_node(int nid)
@@ -2036,10 +2150,9 @@ EXPORT_SYMBOL(try_offline_node);
static int __ref try_remove_memory(u64 start, u64 size)
{
- struct vmem_altmap mhp_altmap = {};
- struct vmem_altmap *altmap = NULL;
- unsigned long nr_vmemmap_pages;
+ struct memory_block *mem;
int rc = 0, nid = NUMA_NO_NODE;
+ struct vmem_altmap *altmap = NULL;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -2061,23 +2174,20 @@ static int __ref try_remove_memory(u64 start, u64 size)
* the same granularity it was added - a single memory block.
*/
if (mhp_memmap_on_memory()) {
- nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
- get_nr_vmemmap_pages_cb);
- if (nr_vmemmap_pages) {
+ rc = walk_memory_blocks(start, size, &mem, test_has_altmap_cb);
+ if (rc) {
if (size != memory_block_size_bytes()) {
pr_warn("Refuse to remove %#llx - %#llx,"
"wrong granularity\n",
start, start + size);
return -EINVAL;
}
-
+ altmap = mem->altmap;
/*
- * Let remove_pmd_table->free_hugepage_table do the
- * right thing if we used vmem_altmap when hot-adding
- * the range.
+ * Mark altmap NULL so that we can add a debug
+ * check on memblock free.
*/
- mhp_altmap.alloc = nr_vmemmap_pages;
- altmap = &mhp_altmap;
+ mem->altmap = NULL;
}
}
@@ -2094,6 +2204,12 @@ static int __ref try_remove_memory(u64 start, u64 size)
arch_remove_memory(start, size, altmap);
+ /* Verify that all vmemmap pages have actually been freed. */
+ if (altmap) {
+ WARN(altmap->alloc, "Altmap not fully unmapped");
+ kfree(altmap);
+ }
+
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
memblock_phys_free(start, size);
memblock_remove(start, size);
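
Editorial note: to make the sizing in memory_block_memmap_on_memory_pages() concrete, here is a standalone sketch of the arithmetic with assumed constants (4 KiB pages, 128 MiB memory blocks, 512-page pageblocks). The 80-byte struct page in the second call is purely hypothetical, chosen to show the rounding waste that "force" mode accepts; the difference between the two printed page counts is that per-block waste.

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define BLOCK_BYTES     (128UL << 20)	/* assumed memory_block_size_bytes() */
#define PAGEBLOCK_PAGES 512UL		/* assumed pageblock_nr_pages */

static unsigned long div_round_up(unsigned long n, unsigned long d)
{
	return (n + d - 1) / d;		/* like PFN_UP() */
}

static void show(unsigned long struct_page_size)
{
	unsigned long memmap_bytes = (BLOCK_BYTES / PAGE_SIZE) * struct_page_size;
	unsigned long memmap_pages = div_round_up(memmap_bytes, PAGE_SIZE);
	/* pageblock_align(): round the vmemmap up to whole pageblocks */
	unsigned long forced_pages = div_round_up(memmap_pages, PAGEBLOCK_PAGES) *
				     PAGEBLOCK_PAGES;

	printf("sizeof(struct page)=%lu: memmap %lu pages, force reserves %lu (waste %lu)\n",
	       struct_page_size, memmap_pages, forced_pages,
	       forced_pages - memmap_pages);
}

int main(void)
{
	show(64);	/* typical x86-64 value: aligns exactly, no waste */
	show(80);	/* hypothetical value: force mode rounds up and wastes pages */
	return 0;
}
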
diff --git a/mm/memtest.c b/mm/memtest.c
index 57149dfee438..32f3e9dda837 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -3,9 +3,10 @@
#include <linux/types.h>
#include <linux/init.h>
#include <linux/memblock.h>
+#include <linux/seq_file.h>
-bool early_memtest_done;
-phys_addr_t early_memtest_bad_size;
+static bool early_memtest_done;
+static phys_addr_t early_memtest_bad_size;
static u64 patterns[] __initdata = {
/* The first entry has to be 0 to leave memtest with zeroed memory */
@@ -117,3 +118,20 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end)
do_one_pass(patterns[idx], start, end);
}
}
+
+void memtest_report_meminfo(struct seq_file *m)
+{
+ unsigned long early_memtest_bad_size_kb;
+
+ if (!IS_ENABLED(CONFIG_PROC_FS))
+ return;
+
+ if (!early_memtest_done)
+ return;
+
+ early_memtest_bad_size_kb = early_memtest_bad_size >> 10;
+ if (early_memtest_bad_size && !early_memtest_bad_size_kb)
+ early_memtest_bad_size_kb = 1;
+ /* When 0 is reported, it means there actually was a successful test */
+ seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb);
+}
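
Editorial note: the kB rounding in memtest_report_meminfo() above reports at least 1 kB whenever any bad bytes were found, so a non-zero result never rounds down to zero. A tiny standalone check of that rule:

#include <stdio.h>

/* Mirror of the rounding above: report at least 1 kB if any bytes were bad. */
static unsigned long bad_size_kb(unsigned long long bad_bytes)
{
	unsigned long kb = bad_bytes >> 10;

	if (bad_bytes && !kb)
		kb = 1;
	return kb;
}

int main(void)
{
	/* prints: 0 1 4 */
	printf("%lu %lu %lu\n", bad_size_kb(0), bad_size_kb(100), bad_size_kb(4096));
	return 0;
}
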
diff --git a/mm/migrate.c b/mm/migrate.c
index 24baad2571e3..e21d5a7e7447 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -773,7 +773,7 @@ recheck_buffers:
bh = head;
do {
- set_bh_page(bh, &dst->page, bh_offset(bh));
+ folio_set_bh(bh, dst, bh_offset(bh));
bh = bh->b_this_page;
} while (bh != head);
@@ -922,8 +922,7 @@ static int fallback_migrate_folio(struct address_space *mapping,
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
- if (folio_test_private(src) &&
- !filemap_release_folio(src, GFP_KERNEL))
+ if (!filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
return migrate_folio(mapping, dst, src, mode);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index d5f492356e3e..d69131adc51c 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -659,7 +659,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
if (flush) {
flush_cache_page(vma, addr, pte_pfn(orig_pte));
- ptep_clear_flush_notify(vma, addr, ptep);
+ ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, entry);
update_mmu_cache(vma, addr, ptep);
} else {
@@ -728,13 +728,22 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (is_device_private_page(newpage) ||
is_device_coherent_page(newpage)) {
- /*
- * For now only support anonymous memory migrating to
- * device private or coherent memory.
- */
if (mapping) {
- src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ struct folio *folio;
+
+ folio = page_folio(page);
+
+ /*
+ * For now only support anonymous memory migrating to
+ * device private or coherent memory.
+ *
+ * Try to get rid of swap cache if possible.
+ */
+ if (!folio_test_anon(folio) ||
+ !folio_free_swap(folio)) {
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
}
} else if (is_zone_device_page(newpage)) {
/*
@@ -755,13 +764,8 @@ static void __migrate_device_pages(unsigned long *src_pfns,
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
}
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
- * did already call it.
- */
if (notified)
- mmu_notifier_invalidate_range_only_end(&range);
+ mmu_notifier_invalidate_range_end(&range);
}
/**
diff --git a/mm/mlock.c b/mm/mlock.c
index 479e09d0994c..06bdfab83b58 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -387,6 +387,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
*/
if (newflags & VM_LOCKED)
newflags |= VM_IO;
+ vma_start_write(vma);
vm_flags_reset_once(vma, newflags);
lru_add_drain();
@@ -461,9 +462,9 @@ success:
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, populate_vma_page_range will bring it back.
*/
-
if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
/* No work to do, and mlocking twice would be wrong */
+ vma_start_write(vma);
vm_flags_reset(vma, newflags);
} else {
mlock_vma_pages_range(vma, start, end, newflags);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index a1963c3322af..50f2f34745af 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -79,7 +79,7 @@ void __init mminit_verify_pageflags_layout(void)
int shift, width;
unsigned long or_mask, add_mask;
- shift = 8 * sizeof(unsigned long);
+ shift = BITS_PER_LONG;
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
@@ -154,7 +154,6 @@ early_param("mminit_loglevel", set_mminit_loglevel);
#endif /* CONFIG_DEBUG_MEMORY_INIT */
struct kobject *mm_kobj;
-EXPORT_SYMBOL_GPL(mm_kobj);
#ifdef CONFIG_SMP
s32 vm_committed_as_batch = 32;
@@ -377,6 +376,11 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (mirrored_kernelcore) {
bool mem_below_4gb_not_mirrored = false;
+ if (!memblock_has_mirror()) {
+ pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");
+ goto out;
+ }
+
for_each_mem_region(r) {
if (memblock_is_mirror(r))
continue;
@@ -1020,7 +1024,7 @@ static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
if (!vmemmap_can_optimize(altmap, pgmap))
return pgmap_vmemmap_nr(pgmap);
- return 2 * (PAGE_SIZE / sizeof(struct page));
+ return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
}
static void __ref memmap_init_compound(struct page *head,
@@ -1105,7 +1109,6 @@ void __ref memmap_init_zone_device(struct zone *zone,
*/
static void __init adjust_zone_range_for_zone_movable(int nid,
unsigned long zone_type,
- unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zone_start_pfn,
unsigned long *zone_end_pfn)
@@ -1222,9 +1225,8 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
/* Get the start and end of the zone */
*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
- adjust_zone_range_for_zone_movable(nid, zone_type,
- node_start_pfn, node_end_pfn,
- zone_start_pfn, zone_end_pfn);
+ adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,
+ zone_start_pfn, zone_end_pfn);
/* Check that this node has pages within the zone's required range */
if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
@@ -1424,9 +1426,9 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
usemapsize = roundup(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
- usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
+ usemapsize = roundup(usemapsize, BITS_PER_LONG);
- return usemapsize / 8;
+ return usemapsize / BITS_PER_BYTE;
}
static void __ref setup_usemap(struct zone *zone)
@@ -1681,8 +1683,7 @@ static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
*
* It returns the start and end page frame of a node based on information
* provided by memblock_set_node(). If called for a node
- * with no available memory, a warning is printed and the start and end
- * PFNs will be 0.
+ * with no available memory, the start and end PFNs will be 0.
*/
void __init get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
@@ -1737,7 +1738,7 @@ static void __init free_area_init_node(int nid)
}
/* Any regular or high memory on that node ? */
-static void check_for_memory(pg_data_t *pgdat)
+static void __init check_for_memory(pg_data_t *pgdat)
{
enum zone_type zone_type;
@@ -2490,15 +2491,7 @@ void *__init alloc_large_system_hash(const char *tablename,
else
numentries <<= (PAGE_SHIFT - scale);
- /* Make sure we've got at least a 0-order allocation.. */
- if (unlikely(flags & HASH_SMALL)) {
- /* Makes no sense without HASH_EARLY */
- WARN_ON(!(flags & HASH_EARLY));
- if (!(numentries >> *_hash_shift)) {
- numentries = 1UL << *_hash_shift;
- BUG_ON(!numentries);
- }
- } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ if (unlikely((numentries * bucketsize) < PAGE_SIZE))
numentries = PAGE_SIZE / bucketsize;
}
numentries = roundup_pow_of_two(numentries);
@@ -2778,7 +2771,7 @@ void __init mm_core_init(void)
*/
page_ext_init_flatmem();
mem_debugging_and_hardening_init();
- kfence_alloc_pool();
+ kfence_alloc_pool_and_metadata();
report_meminit();
kmsan_init_shadow();
stack_depot_early_init();
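
Editorial note: the usemap_size() change above replaces the literal 8 * sizeof(unsigned long) and the /8 with BITS_PER_LONG and BITS_PER_BYTE without changing the result. A standalone sketch of the same computation follows; the pageblock constants and the 1 GiB zone are assumptions for a typical 64-bit, 4 KiB-page configuration.

#include <stdio.h>

#define BITS_PER_BYTE     8UL
#define BITS_PER_LONG     64UL
#define NR_PAGEBLOCK_BITS 4UL		/* assumed bits of flags per pageblock */
#define PAGEBLOCK_PAGES   512UL		/* assumed pageblock_nr_pages */

static unsigned long round_up_to(unsigned long x, unsigned long y)
{
	return ((x + y - 1) / y) * y;
}

static unsigned long usemap_size(unsigned long zone_pages)
{
	unsigned long bits = round_up_to(zone_pages, PAGEBLOCK_PAGES) /
			     PAGEBLOCK_PAGES * NR_PAGEBLOCK_BITS;

	/* Round the bit count up to whole longs, then convert to bytes. */
	return round_up_to(bits, BITS_PER_LONG) / BITS_PER_BYTE;
}

int main(void)
{
	unsigned long zone_pages = (1UL << 30) / 4096;	/* 1 GiB of 4 KiB pages */

	printf("usemap for %lu pages: %lu bytes\n", zone_pages, usemap_size(zone_pages));
	return 0;
}
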
diff --git a/mm/mmap.c b/mm/mmap.c
index 3937479d0e07..514ced13c65c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -76,10 +76,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
-static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
+static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
struct vm_area_struct *vma, struct vm_area_struct *prev,
struct vm_area_struct *next, unsigned long start,
- unsigned long end, bool mm_wr_locked);
+ unsigned long end, unsigned long tree_end, bool mm_wr_locked);
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
@@ -154,18 +154,6 @@ static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
return mas_prev(&vmi->mas, min);
}
-static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
- unsigned long start, unsigned long end, gfp_t gfp)
-{
- vmi->mas.index = start;
- vmi->mas.last = end - 1;
- mas_store_gfp(&vmi->mas, NULL, gfp);
- if (unlikely(mas_is_err(&vmi->mas)))
- return -ENOMEM;
-
- return 0;
-}
-
/*
* check_brk_limits() - Use platform specific check of range & verify mlock
* limits.
@@ -409,17 +397,17 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
VMA_ITERATOR(vmi, mm, 0);
struct address_space *mapping = NULL;
- if (vma_iter_prealloc(&vmi))
+ vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
- }
+ vma_start_write(vma);
vma_iter_store(&vmi, vma);
- if (mapping) {
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ i_mmap_lock_write(mapping);
__vma_link_file(vma, mapping);
i_mmap_unlock_write(mapping);
}
@@ -474,15 +462,6 @@ static inline void init_vma_prep(struct vma_prepare *vp,
*/
static inline void vma_prepare(struct vma_prepare *vp)
{
- vma_start_write(vp->vma);
- if (vp->adj_next)
- vma_start_write(vp->adj_next);
- /* vp->insert is always a newly created VMA, no need for locking */
- if (vp->remove)
- vma_start_write(vp->remove);
- if (vp->remove2)
- vma_start_write(vp->remove2);
-
if (vp->file) {
uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
@@ -597,6 +576,7 @@ again:
}
if (vp->insert && vp->file)
uprobe_mmap(vp->insert);
+ validate_mm(mm);
}
/*
@@ -615,7 +595,7 @@ static inline int dup_anon_vma(struct vm_area_struct *dst,
* anon pages imported.
*/
if (src->anon_vma && !dst->anon_vma) {
- vma_start_write(dst);
+ vma_assert_write_locked(dst);
dst->anon_vma = src->anon_vma;
return anon_vma_clone(dst, src);
}
@@ -647,10 +627,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
bool remove_next = false;
struct vma_prepare vp;
+ vma_start_write(vma);
if (next && (vma != next) && (end == next->vm_end)) {
int ret;
remove_next = true;
+ vma_start_write(next);
ret = dup_anon_vma(vma, next);
if (ret)
return ret;
@@ -663,23 +645,19 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* Only handles expanding */
VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);
- if (vma_iter_prealloc(vmi))
+ /* Note: vma iterator must be pointing to 'start' */
+ vma_iter_config(vmi, start, end);
+ if (vma_iter_prealloc(vmi, vma))
goto nomem;
vma_prepare(&vp);
vma_adjust_trans_huge(vma, start, end, 0);
- /* VMA iterator points to previous, so set to start if necessary */
- if (vma_iter_addr(vmi) != start)
- vma_iter_set(vmi, start);
-
vma->vm_start = start;
vma->vm_end = end;
vma->vm_pgoff = pgoff;
- /* Note: mas must be pointing to the expanding VMA */
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, vma->vm_mm);
- validate_mm(vma->vm_mm);
return 0;
nomem:
@@ -702,24 +680,25 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
- if (vma_iter_prealloc(vmi))
+ if (vma->vm_start < start)
+ vma_iter_config(vmi, vma->vm_start, start);
+ else
+ vma_iter_config(vmi, end, vma->vm_end);
+
+ if (vma_iter_prealloc(vmi, NULL))
return -ENOMEM;
+ vma_start_write(vma);
+
init_vma_prep(&vp, vma);
vma_prepare(&vp);
vma_adjust_trans_huge(vma, start, end, 0);
- if (vma->vm_start < start)
- vma_iter_clear(vmi, vma->vm_start, start);
-
- if (vma->vm_end > end)
- vma_iter_clear(vmi, end, vma->vm_end);
-
+ vma_iter_clear(vmi);
vma->vm_start = start;
vma->vm_end = end;
vma->vm_pgoff = pgoff;
vma_complete(&vp, vmi, vma->vm_mm);
- validate_mm(vma->vm_mm);
return 0;
}
@@ -892,7 +871,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
long adj_start = 0;
- validate_mm(mm);
/*
* We later require that vma->vm_flags == vm_flags,
* so this tests vma->vm_flags & VM_SPECIAL, too.
@@ -937,16 +915,21 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
if (!merge_prev && !merge_next)
return NULL; /* Not mergeable. */
+ if (merge_prev)
+ vma_start_write(prev);
+
res = vma = prev;
remove = remove2 = adjust = NULL;
/* Can we merge both the predecessor and the successor? */
if (merge_prev && merge_next &&
is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
+ vma_start_write(next);
remove = next; /* case 1 */
vma_end = next->vm_end;
err = dup_anon_vma(prev, next);
if (curr) { /* case 6 */
+ vma_start_write(curr);
remove = curr;
remove2 = next;
if (!next->anon_vma)
@@ -954,6 +937,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
}
} else if (merge_prev) { /* case 2 */
if (curr) {
+ vma_start_write(curr);
err = dup_anon_vma(prev, curr);
if (end == curr->vm_end) { /* case 7 */
remove = curr;
@@ -963,8 +947,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
}
}
} else { /* merge_next */
+ vma_start_write(next);
res = next;
if (prev && addr < prev->vm_end) { /* case 4 */
+ vma_start_write(prev);
vma_end = addr;
adjust = next;
adj_start = -(prev->vm_end - addr);
@@ -980,6 +966,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
vma_pgoff = next->vm_pgoff - pglen;
if (curr) { /* case 8 */
vma_pgoff = curr->vm_pgoff;
+ vma_start_write(curr);
remove = curr;
err = dup_anon_vma(next, curr);
}
@@ -990,7 +977,17 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
if (err)
return NULL;
- if (vma_iter_prealloc(vmi))
+ if (vma_start < vma->vm_start || vma_end > vma->vm_end)
+ vma_expanded = true;
+
+ if (vma_expanded) {
+ vma_iter_config(vmi, vma_start, vma_end);
+ } else {
+ vma_iter_config(vmi, adjust->vm_start + adj_start,
+ adjust->vm_end);
+ }
+
+ if (vma_iter_prealloc(vmi, vma))
return NULL;
init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
@@ -999,8 +996,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
vma_prepare(&vp);
vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
- if (vma_start < vma->vm_start || vma_end > vma->vm_end)
- vma_expanded = true;
vma->vm_start = vma_start;
vma->vm_end = vma_end;
@@ -1019,10 +1014,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
}
vma_complete(&vp, vmi, mm);
- vma_iter_free(vmi);
- validate_mm(mm);
khugepaged_enter_vma(res, vm_flags);
-
return res;
}
@@ -1197,7 +1189,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags_t vm_flags;
int pkey = 0;
- validate_mm(mm);
*populate = 0;
if (!len)
@@ -1944,7 +1935,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
@@ -1969,7 +1960,11 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Check that both stack segments have the same anon_vma? */
}
- if (mas_preallocate(&mas, GFP_KERNEL))
+ if (next)
+ mas_prev_range(&mas, address);
+
+ __mas_set_range(&mas, vma->vm_start, address - 1);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
@@ -2014,7 +2009,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
/* Overwrite old entry in mtree. */
- mas_set_range(&mas, vma->vm_start, address - 1);
mas_store_prealloc(&mas, vma);
anon_vma_interval_tree_post_update_vma(vma);
spin_unlock(&mm->page_table_lock);
@@ -2026,6 +2020,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
mas_destroy(&mas);
+ validate_mm(mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -2058,7 +2053,11 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
return -ENOMEM;
}
- if (mas_preallocate(&mas, GFP_KERNEL))
+ if (prev)
+ mas_next_range(&mas, vma->vm_start);
+
+ __mas_set_range(&mas, address, vma->vm_end - 1);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
@@ -2104,7 +2103,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
vma->vm_start = address;
vma->vm_pgoff -= grow;
/* Overwrite old entry in mtree. */
- mas_set_range(&mas, address, vma->vm_end - 1);
mas_store_prealloc(&mas, vma);
anon_vma_interval_tree_post_update_vma(vma);
spin_unlock(&mm->page_table_lock);
@@ -2116,6 +2114,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
mas_destroy(&mas);
+ validate_mm(mm);
return error;
}
@@ -2293,7 +2292,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
remove_vma(vma, false);
}
vm_unacct_memory(nr_accounted);
- validate_mm(mm);
}
/*
@@ -2301,18 +2299,20 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
*
* Called with the mm semaphore held.
*/
-static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
+static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
struct vm_area_struct *vma, struct vm_area_struct *prev,
- struct vm_area_struct *next,
- unsigned long start, unsigned long end, bool mm_wr_locked)
+ struct vm_area_struct *next, unsigned long start,
+ unsigned long end, unsigned long tree_end, bool mm_wr_locked)
{
struct mmu_gather tlb;
+ unsigned long mt_start = mas->index;
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked);
- free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked);
+ mas_set(mas, mt_start);
+ free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING,
mm_wr_locked);
tlb_finish_mmu(&tlb);
@@ -2330,8 +2330,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct *new;
int err;
- validate_mm(vma->vm_mm);
-
WARN_ON(vma->vm_start >= addr);
WARN_ON(vma->vm_end <= addr);
@@ -2345,10 +2343,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (!new)
return -ENOMEM;
- err = -ENOMEM;
- if (vma_iter_prealloc(vmi))
- goto out_free_vma;
-
if (new_below) {
new->vm_end = addr;
} else {
@@ -2356,6 +2350,11 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
+ err = -ENOMEM;
+ vma_iter_config(vmi, new->vm_start, new->vm_end);
+ if (vma_iter_prealloc(vmi, new))
+ goto out_free_vma;
+
err = vma_dup_policy(vma, new);
if (err)
goto out_free_vmi;
@@ -2370,6 +2369,9 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
+ vma_start_write(vma);
+ vma_start_write(new);
+
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
@@ -2388,7 +2390,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* Success. */
if (new_below)
vma_next(vmi);
- validate_mm(vma->vm_mm);
return 0;
out_free_mpol:
@@ -2397,7 +2398,6 @@ out_free_vmi:
vma_iter_free(vmi);
out_free_vma:
vm_area_free(new);
- validate_mm(vma->vm_mm);
return err;
}
@@ -2440,7 +2440,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long locked_vm = 0;
MA_STATE(mas_detach, &mt_detach, 0, 0);
mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
- mt_set_external_lock(&mt_detach, &mm->mmap_lock);
+ mt_on_stack(mt_detach);
/*
* If we need to split any vma, do it now to save pain later.
@@ -2461,22 +2461,17 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
goto map_count_exceeded;
- error = __split_vma(vmi, vma, start, 0);
+ error = __split_vma(vmi, vma, start, 1);
if (error)
goto start_split_failed;
-
- vma = vma_iter_load(vmi);
}
- prev = vma_prev(vmi);
- if (unlikely((!prev)))
- vma_iter_set(vmi, start);
-
/*
* Detach a range of VMAs from the mm. Using next as a temp variable as
* it is always overwritten.
*/
- for_each_vma_range(*vmi, next, end) {
+ next = vma;
+ do {
/* Does it split the end? */
if (next->vm_end > end) {
error = __split_vma(vmi, next, end, 0);
@@ -2484,7 +2479,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
goto end_split_failed;
}
vma_start_write(next);
- mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1);
+ mas_set(&mas_detach, count);
error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
if (error)
goto munmap_gather_failed;
@@ -2512,34 +2507,31 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
BUG_ON(next->vm_start < start);
BUG_ON(next->vm_start > end);
#endif
- }
-
- if (vma_iter_end(vmi) > end)
- next = vma_iter_load(vmi);
-
- if (!next)
- next = vma_next(vmi);
+ } for_each_vma_range(*vmi, next, end);
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/* Make sure no VMAs are about to be lost. */
{
- MA_STATE(test, &mt_detach, start, end - 1);
+ MA_STATE(test, &mt_detach, 0, 0);
struct vm_area_struct *vma_mas, *vma_test;
int test_count = 0;
vma_iter_set(vmi, start);
rcu_read_lock();
- vma_test = mas_find(&test, end - 1);
+ vma_test = mas_find(&test, count - 1);
for_each_vma_range(*vmi, vma_mas, end) {
BUG_ON(vma_mas != vma_test);
test_count++;
- vma_test = mas_next(&test, end - 1);
+ vma_test = mas_next(&test, count - 1);
}
rcu_read_unlock();
BUG_ON(count != test_count);
}
#endif
- vma_iter_set(vmi, start);
+
+ while (vma_iter_addr(vmi) > start)
+ vma_iter_prev_range(vmi);
+
error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
if (error)
goto clear_tree_failed;
@@ -2550,19 +2542,26 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (unlock)
mmap_write_downgrade(mm);
+ prev = vma_iter_prev_range(vmi);
+ next = vma_next(vmi);
+ if (next)
+ vma_iter_prev_range(vmi);
+
/*
* We can free page tables without write-locking mmap_lock because VMAs
* were isolated before we downgraded mmap_lock.
*/
- unmap_region(mm, &mt_detach, vma, prev, next, start, end, !unlock);
+ mas_set(&mas_detach, 1);
+ unmap_region(mm, &mas_detach, vma, prev, next, start, end, count,
+ !unlock);
/* Statistics and freeing VMAs */
- mas_set(&mas_detach, start);
+ mas_set(&mas_detach, 0);
remove_mt(mm, &mas_detach);
- __mt_destroy(&mt_detach);
validate_mm(mm);
if (unlock)
mmap_read_unlock(mm);
+ __mt_destroy(&mt_detach);
return 0;
clear_tree_failed:
@@ -2686,8 +2685,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
next = vma_next(&vmi);
prev = vma_prev(&vmi);
- if (vm_flags & VM_SPECIAL)
+ if (vm_flags & VM_SPECIAL) {
+ if (prev)
+ vma_iter_next_range(&vmi);
goto cannot_expand;
+ }
/* Attempt to expand an old mapping */
/* Check next */
@@ -2708,9 +2710,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
merge_start = prev->vm_start;
vma = prev;
vm_pgoff = prev->vm_pgoff;
+ } else if (prev) {
+ vma_iter_next_range(&vmi);
}
-
/* Actually expand, if possible */
if (vma &&
!vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
@@ -2718,9 +2721,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
goto expanded;
}
+ if (vma == prev)
+ vma_iter_set(&vmi, addr);
cannot_expand:
- if (prev)
- vma_iter_next_range(&vmi);
/*
* Determine the object being mapped and call the appropriate
@@ -2733,7 +2736,7 @@ cannot_expand:
goto unacct_error;
}
- vma_iter_set(&vmi, addr);
+ vma_iter_config(&vmi, addr, end);
vma->vm_start = addr;
vma->vm_end = end;
vm_flags_init(vma, vm_flags);
@@ -2760,7 +2763,7 @@ cannot_expand:
if (WARN_ON((addr != vma->vm_start)))
goto close_and_free_vma;
- vma_iter_set(&vmi, addr);
+ vma_iter_config(&vmi, addr, end);
/*
* If vm_flags changed after call_mmap(), we should try merge
* vma again as we may succeed this time.
@@ -2807,17 +2810,15 @@ cannot_expand:
goto close_and_free_vma;
error = -ENOMEM;
- if (vma_iter_prealloc(&vmi))
+ if (vma_iter_prealloc(&vmi, vma))
goto close_and_free_vma;
/* Lock the VMA since it is modified after insertion into VMA tree */
vma_start_write(vma);
- if (vma->vm_file)
- i_mmap_lock_write(vma->vm_file->f_mapping);
-
vma_iter_store(&vmi, vma);
mm->map_count++;
if (vma->vm_file) {
+ i_mmap_lock_write(vma->vm_file->f_mapping);
if (vma->vm_flags & VM_SHARED)
mapping_allow_writable(vma->vm_file->f_mapping);
@@ -2878,9 +2879,10 @@ unmap_and_free_vma:
fput(vma->vm_file);
vma->vm_file = NULL;
+ vma_iter_set(&vmi, vma->vm_end);
/* Undo any partial mapping done by a device driver. */
- unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start,
- vma->vm_end, true);
+ unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
+ vma->vm_end, vma->vm_end, true);
}
if (file && (vm_flags & VM_SHARED))
mapping_unmap_writable(file->f_mapping);
@@ -3050,7 +3052,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm = current->mm;
struct vma_prepare vp;
- validate_mm(mm);
/*
* Check against address space limits by the changed size
* Note: This happens *after* clearing old mappings in some code paths.
@@ -3072,9 +3073,12 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (vma && vma->vm_end == addr && !vma_policy(vma) &&
can_vma_merge_after(vma, flags, NULL, NULL,
addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
- if (vma_iter_prealloc(vmi))
+ vma_iter_config(vmi, vma->vm_start, addr + len);
+ if (vma_iter_prealloc(vmi, vma))
goto unacct_fail;
+ vma_start_write(vma);
+
init_vma_prep(&vp, vma);
vma_prepare(&vp);
vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
@@ -3087,6 +3091,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
goto out;
}
+ if (vma)
+ vma_iter_next_range(vmi);
/* create a vma struct for an anonymous mapping */
vma = vm_area_alloc(mm);
if (!vma)
@@ -3098,10 +3104,12 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma->vm_pgoff = addr >> PAGE_SHIFT;
vm_flags_init(vma, flags);
vma->vm_page_prot = vm_get_page_prot(flags);
+ vma_start_write(vma);
if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
goto mas_store_fail;
mm->map_count++;
+ validate_mm(mm);
ksm_add_vma(vma);
out:
perf_event_mmap(vma);
@@ -3110,7 +3118,6 @@ out:
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vm_flags_set(vma, VM_SOFTDIRTY);
- validate_mm(mm);
return 0;
mas_store_fail:
@@ -3200,7 +3207,7 @@ void exit_mmap(struct mm_struct *mm)
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX, false);
+ unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
mmap_read_unlock(mm);
/*
@@ -3210,7 +3217,8 @@ void exit_mmap(struct mm_struct *mm)
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
- free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
+ mas_set(&mas, vma->vm_end);
+ free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
USER_PGTABLES_CEILING, true);
tlb_finish_mmu(&tlb);
@@ -3219,6 +3227,7 @@ void exit_mmap(struct mm_struct *mm)
* enabled, without holding any MM locks besides the unreachable
* mmap_write_lock.
*/
+ mas_set(&mas, vma->vm_end);
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
@@ -3291,7 +3300,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
bool faulted_in_anon_vma = true;
VMA_ITERATOR(vmi, mm, addr);
- validate_mm(mm);
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
@@ -3345,12 +3353,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vma_start_write(new_vma);
if (vma_link(mm, new_vma))
goto out_vma_link;
*need_rmap_locks = false;
}
- validate_mm(mm);
return new_vma;
out_vma_link:
@@ -3366,7 +3372,6 @@ out_free_mempol:
out_free_vma:
vm_area_free(new_vma);
out:
- validate_mm(mm);
return NULL;
}
@@ -3503,7 +3508,6 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
- validate_mm(mm);
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
@@ -3526,12 +3530,10 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
- validate_mm(mm);
return vma;
out:
vm_area_free(vma);
- validate_mm(mm);
return ERR_PTR(ret);
}
@@ -3663,6 +3665,12 @@ int mm_take_all_locks(struct mm_struct *mm)
mutex_lock(&mm_all_locks_mutex);
+ /*
+ * vma_start_write() does not have a complement in mm_drop_all_locks()
+ * because vma_start_write() is always asymmetrical; it marks a VMA as
+ * being written to until mmap_write_unlock() or mmap_write_downgrade()
+ * is reached.
+ */
mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
@@ -3759,7 +3767,6 @@ void mm_drop_all_locks(struct mm_struct *mm)
if (vma->vm_file && vma->vm_file->f_mapping)
vm_unlock_mapping(vma->vm_file->f_mapping);
}
- vma_end_write_all(mm);
mutex_unlock(&mm_all_locks_mutex);
}
@@ -3789,7 +3796,7 @@ static int init_user_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
@@ -3810,7 +3817,7 @@ static int init_admin_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
@@ -3854,7 +3861,7 @@ static int reserve_mem_notifier(struct notifier_block *nb,
break;
case MEM_OFFLINE:
- free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
if (sysctl_user_reserve_kbytes > free_kbytes) {
init_user_reserve();
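
Editorial note: several hunks above replace the open-coded `<< (PAGE_SHIFT - 10)` with the K() helper when converting free pages to kilobytes. The two forms are the same computation, as this small sketch shows; PAGE_SHIFT of 12 is an assumption for 4 KiB pages.

#include <stdio.h>

#define PAGE_SHIFT 12			/* assumed: 4 KiB pages */
#define K(x) ((x) << (PAGE_SHIFT - 10))	/* pages -> KiB, as in the kernel macro */

int main(void)
{
	unsigned long free_pages = 123456;

	/* The open-coded shift and the K() macro produce the same value. */
	printf("%lu kB == %lu kB\n",
	       free_pages << (PAGE_SHIFT - 10), K(free_pages));
	return 0;
}
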
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 50c0dde1354f..ec3b068cbbe6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -199,7 +199,7 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
* invalidate_start/end and is colliding.
*
* The locking looks broadly like this:
- * mn_tree_invalidate_start(): mmu_interval_read_begin():
+ * mn_itree_inv_start(): mmu_interval_read_begin():
* spin_lock
* seq = READ_ONCE(interval_sub->invalidate_seq);
* seq == subs->invalidate_seq
@@ -207,7 +207,7 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
* spin_lock
* seq = ++subscriptions->invalidate_seq
* spin_unlock
- * op->invalidate_range():
+ * op->invalidate():
* user_lock
* mmu_interval_set_seq()
* interval_sub->invalidate_seq = seq
@@ -551,7 +551,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
static void
mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
- struct mmu_notifier_range *range, bool only_end)
+ struct mmu_notifier_range *range)
{
struct mmu_notifier *subscription;
int id;
@@ -559,24 +559,6 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
- /*
- * Call invalidate_range here too to avoid the need for the
- * subsystem of having to register an invalidate_range_end
- * call-back when there is invalidate_range already. Usually a
- * subsystem registers either invalidate_range_start()/end() or
- * invalidate_range(), so this will be no additional overhead
- * (besides the pointer check).
- *
- * We skip call to invalidate_range() if we know it is safe ie
- * call site use mmu_notifier_invalidate_range_only_end() which
- * is safe to do when we know that a call to invalidate_range()
- * already happen under page table lock.
- */
- if (!only_end && subscription->ops->invalidate_range)
- subscription->ops->invalidate_range(subscription,
- range->mm,
- range->start,
- range->end);
if (subscription->ops->invalidate_range_end) {
if (!mmu_notifier_range_blockable(range))
non_block_start();
@@ -589,8 +571,7 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
srcu_read_unlock(&srcu, id);
}
-void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
- bool only_end)
+void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
struct mmu_notifier_subscriptions *subscriptions =
range->mm->notifier_subscriptions;
@@ -600,12 +581,12 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
mn_itree_inv_end(subscriptions);
if (!hlist_empty(&subscriptions->list))
- mn_hlist_invalidate_end(subscriptions, range, only_end);
+ mn_hlist_invalidate_end(subscriptions, range);
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}
-void __mmu_notifier_invalidate_range(struct mm_struct *mm,
- unsigned long start, unsigned long end)
+void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
struct mmu_notifier *subscription;
int id;
@@ -614,9 +595,10 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
hlist_for_each_entry_rcu(subscription,
&mm->notifier_subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
- if (subscription->ops->invalidate_range)
- subscription->ops->invalidate_range(subscription, mm,
- start, end);
+ if (subscription->ops->arch_invalidate_secondary_tlbs)
+ subscription->ops->arch_invalidate_secondary_tlbs(
+ subscription, mm,
+ start, end);
}
srcu_read_unlock(&srcu, id);
}
@@ -635,6 +617,16 @@ int __mmu_notifier_register(struct mmu_notifier *subscription,
mmap_assert_write_locked(mm);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ /*
+	 * Subsystems should only register for arch_invalidate_secondary_tlbs() or
+ * invalidate_range_start()/end() callbacks, not both.
+ */
+ if (WARN_ON_ONCE(subscription &&
+ (subscription->ops->arch_invalidate_secondary_tlbs &&
+ (subscription->ops->invalidate_range_start ||
+ subscription->ops->invalidate_range_end))))
+ return -EINVAL;
+
if (!mm->notifier_subscriptions) {
/*
* kmalloc cannot be called under mm_take_all_locks(), but we
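
Editorial note: the new WARN_ON_ONCE() in __mmu_notifier_register() rejects subscriptions that provide both arch_invalidate_secondary_tlbs() and the invalidate_range_start()/end() pair. The userspace model below expresses the same either-or rule; the ops struct and the void(void) function-pointer types are simplified placeholders, not the kernel's mmu_notifier_ops.

#include <stdio.h>
#include <stdbool.h>

struct ops {
	void (*arch_invalidate_secondary_tlbs)(void);
	void (*invalidate_range_start)(void);
	void (*invalidate_range_end)(void);
};

/* Valid unless the secondary-TLB callback is combined with range callbacks. */
static bool ops_valid(const struct ops *ops)
{
	return !(ops->arch_invalidate_secondary_tlbs &&
		 (ops->invalidate_range_start || ops->invalidate_range_end));
}

static void dummy(void) { }

int main(void)
{
	struct ops only_tlbs = { .arch_invalidate_secondary_tlbs = dummy };
	struct ops both = { .arch_invalidate_secondary_tlbs = dummy,
			    .invalidate_range_start = dummy };

	printf("only_tlbs valid: %d, both valid: %d\n",
	       ops_valid(&only_tlbs), ops_valid(&both));
	return 0;
}
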
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 3aef1340533a..130db91d3a8c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -213,7 +213,7 @@ static long change_pte_range(struct mmu_gather *tlb,
} else if (is_writable_device_private_entry(entry)) {
/*
* We do not preserve soft-dirtiness. See
- * copy_one_pte() for explanation.
+ * copy_nonpresent_pte() for explanation.
*/
entry = make_readable_device_private_entry(
swp_offset(entry));
@@ -230,10 +230,10 @@ static long change_pte_range(struct mmu_gather *tlb,
newpte = pte_swp_mkuffd_wp(newpte);
} else if (is_pte_marker_entry(entry)) {
/*
- * Ignore swapin errors unconditionally,
+ * Ignore error swap entries unconditionally,
* because any access should sigbus anyway.
*/
- if (is_swapin_error_entry(entry))
+ if (is_poisoned_swp_entry(entry))
continue;
/*
* If this is uffd-wp pte marker and we'd like
@@ -657,6 +657,7 @@ success:
* vm_flags and vm_page_prot are protected by the mmap_lock
* held in write mode.
*/
+ vma_start_write(vma);
vm_flags_reset(vma, newflags);
if (vma_wants_manual_pte_write_upgrade(vma))
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
diff --git a/mm/mremap.c b/mm/mremap.c
index 11e06e4ab33b..056478c106ee 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -349,7 +349,7 @@ static inline bool move_normal_pud(struct vm_area_struct *vma,
}
#endif
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
diff --git a/mm/nommu.c b/mm/nommu.c
index c072a660ec2c..8dba41cfc44d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -583,7 +583,8 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
{
VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);
- if (vma_iter_prealloc(&vmi)) {
+ vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma)) {
pr_warn("Allocation of vma tree for process %d failed\n",
current->pid);
return -ENOMEM;
@@ -591,7 +592,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
cleanup_vma_from_mm(vma);
/* remove from the MM's tree and list */
- vma_iter_clear(&vmi, vma->vm_start, vma->vm_end);
+ vma_iter_clear(&vmi);
return 0;
}
/*
@@ -1003,7 +1004,7 @@ error_free:
enomem:
pr_err("Allocation of length %lu from process %d (%s) failed\n",
len, current->pid, current->comm);
- show_free_areas(0, NULL);
+ show_mem();
return -ENOMEM;
}
@@ -1054,9 +1055,6 @@ unsigned long do_mmap(struct file *file,
if (!vma)
goto error_getting_vma;
- if (vma_iter_prealloc(&vmi))
- goto error_vma_iter_prealloc;
-
region->vm_usage = 1;
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
@@ -1198,6 +1196,10 @@ unsigned long do_mmap(struct file *file,
share:
BUG_ON(!vma->vm_region);
+ vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma))
+ goto error_just_free;
+
setup_vma_to_mm(vma, current->mm);
current->mm->map_count++;
/* add the VMA to the tree */
@@ -1236,22 +1238,14 @@ error_getting_vma:
kmem_cache_free(vm_region_jar, region);
pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
len, current->pid);
- show_free_areas(0, NULL);
+ show_mem();
return -ENOMEM;
error_getting_region:
pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
len, current->pid);
- show_free_areas(0, NULL);
- return -ENOMEM;
-
-error_vma_iter_prealloc:
- kmem_cache_free(vm_region_jar, region);
- vm_area_free(vma);
- pr_warn("Allocation of vma tree for process %d failed\n", current->pid);
- show_free_areas(0, NULL);
+ show_mem();
return -ENOMEM;
-
}
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
@@ -1336,12 +1330,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (!new)
goto err_vma_dup;
- if (vma_iter_prealloc(vmi)) {
- pr_warn("Allocation of vma tree for process %d failed\n",
- current->pid);
- goto err_vmi_preallocate;
- }
-
/* most fields are the same, copy all, and then fixup */
*region = *vma->vm_region;
new->vm_region = region;
@@ -1355,6 +1343,13 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
region->vm_pgoff = new->vm_pgoff += npages;
}
+ vma_iter_config(vmi, new->vm_start, new->vm_end);
+ if (vma_iter_prealloc(vmi, vma)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ goto err_vmi_preallocate;
+ }
+
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
@@ -1396,17 +1391,13 @@ static int vmi_shrink_vma(struct vma_iterator *vmi,
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
- if (vma_iter_prealloc(vmi)) {
- pr_warn("Allocation of vma tree for process %d failed\n",
- current->pid);
- return -ENOMEM;
- }
-
if (from > vma->vm_start) {
- vma_iter_clear(vmi, from, vma->vm_end);
+ if (vma_iter_clear_gfp(vmi, from, vma->vm_end, GFP_KERNEL))
+ return -ENOMEM;
vma->vm_end = from;
} else {
- vma_iter_clear(vmi, vma->vm_start, to);
+ if (vma_iter_clear_gfp(vmi, vma->vm_start, to, GFP_KERNEL))
+ return -ENOMEM;
vma->vm_start = to;
}
@@ -1809,7 +1800,7 @@ static int __meminit init_user_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
@@ -1830,7 +1821,7 @@ static int __meminit init_admin_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 612b5597d3af..44bde56ecd02 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -479,8 +479,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
static bool oom_killer_disabled __read_mostly;
-#define K(x) ((x) << (PAGE_SHIFT-10))
-
/*
* task->mm can be NULL if the task is the exited group leader. So to
* determine whether the task is using a particular mm, we examine all the
@@ -994,7 +992,6 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
mmdrop(mm);
put_task_struct(victim);
}
-#undef K
/*
* Kill provided task unless it's secured by setting
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7d3460c7a480..986b56db96b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -371,10 +371,16 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
-static __always_inline
-unsigned long __get_pfnblock_flags_mask(const struct page *page,
- unsigned long pfn,
- unsigned long mask)
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+unsigned long get_pfnblock_flags_mask(const struct page *page,
+ unsigned long pfn, unsigned long mask)
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
@@ -393,24 +399,10 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page,
return (word >> bitidx) & mask;
}
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
-unsigned long get_pfnblock_flags_mask(const struct page *page,
- unsigned long pfn, unsigned long mask)
-{
- return __get_pfnblock_flags_mask(page, pfn, mask);
-}
-
static __always_inline int get_pfnblock_migratetype(const struct page *page,
unsigned long pfn)
{
- return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+ return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
/**
@@ -459,7 +451,7 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
- int ret = 0;
+ int ret;
unsigned seq;
unsigned long pfn = page_to_pfn(page);
unsigned long sp, start_pfn;
@@ -468,8 +460,7 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
seq = zone_span_seqbegin(zone);
start_pfn = zone->zone_start_pfn;
sp = zone->spanned_pages;
- if (!zone_spans_pfn(zone, pfn))
- ret = 1;
+ ret = !zone_spans_pfn(zone, pfn);
} while (zone_span_seqretry(zone, seq));
if (ret)
@@ -539,8 +530,6 @@ out:
static inline unsigned int order_to_pindex(int migratetype, int order)
{
- int base = order;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != pageblock_order);
@@ -550,7 +539,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif
- return (MIGRATE_PCPTYPES * base) + migratetype;
+ return (MIGRATE_PCPTYPES * order) + migratetype;
}
static inline int pindex_to_order(unsigned int pindex)
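
Editorial note: with the unused `base` variable gone, order_to_pindex() above is a plain linear mapping between (order, migratetype) and a per-cpu list index. A standalone sketch of that mapping and its inverse follows, assuming MIGRATE_PCPTYPES == 3 and omitting the THP special case for pageblock-order pages.

#include <stdio.h>

#define MIGRATE_PCPTYPES 3	/* assumed: unmovable, movable, reclaimable */

static unsigned int order_to_pindex(int migratetype, int order)
{
	return MIGRATE_PCPTYPES * order + migratetype;
}

static int pindex_to_order(unsigned int pindex)
{
	return pindex / MIGRATE_PCPTYPES;
}

int main(void)
{
	for (int order = 0; order <= 3; order++)
		for (int mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
			unsigned int idx = order_to_pindex(mt, order);

			printf("order %d mt %d -> pindex %u -> order %d\n",
			       order, mt, idx, pindex_to_order(idx));
		}
	return 0;
}
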
@@ -824,7 +813,7 @@ static inline void __free_one_page(struct page *page,
* pageblock isolation could cause incorrect freepage or CMA
* accounting or HIGHATOMIC accounting.
*/
- int buddy_mt = get_pageblock_migratetype(buddy);
+ int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
if (migratetype != buddy_mt
&& (!migratetype_is_mergeable(migratetype) ||
@@ -900,7 +889,7 @@ int split_free_page(struct page *free_page,
goto out;
}
- mt = get_pageblock_migratetype(free_page);
+ mt = get_pfnblock_migratetype(free_page, free_page_pfn);
if (likely(!is_migrate_isolate(mt)))
__mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -1210,8 +1199,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
int pindex)
{
unsigned long flags;
- int min_pindex = 0;
- int max_pindex = NR_PCP_LISTS - 1;
unsigned int order;
bool isolated_pageblocks;
struct page *page;
@@ -1234,17 +1221,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* Remove pages from lists in a round-robin fashion. */
do {
- if (++pindex > max_pindex)
- pindex = min_pindex;
+ if (++pindex > NR_PCP_LISTS - 1)
+ pindex = 0;
list = &pcp->lists[pindex];
- if (!list_empty(list))
- break;
-
- if (pindex == max_pindex)
- max_pindex--;
- if (pindex == min_pindex)
- min_pindex++;
- } while (1);
+ } while (list_empty(list));
order = pindex_to_order(pindex);
nr_pages = 1 << order;
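The rewritten selection loop above drops the min_pindex/max_pindex bookkeeping: it simply advances pindex, wraps at NR_PCP_LISTS, and keeps scanning until it finds a non-empty list (the caller guarantees the pcp still holds pages, so the loop terminates). A minimal userspace model of that round-robin scan, with the list contents stubbed out, might look like this:

#include <stdbool.h>
#include <stdio.h>

#define NR_PCP_LISTS 4                          /* stand-in value for the sketch */

/* Advance from 'pindex' to the next non-empty list, wrapping like the kernel loop. */
static int next_nonempty(const int *counts, int pindex)
{
        do {
                if (++pindex > NR_PCP_LISTS - 1)
                        pindex = 0;
        } while (counts[pindex] == 0);          /* list_empty() stand-in */
        return pindex;
}

int main(void)
{
        int counts[NR_PCP_LISTS] = { 0, 0, 3, 1 };      /* only lists 2 and 3 hold pages */

        printf("%d %d %d\n",
               next_nonempty(counts, 1),        /* 2 */
               next_nonempty(counts, 2),        /* 3 */
               next_nonempty(counts, 3));       /* wraps around to 2 */
        return 0;
}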
@@ -1834,6 +1814,10 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
free_pages = move_freepages_block(zone, page, start_type,
&movable_pages);
+ /* moving whole block can fail due to zone boundary conditions */
+ if (!free_pages)
+ goto single_page;
+
/*
* Determine how many pages are compatible with our allocation.
* For movable allocation, it's the number of movable pages which
@@ -1855,14 +1839,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
else
alike_pages = 0;
}
-
- /* moving whole block can fail due to zone boundary conditions */
- if (!free_pages)
- goto single_page;
-
/*
* If a sufficient number of pages in the block are either free or of
- * comparable migratability as our allocation, claim the whole block.
+ * compatible migratability as our allocation, claim the whole block.
*/
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
@@ -1912,8 +1891,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
* Reserve a pageblock for exclusive use of high-order atomic allocations if
* there are no empty page blocks that contain a page with a suitable order
*/
-static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
- unsigned int alloc_order)
+static void reserve_highatomic_pageblock(struct page *page, struct zone *zone)
{
int mt;
unsigned long max_managed, flags;
@@ -2353,10 +2331,10 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
return true;
}
-static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
- bool free_high)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
{
int min_nr_free, max_nr_free;
+ int batch = READ_ONCE(pcp->batch);
/* Free everything if batch freeing high-order pages. */
if (unlikely(free_high))
@@ -2423,9 +2401,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
high = nr_pcp_high(pcp, zone, free_high);
if (pcp->count >= high) {
- int batch = READ_ONCE(pcp->batch);
-
- free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex);
}
}
@@ -3225,7 +3201,7 @@ try_this_zone:
* if the pageblock should be reserved for the future
*/
if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
- reserve_highatomic_pageblock(page, zone, order);
+ reserve_highatomic_pageblock(page, zone);
return page;
} else {
@@ -5139,19 +5115,17 @@ static void __build_all_zonelists(void *data)
unsigned long flags;
/*
- * Explicitly disable this CPU's interrupts before taking seqlock
- * to prevent any IRQ handler from calling into the page allocator
- * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
+ * The zonelist_update_seq must be acquired with irqsave because the
+ * reader can be invoked from IRQ with GFP_ATOMIC.
*/
- local_irq_save(flags);
+ write_seqlock_irqsave(&zonelist_update_seq, flags);
/*
- * Explicitly disable this CPU's synchronous printk() before taking
- * seqlock to prevent any printk() from trying to hold port->lock, for
+ * Also disable synchronous printk() to prevent any printk() from
+ * trying to hold port->lock, for
* tty_insert_flip_string_and_push_buffer() on other CPU might be
* calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
*/
printk_deferred_enter();
- write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -5188,9 +5162,8 @@ static void __build_all_zonelists(void *data)
#endif
}
- write_sequnlock(&zonelist_update_seq);
printk_deferred_exit();
- local_irq_restore(flags);
+ write_sequnlock_irqrestore(&zonelist_update_seq, flags);
}
static noinline void __init
@@ -5694,9 +5667,9 @@ static void __setup_per_zone_wmarks(void)
struct zone *zone;
unsigned long flags;
- /* Calculate total number of !ZONE_HIGHMEM pages */
+ /* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */
for_each_zone(zone) {
- if (!is_highmem(zone))
+ if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE)
lowmem_pages += zone_managed_pages(zone);
}
@@ -5706,15 +5679,15 @@ static void __setup_per_zone_wmarks(void)
spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone_managed_pages(zone);
do_div(tmp, lowmem_pages);
- if (is_highmem(zone)) {
+ if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
- * need highmem pages, so cap pages_min to a small
- * value here.
+ * need highmem and movable zones pages, so cap pages_min
+ * to a small value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
* deltas control async page reclaim, and so should
- * not be capped for highmem.
+ * not be capped for highmem and movable zones.
*/
unsigned long min_pages;
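The watermark hunk above treats ZONE_MOVABLE like highmem: it no longer contributes to lowmem_pages, and its own WMARK_MIN is capped to a small value instead of receiving a proportional share of pages_min. A rough userspace model of the proportional split (all sizes and the cap below are assumed values, not kernel defaults):

#include <stdio.h>

struct zone_model {
        const char *name;
        unsigned long managed;          /* pages */
        int highmem_or_movable;
};

int main(void)
{
        struct zone_model zones[] = {
                { "DMA32",    262144, 0 },
                { "Normal",  1835008, 0 },
                { "Movable",  524288, 1 },      /* no longer counted in lowmem_pages */
        };
        unsigned long pages_min = 16384;        /* min_free_kbytes expressed in pages */
        unsigned long lowmem_pages = 0;

        for (int i = 0; i < 3; i++)
                if (!zones[i].highmem_or_movable)
                        lowmem_pages += zones[i].managed;

        for (int i = 0; i < 3; i++) {
                unsigned long min;

                if (zones[i].highmem_or_movable)
                        min = 32;               /* capped small; the kernel clamps this */
                else
                        min = (unsigned long)((unsigned long long)pages_min *
                                              zones[i].managed / lowmem_pages);
                printf("%-8s WMARK_MIN ~ %lu pages\n", zones[i].name, min);
        }
        return 0;
}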
diff --git a/mm/page_ext.c b/mm/page_ext.c
index dc1626be458b..4548fcc66d74 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -90,7 +90,6 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
unsigned long page_ext_size;
static unsigned long total_usage;
-static struct page_ext *lookup_page_ext(const struct page *page);
bool early_page_ext __meminitdata;
static int __init setup_early_page_ext(char *str)
@@ -137,62 +136,16 @@ static void __init invoke_init_callbacks(void)
}
}
-#ifndef CONFIG_SPARSEMEM
-void __init page_ext_init_flatmem_late(void)
-{
- invoke_init_callbacks();
-}
-#endif
-
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
return base + page_ext_size * index;
}
-/**
- * page_ext_get() - Get the extended information for a page.
- * @page: The page we're interested in.
- *
- * Ensures that the page_ext will remain valid until page_ext_put()
- * is called.
- *
- * Return: NULL if no page_ext exists for this page.
- * Context: Any context. Caller may not sleep until they have called
- * page_ext_put().
- */
-struct page_ext *page_ext_get(struct page *page)
-{
- struct page_ext *page_ext;
-
- rcu_read_lock();
- page_ext = lookup_page_ext(page);
- if (!page_ext) {
- rcu_read_unlock();
- return NULL;
- }
-
- return page_ext;
-}
-
-/**
- * page_ext_put() - Working with page extended information is done.
- * @page_ext: Page extended information received from page_ext_get().
- *
- * The page extended information of the page may not be valid after this
- * function is called.
- *
- * Return: None.
- * Context: Any context with corresponding page_ext_get() is called.
- */
-void page_ext_put(struct page_ext *page_ext)
+#ifndef CONFIG_SPARSEMEM
+void __init page_ext_init_flatmem_late(void)
{
- if (unlikely(!page_ext))
- return;
-
- rcu_read_unlock();
+ invoke_init_callbacks();
}
-#ifndef CONFIG_SPARSEMEM
-
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
@@ -424,13 +377,14 @@ static int __meminit online_page_ext(unsigned long start_pfn,
return 0;
/* rollback */
+ end = pfn - PAGES_PER_SECTION;
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_ext(pfn);
return -ENOMEM;
}
-static int __meminit offline_page_ext(unsigned long start_pfn,
+static void __meminit offline_page_ext(unsigned long start_pfn,
unsigned long nr_pages)
{
unsigned long start, end, pfn;
@@ -454,8 +408,6 @@ static int __meminit offline_page_ext(unsigned long start_pfn,
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_ext(pfn);
- return 0;
-
}
static int __meminit page_ext_callback(struct notifier_block *self,
@@ -537,3 +489,46 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
}
#endif
+
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context. Caller may not sleep until they have called
+ * page_ext_put().
+ */
+struct page_ext *page_ext_get(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ rcu_read_lock();
+ page_ext = lookup_page_ext(page);
+ if (!page_ext) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ return page_ext;
+}
+
+/**
+ * page_ext_put() - Working with page extended information is done.
+ * @page_ext: Page extended information received from page_ext_get().
+ *
+ * The page extended information of the page may not be valid after this
+ * function is called.
+ *
+ * Return: None.
+ * Context: Any context with corresponding page_ext_get() is called.
+ */
+void page_ext_put(struct page_ext *page_ext)
+{
+ if (unlikely(!page_ext))
+ return;
+
+ rcu_read_unlock();
+}
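The kernel-doc moved here spells out the contract: page_ext_get() takes the RCU read lock and page_ext_put() releases it. In kernel context, the caller-side pattern it implies is roughly the following (a sketch; inspect_page_ext is a made-up name):

static void inspect_page_ext(struct page *page)
{
        struct page_ext *page_ext = page_ext_get(page);  /* takes rcu_read_lock() */

        if (!page_ext)
                return;                 /* no extended data for this page */

        /* ... read page_ext fields here; must not sleep before the put ... */

        page_ext_put(page_ext);         /* drops rcu_read_lock() */
}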
diff --git a/mm/page_io.c b/mm/page_io.c
index 684cd3c7b59b..fe4c21af23f2 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -19,20 +19,19 @@
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
-#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <linux/delayacct.h>
+#include <linux/zswap.h>
#include "swap.h"
static void __end_swap_bio_write(struct bio *bio)
{
- struct page *page = bio_first_page_all(bio);
+ struct folio *folio = bio_first_folio_all(bio);
if (bio->bi_status) {
- SetPageError(page);
/*
* We failed to write the page out to swap-space.
* Re-dirty the page in order to avoid it being reclaimed.
@@ -41,13 +40,13 @@ static void __end_swap_bio_write(struct bio *bio)
*
* Also clear PG_reclaim to avoid folio_rotate_reclaimable()
*/
- set_page_dirty(page);
+ folio_mark_dirty(folio);
pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
- ClearPageReclaim(page);
+ folio_clear_reclaim(folio);
}
- end_page_writeback(page);
+ folio_end_writeback(folio);
}
static void end_swap_bio_write(struct bio *bio)
@@ -58,18 +57,16 @@ static void end_swap_bio_write(struct bio *bio)
static void __end_swap_bio_read(struct bio *bio)
{
- struct page *page = bio_first_page_all(bio);
+ struct folio *folio = bio_first_folio_all(bio);
if (bio->bi_status) {
- SetPageError(page);
- ClearPageUptodate(page);
pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
} else {
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
- unlock_page(page);
+ folio_unlock(folio);
}
static void end_swap_bio_read(struct bio *bio)
@@ -198,7 +195,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
folio_unlock(folio);
return ret;
}
- if (frontswap_store(&folio->page) == 0) {
+ if (zswap_store(folio)) {
folio_start_writeback(folio);
folio_unlock(folio);
folio_end_writeback(folio);
@@ -208,22 +205,22 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}
-static inline void count_swpout_vm_event(struct page *page)
+static inline void count_swpout_vm_event(struct folio *folio)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (unlikely(PageTransHuge(page)))
+ if (unlikely(folio_test_pmd_mappable(folio)))
count_vm_event(THP_SWPOUT);
#endif
- count_vm_events(PSWPOUT, thp_nr_pages(page));
+ count_vm_events(PSWPOUT, folio_nr_pages(folio));
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
+static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
{
struct cgroup_subsys_state *css;
struct mem_cgroup *memcg;
- memcg = page_memcg(page);
+ memcg = folio_memcg(folio);
if (!memcg)
return;
@@ -233,7 +230,7 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
rcu_read_unlock();
}
#else
-#define bio_associate_blkg_from_page(bio, page) do { } while (0)
+#define bio_associate_blkg_from_page(bio, folio) do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
struct swap_iocb {
@@ -283,7 +280,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
}
} else {
for (p = 0; p < sio->pages; p++)
- count_swpout_vm_event(sio->bvec[p].bv_page);
+ count_swpout_vm_event(page_folio(sio->bvec[p].bv_page));
}
for (p = 0; p < sio->pages; p++)
@@ -334,17 +331,18 @@ static void swap_writepage_bdev_sync(struct page *page,
{
struct bio_vec bv;
struct bio bio;
+ struct folio *folio = page_folio(page);
bio_init(&bio, sis->bdev, &bv, 1,
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
bio.bi_iter.bi_sector = swap_page_sector(page);
__bio_add_page(&bio, page, thp_size(page), 0);
- bio_associate_blkg_from_page(&bio, page);
- count_swpout_vm_event(page);
+ bio_associate_blkg_from_page(&bio, folio);
+ count_swpout_vm_event(folio);
- set_page_writeback(page);
- unlock_page(page);
+ folio_start_writeback(folio);
+ folio_unlock(folio);
submit_bio_wait(&bio);
__end_swap_bio_write(&bio);
@@ -354,6 +352,7 @@ static void swap_writepage_bdev_async(struct page *page,
struct writeback_control *wbc, struct swap_info_struct *sis)
{
struct bio *bio;
+ struct folio *folio = page_folio(page);
bio = bio_alloc(sis->bdev, 1,
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
@@ -362,10 +361,10 @@ static void swap_writepage_bdev_async(struct page *page,
bio->bi_end_io = end_swap_bio_write;
__bio_add_page(bio, page, thp_size(page), 0);
- bio_associate_blkg_from_page(bio, page);
- count_swpout_vm_event(page);
- set_page_writeback(page);
- unlock_page(page);
+ bio_associate_blkg_from_page(bio, folio);
+ count_swpout_vm_event(folio);
+ folio_start_writeback(folio);
+ folio_unlock(folio);
submit_bio(bio);
}
@@ -406,19 +405,17 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
if (ret == sio->len) {
for (p = 0; p < sio->pages; p++) {
- struct page *page = sio->bvec[p].bv_page;
+ struct folio *folio = page_folio(sio->bvec[p].bv_page);
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
}
count_vm_events(PSWPIN, sio->pages);
} else {
for (p = 0; p < sio->pages; p++) {
- struct page *page = sio->bvec[p].bv_page;
+ struct folio *folio = page_folio(sio->bvec[p].bv_page);
- SetPageError(page);
- ClearPageUptodate(page);
- unlock_page(page);
+ folio_unlock(folio);
}
pr_alert_ratelimited("Read-error on swap-device\n");
}
@@ -495,14 +492,15 @@ static void swap_readpage_bdev_async(struct page *page,
void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
{
+ struct folio *folio = page_folio(page);
struct swap_info_struct *sis = page_swap_info(page);
- bool workingset = PageWorkingset(page);
+ bool workingset = folio_test_workingset(folio);
unsigned long pflags;
bool in_thrashing;
- VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageUptodate(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
/*
* Count submission time as memory stall and delay. When the device
@@ -515,9 +513,9 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
}
delayacct_swapin_start();
- if (frontswap_load(page) == 0) {
- SetPageUptodate(page);
- unlock_page(page);
+ if (zswap_load(folio)) {
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
} else if (data_race(sis->flags & SWP_FS_OPS)) {
swap_readpage_fs(page, plug);
} else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
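Note that the success convention flips with this conversion: frontswap_store()/frontswap_load() returned 0 on success, while zswap_store()/zswap_load() return true when the folio was handled in memory. The resulting shape of the two entry points is roughly the following (a condensed sketch of the hunks above, kernel context assumed; error handling and the bdev/SWP_FS_OPS fallbacks are elided):

static void swap_writepage_shape(struct folio *folio)
{
        if (zswap_store(folio)) {
                /* compressed copy kept in memory: complete writeback, no bio */
                folio_start_writeback(folio);
                folio_unlock(folio);
                folio_end_writeback(folio);
                return;
        }
        /* otherwise fall through to the existing bdev/SWP_FS_OPS write paths */
}

static void swap_readpage_shape(struct folio *folio)
{
        if (zswap_load(folio)) {
                /* decompressed in place: mark up to date, no bio needed */
                folio_mark_uptodate(folio);
                folio_unlock(folio);
                return;
        }
        /* otherwise fall through to the existing bdev/SWP_FS_OPS read paths */
}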
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 6599cc965e21..bcf99ba747a0 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -79,17 +79,17 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* handle each tail page individually in migration.
*/
if (PageHuge(page) || PageTransCompound(page)) {
- struct page *head = compound_head(page);
+ struct folio *folio = page_folio(page);
unsigned int skip_pages;
if (PageHuge(page)) {
- if (!hugepage_migration_supported(page_hstate(head)))
+ if (!hugepage_migration_supported(folio_hstate(folio)))
return page;
- } else if (!PageLRU(head) && !__PageMovable(head)) {
+ } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) {
return page;
}
- skip_pages = compound_nr(head) - (page - head);
+ skip_pages = folio_nr_pages(folio) - folio_page_idx(folio, page);
pfn += skip_pages - 1;
continue;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index c93baef0148f..4e2723e1b300 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -104,7 +104,7 @@ struct page_ext_operations page_owner_ops = {
static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
{
- return (void *)page_ext + page_owner_ops.offset;
+ return page_ext_data(page_ext, &page_owner_ops);
}
static noinline depot_stack_handle_t save_stack(gfp_t flags)
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 98438985e1ed..b4f456437b7e 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -4,7 +4,6 @@
#include <linux/mm.h>
#include <linux/mmdebug.h>
#include <linux/highmem.h>
-#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
#include <linux/kasan.h>
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 93ec7690a0d8..46e77c12c81e 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -51,15 +51,14 @@ struct page_ext_operations page_table_check_ops = {
static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
{
BUG_ON(!page_ext);
- return (void *)(page_ext) + page_table_check_ops.offset;
+ return page_ext_data(page_ext, &page_table_check_ops);
}
/*
* An entry is removed from the page table, decrement the counters for that page
* verify that it is of correct type and counters do not become negative.
*/
-static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
- unsigned long pfn, unsigned long pgcnt)
+static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
{
struct page_ext *page_ext;
struct page *page;
@@ -95,8 +94,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
* verify that it is of correct type and is not being mapped with a different
* type to a different process.
*/
-static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
- unsigned long pfn, unsigned long pgcnt,
+static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
bool rw)
{
struct page_ext *page_ext;
@@ -151,85 +149,73 @@ void __page_table_check_zero(struct page *page, unsigned int order)
page_ext_put(page_ext);
}
-void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t pte)
+void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
if (&init_mm == mm)
return;
if (pte_user_accessible_page(pte)) {
- page_table_check_clear(mm, addr, pte_pfn(pte),
- PAGE_SIZE >> PAGE_SHIFT);
+ page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT);
}
}
EXPORT_SYMBOL(__page_table_check_pte_clear);
-void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
- pmd_t pmd)
+void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
if (&init_mm == mm)
return;
if (pmd_user_accessible_page(pmd)) {
- page_table_check_clear(mm, addr, pmd_pfn(pmd),
- PMD_SIZE >> PAGE_SHIFT);
+ page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT);
}
}
EXPORT_SYMBOL(__page_table_check_pmd_clear);
-void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
- pud_t pud)
+void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
if (&init_mm == mm)
return;
if (pud_user_accessible_page(pud)) {
- page_table_check_clear(mm, addr, pud_pfn(pud),
- PUD_SIZE >> PAGE_SHIFT);
+ page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT);
}
}
EXPORT_SYMBOL(__page_table_check_pud_clear);
-void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte)
{
if (&init_mm == mm)
return;
- __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
+ __page_table_check_pte_clear(mm, ptep_get(ptep));
if (pte_user_accessible_page(pte)) {
- page_table_check_set(mm, addr, pte_pfn(pte),
- PAGE_SIZE >> PAGE_SHIFT,
+ page_table_check_set(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT,
pte_write(pte));
}
}
EXPORT_SYMBOL(__page_table_check_pte_set);
-void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd)
+void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
{
if (&init_mm == mm)
return;
- __page_table_check_pmd_clear(mm, addr, *pmdp);
+ __page_table_check_pmd_clear(mm, *pmdp);
if (pmd_user_accessible_page(pmd)) {
- page_table_check_set(mm, addr, pmd_pfn(pmd),
- PMD_SIZE >> PAGE_SHIFT,
+ page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
pmd_write(pmd));
}
}
EXPORT_SYMBOL(__page_table_check_pmd_set);
-void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
- pud_t *pudp, pud_t pud)
+void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
{
if (&init_mm == mm)
return;
- __page_table_check_pud_clear(mm, addr, *pudp);
+ __page_table_check_pud_clear(mm, *pudp);
if (pud_user_accessible_page(pud)) {
- page_table_check_set(mm, addr, pud_pfn(pud),
- PUD_SIZE >> PAGE_SHIFT,
+ page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
pud_write(pud));
}
}
@@ -249,7 +235,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
if (WARN_ON(!ptep))
return;
for (i = 0; i < PTRS_PER_PTE; i++) {
- __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
+ __page_table_check_pte_clear(mm, ptep_get(ptep));
addr += PAGE_SIZE;
ptep++;
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 49e0d28f0379..e0b368e545ed 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -73,20 +73,22 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
}
/**
- * check_pte - check if @pvmw->page is mapped at the @pvmw->pte
- * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking
+ * check_pte - check if [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) is
+ * mapped at the @pvmw->pte
+ * @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range
+ * for checking
*
- * page_vma_mapped_walk() found a place where @pvmw->page is *potentially*
+ * page_vma_mapped_walk() found a place where pfn range is *potentially*
* mapped. check_pte() has to validate this.
*
* pvmw->pte may point to empty PTE, swap PTE or PTE pointing to
* arbitrary page.
*
* If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration
- * entry that points to @pvmw->page or any subpage in case of THP.
+ * entry that points to [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages)
*
* If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to
- * pvmw->page or any subpage in case of THP.
+ * [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages)
*
* Otherwise, return false.
*
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4d454953046f..4fcd959dcc4d 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -13,6 +13,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
+#include <asm/pgalloc.h>
#include <asm/tlb.h>
/*
@@ -230,14 +231,62 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
return pmd;
}
#endif
+
+/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
+#ifndef pte_free_defer
+static void pte_free_now(struct rcu_head *head)
+{
+ struct page *page;
+
+ page = container_of(head, struct page, rcu_head);
+ pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
+}
+
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+ struct page *page;
+
+ page = pgtable;
+ call_rcu(&page->rcu_head, pte_free_now);
+}
+#endif /* pte_free_defer */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
+ (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
+/*
+ * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
+ * the barriers in pmdp_get_lockless() cannot guarantee that the value in
+ * pmd_high actually belongs with the value in pmd_low; but holding interrupts
+ * off blocks the TLB flush between present updates, which guarantees that a
+ * successful __pte_offset_map() points to a page from matched halves.
+ */
+static unsigned long pmdp_get_lockless_start(void)
+{
+ unsigned long irqflags;
+
+ local_irq_save(irqflags);
+ return irqflags;
+}
+static void pmdp_get_lockless_end(unsigned long irqflags)
+{
+ local_irq_restore(irqflags);
+}
+#else
+static unsigned long pmdp_get_lockless_start(void) { return 0; }
+static void pmdp_get_lockless_end(unsigned long irqflags) { }
+#endif
+
pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
+ unsigned long irqflags;
pmd_t pmdval;
- /* rcu_read_lock() to be added later */
+ rcu_read_lock();
+ irqflags = pmdp_get_lockless_start();
pmdval = pmdp_get_lockless(pmd);
+ pmdp_get_lockless_end(irqflags);
+
if (pmdvalp)
*pmdvalp = pmdval;
if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
@@ -250,7 +299,7 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
}
return __pte_map(&pmdval, addr);
nomap:
- /* rcu_read_unlock() to be added later */
+ rcu_read_unlock();
return NULL;
}
@@ -266,6 +315,50 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
return pte;
}
+/*
+ * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
+ * __pte_offset_map_lock() below, is usually called with the pmd pointer for
+ * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while
+ * holding mmap_lock or vma lock for read or for write; or in truncate or rmap
+ * context, while holding file's i_mmap_lock or anon_vma lock for read (or for
+ * write). In a few cases, it may be used with pmd pointing to a pmd_t already
+ * copied to or constructed on the stack.
+ *
+ * When successful, it returns the pte pointer for addr, with its page table
+ * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
+ * modification by software, with a pointer to that spinlock in ptlp (in some
+ * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's
+ * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards.
+ *
+ * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
+ * page table at *pmd: if, for example, the page table has just been removed,
+ * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked
+ * after acquiring the ptlock, and retried internally if it changed: so that a
+ * page table can be safely removed or replaced by THP while holding its lock.)
+ *
+ * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
+ * just returns the pte pointer for addr, its page table kmapped if necessary;
+ * or NULL if there is no page table at *pmd. It does not attempt to lock the
+ * page table, so cannot normally be used when the page table is to be updated,
+ * or when entries read must be stable. But it does take rcu_read_lock(): so
+ * that even when page table is racily removed, it remains a valid though empty
+ * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s
+ * afterwards.
+ *
+ * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
+ * but when successful, it also outputs a pointer to the spinlock in ptlp - as
+ * pte_offset_map_lock() does, but in this case without locking it. This helps
+ * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
+ * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
+ * pointer for the page table that it returns. In principle, the caller should
+ * recheck *pmd once the lock is taken; in practice, no callsite needs that -
+ * either the mmap_lock for write, or pte_same() check on contents, is enough.
+ *
+ * Note that free_pgtables(), used after unmapping detached vmas, or when
+ * exiting the whole mm, does not take page table lock before freeing a page
+ * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
+ * pte_offset_map() and co once the vma is detached from mm or mm_users is zero.
+ */
pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, spinlock_t **ptlp)
{
diff --git a/mm/rmap.c b/mm/rmap.c
index 0c0d8857dfce..51ec8aa5e61f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -642,7 +642,8 @@ void try_to_unmap_flush_dirty(void)
#define TLB_FLUSH_BATCH_PENDING_LARGE \
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
+ unsigned long uaddr)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int batch;
@@ -651,7 +652,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
if (!pte_accessible(mm, pteval))
return;
- arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
+ arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
tlb_ubc->flush_required = true;
/*
@@ -688,17 +689,10 @@ retry:
*/
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
- bool should_defer = false;
-
if (!(flags & TTU_BATCH_FLUSH))
return false;
- /* If remote CPUs need to be flushed then defer batch the flush */
- if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
- should_defer = true;
- put_cpu();
-
- return should_defer;
+ return arch_tlbbatch_should_defer(mm);
}
/*
@@ -723,7 +717,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
if (pending != flushed) {
- flush_tlb_mm(mm);
+ arch_flush_tlb_batched_pending(mm);
/*
* If the new TLB flushing is pending during flushing, leave
* mm->tlb_flush_batched as is, to avoid losing flushing.
@@ -733,7 +727,8 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
}
}
#else
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
+ unsigned long uaddr)
{
}
@@ -990,13 +985,6 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
#endif
}
- /*
- * No need to call mmu_notifier_invalidate_range() as we are
- * downgrading page table protection not changing it to point
- * to a new page.
- *
- * See Documentation/mm/mmu_notifier.rst
- */
if (ret)
cleaned++;
}
@@ -1175,14 +1163,14 @@ out:
/**
* __page_check_anon_rmap - sanity check anonymous rmap addition
- * @page: the page to add the mapping to
+ * @folio: The folio containing @page.
+ * @page: the page to check the mapping of
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*/
-static void __page_check_anon_rmap(struct page *page,
+static void __page_check_anon_rmap(struct folio *folio, struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- struct folio *folio = page_folio(page);
/*
* The page's anon-rmap details (mapping and index) are guaranteed to
* be set up correctly at this point.
@@ -1262,7 +1250,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
__page_set_anon_rmap(folio, page, vma, address,
!!(flags & RMAP_EXCLUSIVE));
else
- __page_check_anon_rmap(page, vma, address);
+ __page_check_anon_rmap(folio, page, vma, address);
}
mlock_vma_folio(folio, vma, compound);
@@ -1554,8 +1542,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
flush_tlb_range(vma,
range.start, range.end);
- mmu_notifier_invalidate_range(mm,
- range.start, range.end);
/*
* The ref count of the PMD page was
* dropped which is part of the way map
@@ -1586,7 +1572,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pteval);
+ set_tlb_ubc_flush_pending(mm, pteval, address);
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
@@ -1628,9 +1614,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* copied pages.
*/
dec_mm_counter(mm, mm_counter(&folio->page));
- /* We have to invalidate as we cleared the pte */
- mmu_notifier_invalidate_range(mm, address,
- address + PAGE_SIZE);
} else if (folio_test_anon(folio)) {
swp_entry_t entry = { .val = page_private(subpage) };
pte_t swp_pte;
@@ -1642,9 +1625,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
folio_test_swapcache(folio))) {
WARN_ON_ONCE(1);
ret = false;
- /* We have to invalidate as we cleared the pte */
- mmu_notifier_invalidate_range(mm, address,
- address + PAGE_SIZE);
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1675,9 +1655,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
if (ref_count == 1 + map_count &&
!folio_test_dirty(folio)) {
- /* Invalidate as we cleared the pte */
- mmu_notifier_invalidate_range(mm,
- address, address + PAGE_SIZE);
dec_mm_counter(mm, MM_ANONPAGES);
goto discard;
}
@@ -1732,9 +1709,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
- /* Invalidate as we cleared the pte */
- mmu_notifier_invalidate_range(mm, address,
- address + PAGE_SIZE);
} else {
/*
* This is a locked file-backed folio,
@@ -1750,13 +1724,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter_file(&folio->page));
}
discard:
- /*
- * No need to call mmu_notifier_invalidate_range() it has be
- * done above for all cases requiring it to happen under page
- * table lock before mmu_notifier_invalidate_range_end()
- *
- * See Documentation/mm/mmu_notifier.rst
- */
page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
@@ -1935,8 +1902,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
flush_tlb_range(vma,
range.start, range.end);
- mmu_notifier_invalidate_range(mm,
- range.start, range.end);
/*
* The ref count of the PMD page was
@@ -1969,7 +1934,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pteval);
+ set_tlb_ubc_flush_pending(mm, pteval, address);
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
@@ -2041,9 +2006,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* copied pages.
*/
dec_mm_counter(mm, mm_counter(&folio->page));
- /* We have to invalidate as we cleared the pte */
- mmu_notifier_invalidate_range(mm, address,
- address + PAGE_SIZE);
} else {
swp_entry_t entry;
pte_t swp_pte;
@@ -2107,13 +2069,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
}
- /*
- * No need to call mmu_notifier_invalidate_range() it has be
- * done above for all cases requiring it to happen under page
- * table lock before mmu_notifier_invalidate_range_end()
- *
- * See Documentation/mm/mmu_notifier.rst
- */
page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
@@ -2402,11 +2357,12 @@ out:
/*
* rmap_walk_anon - do something to anonymous page using the object-based
* rmap method
- * @page: the page to be handled
+ * @folio: the folio to be handled
* @rwc: control variable according to each walk type
+ * @locked: caller holds relevant rmap lock
*
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the anon_vma struct it points to.
+ * Find all the mappings of a folio using the mapping pointer and the vma
+ * chains contained in the anon_vma struct it points to.
*/
static void rmap_walk_anon(struct folio *folio,
struct rmap_walk_control *rwc, bool locked)
@@ -2450,10 +2406,11 @@ static void rmap_walk_anon(struct folio *folio,
/*
* rmap_walk_file - do something to file page using the object-based rmap method
- * @page: the page to be handled
+ * @folio: the folio to be handled
* @rwc: control variable according to each walk type
+ * @locked: caller holds relevant rmap lock
*
- * Find all the mappings of a page using the mapping pointer and the vma chains
+ * Find all the mappings of a folio using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
*/
static void rmap_walk_file(struct folio *folio,
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 86442a15d12f..3afb5ad701e1 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -55,6 +55,7 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
gfp_t gfp = vmf->gfp_mask;
unsigned long addr;
struct page *page;
+ struct folio *folio;
vm_fault_t ret;
int err;
@@ -66,23 +67,24 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
retry:
page = find_lock_page(mapping, offset);
if (!page) {
- page = alloc_page(gfp | __GFP_ZERO);
- if (!page) {
+ folio = folio_alloc(gfp | __GFP_ZERO, 0);
+ if (!folio) {
ret = VM_FAULT_OOM;
goto out;
}
+ page = &folio->page;
err = set_direct_map_invalid_noflush(page);
if (err) {
- put_page(page);
+ folio_put(folio);
ret = vmf_error(err);
goto out;
}
- __SetPageUptodate(page);
- err = add_to_page_cache_lru(page, mapping, offset, gfp);
+ __folio_mark_uptodate(folio);
+ err = filemap_add_folio(mapping, folio, offset, gfp);
if (unlikely(err)) {
- put_page(page);
+ folio_put(folio);
/*
* If a split of large page was required, it
* already happened when we marked the page invalid
diff --git a/mm/shmem.c b/mm/shmem.c
index f5af4b943e42..20daa207d8bf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -970,7 +970,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
same_folio = lend < folio_pos(folio) + folio_size(folio);
folio_mark_dirty(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
- start = folio->index + folio_nr_pages(folio);
+ start = folio_next_index(folio);
if (same_folio)
end = folio->index;
}
@@ -1707,7 +1707,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
swp_entry_t swapin_error;
void *old;
- swapin_error = make_swapin_error_entry();
+ swapin_error = make_poisoned_swp_entry();
old = xa_cmpxchg_irq(&mapping->i_pages, index,
swp_to_radix_entry(swap),
swp_to_radix_entry(swapin_error), 0);
@@ -1752,7 +1752,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
swap = radix_to_swp_entry(*foliop);
*foliop = NULL;
- if (is_swapin_error_entry(swap))
+ if (is_poisoned_swp_entry(swap))
return -EIO;
si = get_swap_device(swap);
@@ -3864,8 +3864,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
struct mempolicy *mpol;
if (sbinfo->max_blocks != shmem_default_max_blocks())
- seq_printf(seq, ",size=%luk",
- sbinfo->max_blocks << (PAGE_SHIFT - 10));
+ seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));
if (sbinfo->max_inodes != shmem_default_max_inodes())
seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
if (sbinfo->mode != (0777 | S_ISVTX))
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 01f8e9905817..4b888b18bdde 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -186,7 +186,7 @@ static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
* SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
* cpuset.
*/
-void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
+static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
{
unsigned long free_pcp = 0;
int cpu, nid;
@@ -251,9 +251,9 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i
" writeback:%lukB"
" shmem:%lukB"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- " shmem_thp: %lukB"
- " shmem_pmdmapped: %lukB"
- " anon_thp: %lukB"
+ " shmem_thp:%lukB"
+ " shmem_pmdmapped:%lukB"
+ " anon_thp:%lukB"
#endif
" writeback_tmp:%lukB"
" kernel_stack:%lukB"
@@ -406,7 +406,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
struct zone *zone;
printk("Mem-Info:\n");
- __show_free_areas(filter, nodemask, max_zone_idx);
+ show_free_areas(filter, nodemask, max_zone_idx);
for_each_populated_zone(zone) {
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a044a130405b..a2cbe44c48e1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -358,6 +358,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
return 0;
}
+#ifndef vmemmap_populate_compound_pages
/*
* For compound pages bigger than section size (e.g. x86 1G compound
* pages with 2M subsection size) fill the rest of sections as tail
@@ -446,6 +447,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
return 0;
}
+#endif
+
struct page * __meminit __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
diff --git a/mm/sparse.c b/mm/sparse.c
index 297a8b772e8d..77d91e565045 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -172,8 +172,7 @@ static void __section_mark_present(struct mem_section *ms,
#define for_each_present_section_nr(start, section_nr) \
for (section_nr = next_present_section_nr(start-1); \
- ((section_nr != -1) && \
- (section_nr <= __highest_present_section_nr)); \
+ section_nr != -1; \
section_nr = next_present_section_nr(section_nr))
static inline unsigned long first_present_section_nr(void)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f8ea7015bad4..d157862ba0a6 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,9 +63,8 @@ static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
void show_swap_cache_info(void)
{
printk("%lu pages in swap cache\n", total_swapcache_pages());
- printk("Free swap = %ldkB\n",
- get_nr_swap_pages() << (PAGE_SHIFT - 10));
- printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
+ printk("Free swap = %ldkB\n", K(get_nr_swap_pages()));
+ printk("Total swap = %lukB\n", K(total_swap_pages));
}
void *get_shadow_from_swap_cache(swp_entry_t entry)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b15112b1f1a8..d46933adf789 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -35,17 +35,18 @@
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
-#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
+#include <linux/zswap.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
+#include "internal.h"
#include "swap.h"
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
@@ -95,7 +96,7 @@ static PLIST_HEAD(swap_active_head);
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
-struct swap_info_struct *swap_info[MAX_SWAPFILES];
+static struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
@@ -714,10 +715,8 @@ static void add_to_avail_list(struct swap_info_struct *p)
int nid;
spin_lock(&swap_avail_lock);
- for_each_node(nid) {
- WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+ for_each_node(nid)
plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
- }
spin_unlock(&swap_avail_lock);
}
@@ -746,7 +745,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify = NULL;
while (offset <= end) {
arch_swap_invalidate_page(si->type, offset);
- frontswap_invalidate_page(si->type, offset);
+ zswap_invalidate(si->type, offset);
if (swap_slot_free_notify)
swap_slot_free_notify(si->bdev, offset);
offset++;
@@ -1773,13 +1772,20 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry = make_hwpoison_entry(swapcache);
page = swapcache;
} else {
- swp_entry = make_swapin_error_entry();
+ swp_entry = make_poisoned_swp_entry();
}
new_pte = swp_entry_to_pte(swp_entry);
ret = 0;
goto setpte;
}
+ /*
+ * Some architectures may have to restore extra metadata to the page
+ * when reading from swap. This metadata may be indexed by swap entry
+ * so this must be called before swap_free().
+ */
+ arch_swap_restore(entry, page_folio(page));
+
/* See do_swap_page() */
BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
BUG_ON(PageAnon(page) && PageAnonExclusive(page));
@@ -2330,16 +2336,18 @@ static void _enable_swap_info(struct swap_info_struct *p)
* swap_info_struct.
*/
plist_add(&p->list, &swap_active_head);
- add_to_avail_list(p);
+
+ /* add to available list iff swap device is not full */
+ if (p->highest_bit)
+ add_to_avail_list(p);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
- struct swap_cluster_info *cluster_info,
- unsigned long *frontswap_map)
+ struct swap_cluster_info *cluster_info)
{
- if (IS_ENABLED(CONFIG_FRONTSWAP))
- frontswap_init(p->type, frontswap_map);
+ zswap_swapon(p->type);
+
spin_lock(&swap_lock);
spin_lock(&p->lock);
setup_swap_info(p, prio, swap_map, cluster_info);
@@ -2382,7 +2390,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct swap_info_struct *p = NULL;
unsigned char *swap_map;
struct swap_cluster_info *cluster_info;
- unsigned long *frontswap_map;
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
@@ -2507,12 +2514,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->swap_map = NULL;
cluster_info = p->cluster_info;
p->cluster_info = NULL;
- frontswap_map = frontswap_map_get(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
arch_swap_invalidate_area(p->type);
- frontswap_invalidate_area(p->type);
- frontswap_map_set(p, NULL);
+ zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
@@ -2520,7 +2525,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->cluster_next_cpu = NULL;
vfree(swap_map);
kvfree(cluster_info);
- kvfree(frontswap_map);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
exit_swap_address_space(p->type);
@@ -2632,8 +2636,8 @@ static int swap_show(struct seq_file *swap, void *v)
return 0;
}
- bytes = si->pages << (PAGE_SHIFT - 10);
- inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10);
+ bytes = K(si->pages);
+ inuse = K(READ_ONCE(si->inuse_pages));
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
@@ -2858,8 +2862,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
}
if (last_page > maxpages) {
pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
- maxpages << (PAGE_SHIFT - 10),
- last_page << (PAGE_SHIFT - 10));
+ K(maxpages), K(last_page));
}
if (maxpages > last_page) {
maxpages = last_page + 1;
@@ -2987,7 +2990,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
unsigned long maxpages;
unsigned char *swap_map = NULL;
struct swap_cluster_info *cluster_info = NULL;
- unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
bool inced_nr_rotate_swap = false;
@@ -3127,11 +3129,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = nr_extents;
goto bad_swap_unlock_inode;
}
- /* frontswap enabled? set up bit-per-page map for frontswap */
- if (IS_ENABLED(CONFIG_FRONTSWAP))
- frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
- sizeof(long),
- GFP_KERNEL);
if ((swap_flags & SWAP_FLAG_DISCARD) &&
p->bdev && bdev_max_discard_sectors(p->bdev)) {
@@ -3184,16 +3181,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
+ enable_swap_info(p, prio, swap_map, cluster_info);
- pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
- p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
- nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
+ pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n",
+ K(p->pages), name->name, p->prio, nr_extents,
+ K((unsigned long long)span),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
(p->flags & SWP_DISCARDABLE) ? "D" : "",
(p->flags & SWP_AREA_DISCARD) ? "s" : "",
- (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
- (frontswap_map) ? "FS" : "");
+ (p->flags & SWP_PAGE_DISCARD) ? "c" : "");
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
@@ -3223,7 +3219,6 @@ bad_swap:
spin_unlock(&swap_lock);
vfree(swap_map);
kvfree(cluster_info);
- kvfree(frontswap_map);
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
if (swap_file)
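The size reporting in this file now goes through the K() helper (pulled in via "internal.h") instead of open-coding << (PAGE_SHIFT - 10); the conversion is just pages to kibibytes. A tiny standalone illustration, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12                           /* assumed: 4 KiB pages */
#define K(x) ((x) << (PAGE_SHIFT - 10))         /* pages -> kB */

int main(void)
{
        unsigned long pages = 262144;           /* a 1 GiB swap area */

        printf("%lu pages = %luk\n", pages, K(pages));  /* 262144 pages = 1048576k */
        return 0;
}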
diff --git a/mm/truncate.c b/mm/truncate.c
index 95d1291d269b..bd4fafd67f95 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,7 +19,6 @@
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/buffer_head.h> /* grr. try_to_release_page */
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include "internal.h"
@@ -276,7 +275,7 @@ static long mapping_evict_folio(struct address_space *mapping,
if (folio_ref_count(folio) >
folio_nr_pages(folio) + folio_has_private(folio) + 1)
return 0;
- if (folio_has_private(folio) && !filemap_release_folio(folio, 0))
+ if (!filemap_release_folio(folio, 0))
return 0;
return remove_mapping(mapping, folio);
@@ -378,7 +377,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
if (!IS_ERR(folio)) {
same_folio = lend < folio_pos(folio) + folio_size(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
- start = folio->index + folio_nr_pages(folio);
+ start = folio_next_index(folio);
if (same_folio)
end = folio->index;
}
@@ -573,8 +572,7 @@ static int invalidate_complete_folio2(struct address_space *mapping,
if (folio->mapping != mapping)
return 0;
- if (folio_has_private(folio) &&
- !filemap_release_folio(folio, GFP_KERNEL))
+ if (!filemap_release_folio(folio, GFP_KERNEL))
return 0;
spin_lock(&mapping->host->i_lock);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index a2bf37ee276d..0fc69efa4f1f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -45,6 +45,22 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
return dst_vma;
}
+/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
+static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ struct inode *inode;
+ pgoff_t offset, max_off;
+
+ if (!dst_vma->vm_file)
+ return false;
+
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ return offset >= max_off;
+}
+
/*
* Install PTEs, to map dst_addr (within dst_vma) to page.
*
@@ -64,8 +80,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
bool page_in_cache = page_mapping(page);
spinlock_t *ptl;
struct folio *folio;
- struct inode *inode;
- pgoff_t offset, max_off;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
_dst_pte = pte_mkdirty(_dst_pte);
@@ -81,14 +95,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
if (!dst_pte)
goto out;
- if (vma_is_shmem(dst_vma)) {
- /* serialize against truncate with the page table lock */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (mfill_file_over_size(dst_vma, dst_addr)) {
ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_unlock;
+ goto out_unlock;
}
ret = -EEXIST;
@@ -211,8 +220,6 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
int ret;
- pgoff_t offset, max_off;
- struct inode *inode;
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
@@ -220,14 +227,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
if (!dst_pte)
goto out;
- if (dst_vma->vm_file) {
- /* the shmem MAP_PRIVATE case requires checking the i_size */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (mfill_file_over_size(dst_vma, dst_addr)) {
ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_unlock;
+ goto out_unlock;
}
ret = -EEXIST;
if (!pte_none(ptep_get(dst_pte)))
@@ -286,6 +288,44 @@ out_release:
goto out;
}
+/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
+static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ uffd_flags_t flags)
+{
+ int ret;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+
+ _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
+ ret = -EAGAIN;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
+
+ if (mfill_file_over_size(dst_vma, dst_addr)) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = -EEXIST;
+ /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+out:
+ return ret;
+}
+
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
@@ -481,6 +521,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
return mfill_atomic_pte_continue(dst_pmd, dst_vma,
dst_addr, flags);
+ } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
+ return mfill_atomic_pte_poison(dst_pmd, dst_vma,
+ dst_addr, flags);
}
/*
@@ -702,6 +745,14 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
+ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags)
+{
+ return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
+}
+
long uffd_wp_range(struct vm_area_struct *dst_vma,
unsigned long start, unsigned long len, bool enable_wp)
{
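mfill_file_over_size() centralizes the EOF check that the copy, zeropage and poison paths previously open-coded: the target page index is compared against the file size rounded up to whole pages. The arithmetic with concrete numbers (standalone sketch, values assumed):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long i_size = 10000;           /* assumed i_size_read() result */
        unsigned long max_off = DIV_ROUND_UP(i_size, PAGE_SIZE);        /* = 3 */
        unsigned long offset = 3;               /* page index of dst_addr in the file */

        /* offset >= max_off means the fault target lies past EOF -> -EFAULT */
        printf("%s\n", offset >= max_off ? "over size" : "within file");
        return 0;
}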
diff --git a/mm/util.c b/mm/util.c
index dd12b9531ac4..5e9305189c3f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -734,12 +734,6 @@ void *vcalloc(size_t n, size_t size)
}
EXPORT_SYMBOL(vcalloc);
-/* Neutral page->mapping pointer to address_space or anon_vma or other */
-void *page_rmapping(struct page *page)
-{
- return folio_raw_mapping(page_folio(page));
-}
-
struct anon_vma *folio_anon_vma(struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2fe4a11d63f4..c7c149cb8d66 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2064,7 +2064,7 @@ retry:
* (refcount == 1) it can be freed. Otherwise, leave
* the folio on the LRU so it is swappable.
*/
- if (folio_has_private(folio)) {
+ if (folio_needs_release(folio)) {
if (!filemap_release_folio(folio, sc->gfp_mask))
goto activate_locked;
if (!mapping && folio_ref_count(folio) == 1) {
@@ -2729,9 +2729,9 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
if (unlikely(buffer_heads_over_limit)) {
- if (folio_test_private(folio) && folio_trylock(folio)) {
- if (folio_test_private(folio))
- filemap_release_folio(folio, 0);
+ if (folio_needs_release(folio) &&
+ folio_trylock(folio)) {
+ filemap_release_folio(folio, 0);
folio_unlock(folio);
}
}
@@ -4440,7 +4440,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
int prev, next;
int type, zone;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
-
+restart:
spin_lock_irq(&lruvec->lru_lock);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -4451,11 +4451,12 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
- while (!inc_min_seq(lruvec, type, can_swap)) {
- spin_unlock_irq(&lruvec->lru_lock);
- cond_resched();
- spin_lock_irq(&lruvec->lru_lock);
- }
+ if (inc_min_seq(lruvec, type, can_swap))
+ continue;
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ goto restart;
}
/*
@@ -4656,6 +4657,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
struct folio *folio = pfn_folio(pvmw->pfn);
+ bool can_swap = !folio_is_file_lru(folio);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
@@ -4704,7 +4706,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (!pte_young(ptent))
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
if (!folio)
continue;
@@ -4891,7 +4893,8 @@ static int lru_gen_memcg_seg(struct lruvec *lruvec)
* the eviction
******************************************************************************/
-static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
+static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
+ int tier_idx)
{
bool success;
int gen = folio_lru_gen(folio);
@@ -4941,6 +4944,13 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
return true;
}
+ /* ineligible */
+ if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
+ gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ return true;
+ }
+
/* waiting for writeback */
if (folio_test_locked(folio) || folio_test_writeback(folio) ||
(type == LRU_GEN_FILE && folio_test_dirty(folio))) {
@@ -4989,7 +4999,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int type, int tier, struct list_head *list)
{
- int gen, zone;
+ int i;
+ int gen;
enum vm_event_item item;
int sorted = 0;
int scanned = 0;
@@ -5005,9 +5016,10 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
gen = lru_gen_from_seq(lrugen->min_seq[type]);
- for (zone = sc->reclaim_idx; zone >= 0; zone--) {
+ for (i = MAX_NR_ZONES; i > 0; i--) {
LIST_HEAD(moved);
int skipped = 0;
+ int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
struct list_head *head = &lrugen->folios[gen][type][zone];
while (!list_empty(head)) {
@@ -5021,7 +5033,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
scanned += delta;
- if (sort_folio(lruvec, folio, tier))
+ if (sort_folio(lruvec, folio, sc, tier))
sorted += delta;
else if (isolate_folio(lruvec, folio, sc)) {
list_add(&folio->lru, list);
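The scan_folios() hunk above changes the zone walk from reclaim_idx..0 into a wrap-around pass over every zone, and folios from ineligible zones (zone > sc->reclaim_idx, or CMA when it must be skipped) are now handled in sort_folio(), which bumps their generation and moves them to the list tail instead of leaving them unscanned. A small standalone sketch of the new visit order, with MAX_NR_ZONES hard-coded purely for illustration:

#include <stdio.h>

#define MAX_NR_ZONES 4	/* illustration only; the real value is per-config */

int main(void)
{
	int reclaim_idx = 1;	/* highest zone eligible for this reclaim */
	int i;

	for (i = MAX_NR_ZONES; i > 0; i--) {
		int zone = (reclaim_idx + i) % MAX_NR_ZONES;

		printf("%d ", zone);	/* prints: 1 0 3 2 */
	}
	printf("\n");
	return 0;
}

The eligible zones are still visited first (reclaim_idx downwards), then the walk wraps around to the remaining, ineligible zones.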
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b731d57996c5..00e81e99c6ee 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -26,7 +26,6 @@
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
-#include <linux/page_ext.h>
#include <linux/page_owner.h>
#include <linux/sched/isolation.h>
diff --git a/mm/z3fold.c b/mm/z3fold.c
index e84de91ecccb..7c76b396b74c 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -133,8 +133,6 @@ struct z3fold_header {
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
* @c_handle: cache for z3fold_buddy_slots allocation
- * @zpool: zpool driver
- * @zpool_ops: zpool operations structure with an evict callback
* @compact_wq: workqueue for page layout background optimization
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
@@ -480,6 +478,16 @@ static void release_z3fold_page_locked_list(struct kref *ref)
__release_z3fold_page(zhdr, true);
}
+static inline int put_z3fold_locked(struct z3fold_header *zhdr)
+{
+ return kref_put(&zhdr->refcount, release_z3fold_page_locked);
+}
+
+static inline int put_z3fold_locked_list(struct z3fold_header *zhdr)
+{
+ return kref_put(&zhdr->refcount, release_z3fold_page_locked_list);
+}
+
static void free_pages_work(struct work_struct *w)
{
struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
@@ -666,7 +674,7 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
return new_zhdr;
out_fail:
- if (new_zhdr && !kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) {
+ if (new_zhdr && !put_z3fold_locked(new_zhdr)) {
add_to_unbuddied(pool, new_zhdr);
z3fold_page_unlock(new_zhdr);
}
@@ -741,7 +749,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
list_del_init(&zhdr->buddy);
spin_unlock(&pool->lock);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
+ if (put_z3fold_locked(zhdr))
return;
if (test_bit(PAGE_STALE, &page->private) ||
@@ -752,7 +760,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
if (!zhdr->foreign_handles && buddy_single(zhdr) &&
zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
- if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ if (!put_z3fold_locked(zhdr)) {
clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
@@ -878,7 +886,7 @@ lookup:
return zhdr;
out_fail:
- if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ if (!put_z3fold_locked(zhdr)) {
add_to_unbuddied(pool, zhdr);
z3fold_page_unlock(zhdr);
}
@@ -1012,8 +1020,7 @@ retry:
if (zhdr) {
bud = get_free_buddy(zhdr, chunks);
if (bud == HEADLESS) {
- if (!kref_put(&zhdr->refcount,
- release_z3fold_page_locked))
+ if (!put_z3fold_locked(zhdr))
z3fold_page_unlock(zhdr);
pr_err("No free chunks in unbuddied\n");
WARN_ON(1);
@@ -1129,7 +1136,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
if (!page_claimed)
free_handle(handle, zhdr);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list))
+ if (put_z3fold_locked_list(zhdr))
return;
if (page_claimed) {
/* the page has not been claimed by us */
@@ -1346,7 +1353,7 @@ static void z3fold_page_putback(struct page *page)
if (!list_empty(&zhdr->buddy))
list_del_init(&zhdr->buddy);
INIT_LIST_HEAD(&page->lru);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
+ if (put_z3fold_locked(zhdr))
return;
if (list_empty(&zhdr->buddy))
add_to_unbuddied(pool, zhdr);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 32916d28d9d9..b58f957429f0 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -795,8 +795,8 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
- int tag)
+static inline bool obj_allocated(struct page *page, void *obj,
+ unsigned long *phandle)
{
unsigned long handle;
struct zspage *zspage = get_zspage(page);
@@ -807,7 +807,7 @@ static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
} else
handle = *(unsigned long *)obj;
- if (!(handle & tag))
+ if (!(handle & OBJ_ALLOCATED_TAG))
return false;
/* Clear all tags before returning the handle */
@@ -815,11 +815,6 @@ static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
return true;
}
-static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
-{
- return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
-}
-
static void reset_page(struct page *page)
{
__ClearPageMovable(page);
@@ -1147,6 +1142,11 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage)
return get_zspage_inuse(zspage) == class->objs_per_zspage;
}
+static bool zspage_empty(struct zspage *zspage)
+{
+ return get_zspage_inuse(zspage) == 0;
+}
+
/**
* zs_lookup_class_index() - Returns index of the zsmalloc &size_class
* that hold objects of the provided size.
@@ -1546,11 +1546,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
}
/*
- * Find object with a certain tag in zspage from index object and
+ * Find alloced object in zspage from index object and
* return handle.
*/
-static unsigned long find_tagged_obj(struct size_class *class,
- struct page *page, int *obj_idx, int tag)
+static unsigned long find_alloced_obj(struct size_class *class,
+ struct page *page, int *obj_idx)
{
unsigned int offset;
int index = *obj_idx;
@@ -1561,7 +1561,7 @@ static unsigned long find_tagged_obj(struct size_class *class,
offset += class->size * index;
while (offset < PAGE_SIZE) {
- if (obj_tagged(page, addr + offset, &handle, tag))
+ if (obj_allocated(page, addr + offset, &handle))
break;
offset += class->size;
@@ -1575,35 +1575,14 @@ static unsigned long find_tagged_obj(struct size_class *class,
return handle;
}
-/*
- * Find alloced object in zspage from index object and
- * return handle.
- */
-static unsigned long find_alloced_obj(struct size_class *class,
- struct page *page, int *obj_idx)
-{
- return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
-}
-
-struct zs_compact_control {
- /* Source spage for migration which could be a subpage of zspage */
- struct page *s_page;
- /* Destination page for migration which should be a first page
- * of zspage. */
- struct page *d_page;
- /* Starting object index within @s_page which used for live object
- * in the subpage. */
- int obj_idx;
-};
-
-static void migrate_zspage(struct zs_pool *pool, struct size_class *class,
- struct zs_compact_control *cc)
+static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage,
+ struct zspage *dst_zspage)
{
unsigned long used_obj, free_obj;
unsigned long handle;
- struct page *s_page = cc->s_page;
- struct page *d_page = cc->d_page;
- int obj_idx = cc->obj_idx;
+ int obj_idx = 0;
+ struct page *s_page = get_first_page(src_zspage);
+ struct size_class *class = pool->size_class[src_zspage->class];
while (1) {
handle = find_alloced_obj(class, s_page, &obj_idx);
@@ -1615,21 +1594,21 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class,
continue;
}
- /* Stop if there is no more space */
- if (zspage_full(class, get_zspage(d_page)))
- break;
-
used_obj = handle_to_obj(handle);
- free_obj = obj_malloc(pool, get_zspage(d_page), handle);
+ free_obj = obj_malloc(pool, dst_zspage, handle);
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
record_obj(handle, free_obj);
obj_free(class->size, used_obj);
- }
- /* Remember last position in this iteration */
- cc->s_page = s_page;
- cc->obj_idx = obj_idx;
+ /* Stop if there is no more space */
+ if (zspage_full(class, dst_zspage))
+ break;
+
+ /* Stop if there are no more objects to migrate */
+ if (zspage_empty(src_zspage))
+ break;
+ }
}
static struct zspage *isolate_src_zspage(struct size_class *class)
@@ -2008,7 +1987,6 @@ static unsigned long zs_can_compact(struct size_class *class)
static unsigned long __zs_compact(struct zs_pool *pool,
struct size_class *class)
{
- struct zs_compact_control cc;
struct zspage *src_zspage = NULL;
struct zspage *dst_zspage = NULL;
unsigned long pages_freed = 0;
@@ -2026,7 +2004,6 @@ static unsigned long __zs_compact(struct zs_pool *pool,
if (!dst_zspage)
break;
migrate_write_lock(dst_zspage);
- cc.d_page = get_first_page(dst_zspage);
}
src_zspage = isolate_src_zspage(class);
@@ -2035,9 +2012,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
migrate_write_lock_nested(src_zspage);
- cc.obj_idx = 0;
- cc.s_page = get_first_page(src_zspage);
- migrate_zspage(pool, class, &cc);
+ migrate_zspage(pool, src_zspage, dst_zspage);
fg = putback_zspage(class, src_zspage);
migrate_write_unlock(src_zspage);
diff --git a/mm/zswap.c b/mm/zswap.c
index 62195f72bf56..7300b98d4a03 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -2,7 +2,7 @@
/*
* zswap.c - zswap driver file
*
- * zswap is a backend for frontswap that takes pages that are in the process
+ * zswap is a cache that takes pages that are in the process
* of being swapped out and attempts to compress and store them in a
* RAM-based memory pool. This can result in a significant I/O reduction on
* the swap device and, in the case where decompressing from RAM is faster
@@ -20,7 +20,6 @@
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
-#include <linux/frontswap.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
@@ -28,7 +27,7 @@
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
-
+#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
@@ -142,6 +141,9 @@ static bool zswap_exclusive_loads_enabled = IS_ENABLED(
CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
+/* Number of zpools in zswap_pool (empirically determined for scalability) */
+#define ZSWAP_NR_ZPOOLS 32
+
/*********************************
* data structures
**********************************/
@@ -161,7 +163,7 @@ struct crypto_acomp_ctx {
* needs to be verified that it's still valid in the tree.
*/
struct zswap_pool {
- struct zpool *zpool;
+ struct zpool *zpools[ZSWAP_NR_ZPOOLS];
struct crypto_acomp_ctx __percpu *acomp_ctx;
struct kref kref;
struct list_head list;
@@ -180,7 +182,7 @@ struct zswap_pool {
* page within zswap.
*
* rbnode - links the entry into red-black tree for the appropriate swap type
- * offset - the swap offset for the entry. Index into the red-black tree.
+ * swpentry - associated swap entry; its offset indexes into the red-black tree
 * refcount - the number of outstanding references to the entry. This is needed
 * to protect against premature freeing of the entry by concurrent
 * calls to load, invalidate, and writeback. The lock
@@ -193,6 +195,7 @@ struct zswap_pool {
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
* value - value of the same-value filled pages which have same content
+ * objcg - the obj_cgroup that the compressed memory is charged to
* lru - handle to the pool's lru used to evict pages.
*/
struct zswap_entry {
@@ -248,7 +251,7 @@ static bool zswap_has_pool;
#define zswap_pool_debug(msg, p) \
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
- zpool_get_type((p)->zpool))
+ zpool_get_type((p)->zpools[0]))
static int zswap_writeback_entry(struct zswap_entry *entry,
struct zswap_tree *tree);
@@ -272,11 +275,13 @@ static void zswap_update_total_size(void)
{
struct zswap_pool *pool;
u64 total = 0;
+ int i;
rcu_read_lock();
list_for_each_entry_rcu(pool, &zswap_pools, list)
- total += zpool_get_total_size(pool->zpool);
+ for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+ total += zpool_get_total_size(pool->zpools[i]);
rcu_read_unlock();
@@ -365,6 +370,16 @@ static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
return false;
}
+static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
+{
+ int i = 0;
+
+ if (ZSWAP_NR_ZPOOLS > 1)
+ i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
+
+ return entry->pool->zpools[i];
+}
+
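With ZSWAP_NR_ZPOOLS set to 32, each zswap_pool is now backed by an array of zpools, and zswap_find_zpool() hashes the entry pointer so that store, load and free of the same entry always resolve to the same zpool while unrelated entries spread across all of them, reducing contention on the underlying allocator's locks. A userspace-style sketch of the same idea (the hash and pool type here are stand-ins, not the kernel's hash_ptr()):

#include <stdint.h>

#define NR_SUBPOOLS 32			/* power of two, like ZSWAP_NR_ZPOOLS */

struct subpool;				/* opaque, analogous to struct zpool */

static struct subpool *subpools[NR_SUBPOOLS];

/* Deterministically map an entry to one sub-pool by hashing its address. */
static struct subpool *pick_subpool(const void *entry)
{
	uintptr_t h = (uintptr_t)entry;

	h ^= h >> 16;			/* fold in high bits */
	h *= UINT32_C(0x9E3779B1);	/* Fibonacci hashing constant */
	return subpools[(h >> 16) & (NR_SUBPOOLS - 1)];
}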
/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
* freeing the entry itself, and decrementing the number of stored pages.
@@ -381,7 +396,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
spin_lock(&entry->pool->lru_lock);
list_del(&entry->lru);
spin_unlock(&entry->pool->lru_lock);
- zpool_free(entry->pool->zpool, entry->handle);
+ zpool_free(zswap_find_zpool(entry), entry->handle);
zswap_pool_put(entry->pool);
}
zswap_entry_cache_free(entry);
@@ -403,9 +418,9 @@ static void zswap_entry_put(struct zswap_tree *tree,
{
int refcount = --entry->refcount;
- BUG_ON(refcount < 0);
+ WARN_ON_ONCE(refcount < 0);
if (refcount == 0) {
- zswap_rb_erase(&tree->rbroot, entry);
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
zswap_free_entry(entry);
}
}
@@ -590,7 +605,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
list_for_each_entry_rcu(pool, &zswap_pools, list) {
if (strcmp(pool->tfm_name, compressor))
continue;
- if (strcmp(zpool_get_type(pool->zpool), type))
+ /* all zpools share the same type */
+ if (strcmp(zpool_get_type(pool->zpools[0]), type))
continue;
/* if we can't get it, it's about to be destroyed */
if (!zswap_pool_get(pool))
@@ -695,6 +711,7 @@ static void shrink_worker(struct work_struct *w)
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
+ int i;
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
@@ -715,15 +732,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
if (!pool)
return NULL;
- /* unique name for each pool specifically required by zsmalloc */
- snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
+ for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
+ /* unique name for each pool specifically required by zsmalloc */
+ snprintf(name, 38, "zswap%x",
+ atomic_inc_return(&zswap_pools_count));
- pool->zpool = zpool_create_pool(type, name, gfp);
- if (!pool->zpool) {
- pr_err("%s zpool not available\n", type);
- goto error;
+ pool->zpools[i] = zpool_create_pool(type, name, gfp);
+ if (!pool->zpools[i]) {
+ pr_err("%s zpool not available\n", type);
+ goto error;
+ }
}
- pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+ pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
@@ -755,8 +775,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
- if (pool->zpool)
- zpool_destroy_pool(pool->zpool);
+ while (i--)
+ zpool_destroy_pool(pool->zpools[i]);
kfree(pool);
return NULL;
}
@@ -805,11 +825,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
static void zswap_pool_destroy(struct zswap_pool *pool)
{
+ int i;
+
zswap_pool_debug("destroying", pool);
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->acomp_ctx);
- zpool_destroy_pool(pool->zpool);
+ for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+ zpool_destroy_pool(pool->zpools[i]);
kfree(pool);
}
@@ -1017,43 +1040,6 @@ static int zswap_enabled_param_set(const char *val,
/*********************************
* writeback code
**********************************/
-/* return enum for zswap_get_swap_cache_page */
-enum zswap_get_swap_ret {
- ZSWAP_SWAPCACHE_NEW,
- ZSWAP_SWAPCACHE_EXIST,
- ZSWAP_SWAPCACHE_FAIL,
-};
-
-/*
- * zswap_get_swap_cache_page
- *
- * This is an adaption of read_swap_cache_async()
- *
- * This function tries to find a page with the given swap entry
- * in the swapper_space address space (the swap cache). If the page
- * is found, it is returned in retpage. Otherwise, a page is allocated,
- * added to the swap cache, and returned in retpage.
- *
- * If success, the swap cache page is returned in retpage
- * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
- * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
- * the new page is added to swapcache and locked
- * Returns ZSWAP_SWAPCACHE_FAIL on error
- */
-static int zswap_get_swap_cache_page(swp_entry_t entry,
- struct page **retpage)
-{
- bool page_was_allocated;
-
- *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
- NULL, 0, &page_was_allocated);
- if (page_was_allocated)
- return ZSWAP_SWAPCACHE_NEW;
- if (!*retpage)
- return ZSWAP_SWAPCACHE_FAIL;
- return ZSWAP_SWAPCACHE_EXIST;
-}
-
/*
* Attempts to free an entry by adding a page to the swap cache,
* decompressing the entry data into the page, and issuing a
@@ -1061,7 +1047,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
*
* This can be thought of as a "resumed writeback" of the page
* to the swap device. We are basically resuming the same swap
- * writeback path that was intercepted with the frontswap_store()
+ * writeback path that was intercepted with the zswap_store()
* in the first place. After the page has been decompressed into
* the swap cache, the compressed version stored by zswap can be
* freed.
@@ -1073,8 +1059,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct page *page;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- struct zpool *pool = entry->pool->zpool;
-
+ struct zpool *pool = zswap_find_zpool(entry);
+ bool page_was_allocated;
u8 *src, *tmp = NULL;
unsigned int dlen;
int ret;
@@ -1089,65 +1075,66 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
}
/* try to allocate swap cache page */
- switch (zswap_get_swap_cache_page(swpentry, &page)) {
- case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
+ page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
+ &page_was_allocated);
+ if (!page) {
ret = -ENOMEM;
goto fail;
+ }
- case ZSWAP_SWAPCACHE_EXIST:
- /* page is already in the swap cache, ignore for now */
+ /* Found an existing page, we raced with load/swapin */
+ if (!page_was_allocated) {
put_page(page);
ret = -EEXIST;
goto fail;
+ }
- case ZSWAP_SWAPCACHE_NEW: /* page is locked */
- /*
- * Having a local reference to the zswap entry doesn't exclude
- * swapping from invalidating and recycling the swap slot. Once
- * the swapcache is secured against concurrent swapping to and
- * from the slot, recheck that the entry is still current before
- * writing.
- */
- spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
- spin_unlock(&tree->lock);
- delete_from_swap_cache(page_folio(page));
- ret = -ENOMEM;
- goto fail;
- }
+ /*
+ * Page is locked, and the swapcache is now secured against
+ * concurrent swapping to and from the slot. Verify that the
+ * swap entry hasn't been invalidated and recycled behind our
+ * backs (our zswap_entry reference doesn't prevent that), to
+ * avoid overwriting a new swap page with old compressed data.
+ */
+ spin_lock(&tree->lock);
+ if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
+ delete_from_swap_cache(page_folio(page));
+ ret = -ENOMEM;
+ goto fail;
+ }
+ spin_unlock(&tree->lock);
- /* decompress */
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- dlen = PAGE_SIZE;
+ /* decompress */
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+ dlen = PAGE_SIZE;
- src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
- if (!zpool_can_sleep_mapped(pool)) {
- memcpy(tmp, src, entry->length);
- src = tmp;
- zpool_unmap_handle(pool, entry->handle);
- }
+ src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
+ if (!zpool_can_sleep_mapped(pool)) {
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+ zpool_unmap_handle(pool, entry->handle);
+ }
- mutex_lock(acomp_ctx->mutex);
- sg_init_one(&input, src, entry->length);
- sg_init_table(&output, 1);
- sg_set_page(&output, page, PAGE_SIZE, 0);
- acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
- ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
- dlen = acomp_ctx->req->dlen;
- mutex_unlock(acomp_ctx->mutex);
-
- if (!zpool_can_sleep_mapped(pool))
- kfree(tmp);
- else
- zpool_unmap_handle(pool, entry->handle);
+ mutex_lock(acomp_ctx->mutex);
+ sg_init_one(&input, src, entry->length);
+ sg_init_table(&output, 1);
+ sg_set_page(&output, page, PAGE_SIZE, 0);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+ ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
+ mutex_unlock(acomp_ctx->mutex);
- BUG_ON(ret);
- BUG_ON(dlen != PAGE_SIZE);
+ if (!zpool_can_sleep_mapped(pool))
+ kfree(tmp);
+ else
+ zpool_unmap_handle(pool, entry->handle);
- /* page is up to date */
- SetPageUptodate(page);
- }
+ BUG_ON(ret);
+ BUG_ON(dlen != PAGE_SIZE);
+
+ /* page is up to date */
+ SetPageUptodate(page);
/* move it to the tail of the inactive list after end_writeback */
SetPageReclaim(page);
@@ -1158,16 +1145,16 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
zswap_written_back_pages++;
return ret;
+
fail:
if (!zpool_can_sleep_mapped(pool))
kfree(tmp);
/*
- * if we get here due to ZSWAP_SWAPCACHE_EXIST
- * a load may be happening concurrently.
- * it is safe and okay to not free the entry.
- * it is also okay to return !0
- */
+ * If we get here because the page is already in swapcache, a
+ * load may be happening concurrently. It is safe and okay to
+ * not free the entry. It is also okay to return !0.
+ */
return ret;
}
@@ -1201,47 +1188,44 @@ static void zswap_fill_page(void *ptr, unsigned long value)
memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}
-/*********************************
-* frontswap hooks
-**********************************/
-/* attempts to compress and store an single page */
-static int zswap_frontswap_store(unsigned type, pgoff_t offset,
- struct page *page)
+bool zswap_store(struct folio *folio)
{
+ swp_entry_t swp = folio_swap_entry(folio);
+ int type = swp_type(swp);
+ pgoff_t offset = swp_offset(swp);
+ struct page *page = &folio->page;
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry, *dupentry;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
struct obj_cgroup *objcg = NULL;
struct zswap_pool *pool;
- int ret;
+ struct zpool *zpool;
unsigned int dlen = PAGE_SIZE;
unsigned long handle, value;
char *buf;
u8 *src, *dst;
gfp_t gfp;
+ int ret;
- /* THP isn't supported */
- if (PageTransHuge(page)) {
- ret = -EINVAL;
- goto reject;
- }
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+ VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
- if (!zswap_enabled || !tree) {
- ret = -ENODEV;
- goto reject;
- }
+ /* Large folios aren't supported */
+ if (folio_test_large(folio))
+ return false;
+
+ if (!zswap_enabled || !tree)
+ return false;
/*
* XXX: zswap reclaim does not work with cgroups yet. Without a
* cgroup-aware entry LRU, we will push out entries system-wide based on
* local cgroup limits.
*/
- objcg = get_obj_cgroup_from_page(page);
- if (objcg && !obj_cgroup_may_zswap(objcg)) {
- ret = -ENOMEM;
+ objcg = get_obj_cgroup_from_folio(folio);
+ if (objcg && !obj_cgroup_may_zswap(objcg))
goto reject;
- }
/* reclaim space if needed */
if (zswap_is_full()) {
@@ -1251,10 +1235,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}
if (zswap_pool_reached_full) {
- if (!zswap_can_accept()) {
- ret = -ENOMEM;
+ if (!zswap_can_accept())
goto shrink;
- } else
+ else
zswap_pool_reached_full = false;
}
@@ -1262,7 +1245,6 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
entry = zswap_entry_cache_alloc(GFP_KERNEL);
if (!entry) {
zswap_reject_kmemcache_fail++;
- ret = -ENOMEM;
goto reject;
}
@@ -1279,17 +1261,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
kunmap_atomic(src);
}
- if (!zswap_non_same_filled_pages_enabled) {
- ret = -EINVAL;
+ if (!zswap_non_same_filled_pages_enabled)
goto freepage;
- }
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
- if (!entry->pool) {
- ret = -EINVAL;
+ if (!entry->pool)
goto freepage;
- }
/* compress */
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1309,25 +1287,24 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
* synchronous in fact.
* Theoretically, acomp supports users send multiple acomp requests in one
* acomp instance, then get those requests done simultaneously. but in this
- * case, frontswap actually does store and load page by page, there is no
+ * case, zswap actually does store and load page by page; there is no
 * existing method to send the second page before the first page is done
- * in one thread doing frontswap.
+ * in one thread doing zswap.
* but in different threads running on different cpu, we have different
* acomp instance, so multiple threads can do (de)compression in parallel.
*/
ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
- if (ret) {
- ret = -EINVAL;
+ if (ret)
goto put_dstmem;
- }
/* store */
+ zpool = zswap_find_zpool(entry);
gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- if (zpool_malloc_support_movable(entry->pool->zpool))
+ if (zpool_malloc_support_movable(zpool))
gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
- ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
+ ret = zpool_malloc(zpool, dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
@@ -1336,9 +1313,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_reject_alloc_fail++;
goto put_dstmem;
}
- buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
+ buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
memcpy(buf, dst, dlen);
- zpool_unmap_handle(entry->pool->zpool, handle);
+ zpool_unmap_handle(zpool, handle);
mutex_unlock(acomp_ctx->mutex);
/* populate entry */
@@ -1356,15 +1333,10 @@ insert_entry:
/* map */
spin_lock(&tree->lock);
- do {
- ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
- if (ret == -EEXIST) {
- zswap_duplicate_entry++;
- /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, dupentry);
- zswap_entry_put(tree, dupentry);
- }
- } while (ret == -EEXIST);
+ while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
+ zswap_duplicate_entry++;
+ zswap_invalidate_entry(tree, dupentry);
+ }
if (entry->length) {
spin_lock(&entry->pool->lru_lock);
list_add(&entry->lru, &entry->pool->lru);
@@ -1377,7 +1349,7 @@ insert_entry:
zswap_update_total_size();
count_vm_event(ZSWPOUT);
- return 0;
+ return true;
put_dstmem:
mutex_unlock(acomp_ctx->mutex);
@@ -1387,38 +1359,38 @@ freepage:
reject:
if (objcg)
obj_cgroup_put(objcg);
- return ret;
+ return false;
shrink:
pool = zswap_pool_last_get();
if (pool)
queue_work(shrink_wq, &pool->shrink_work);
- ret = -ENOMEM;
goto reject;
}
-/*
- * returns 0 if the page was successfully decompressed
- * return -1 on entry not found or error
-*/
-static int zswap_frontswap_load(unsigned type, pgoff_t offset,
- struct page *page, bool *exclusive)
+bool zswap_load(struct folio *folio)
{
+ swp_entry_t swp = folio_swap_entry(folio);
+ int type = swp_type(swp);
+ pgoff_t offset = swp_offset(swp);
+ struct page *page = &folio->page;
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
u8 *src, *dst, *tmp;
+ struct zpool *zpool;
unsigned int dlen;
- int ret;
+ bool ret;
+
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
/* find */
spin_lock(&tree->lock);
entry = zswap_entry_find_get(&tree->rbroot, offset);
if (!entry) {
- /* entry was written back */
spin_unlock(&tree->lock);
- return -1;
+ return false;
}
spin_unlock(&tree->lock);
@@ -1426,26 +1398,27 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
dst = kmap_atomic(page);
zswap_fill_page(dst, entry->value);
kunmap_atomic(dst);
- ret = 0;
+ ret = true;
goto stats;
}
- if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+ zpool = zswap_find_zpool(entry);
+ if (!zpool_can_sleep_mapped(zpool)) {
tmp = kmalloc(entry->length, GFP_KERNEL);
if (!tmp) {
- ret = -ENOMEM;
+ ret = false;
goto freeentry;
}
}
/* decompress */
dlen = PAGE_SIZE;
- src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
+ src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
- if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+ if (!zpool_can_sleep_mapped(zpool)) {
memcpy(tmp, src, entry->length);
src = tmp;
- zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ zpool_unmap_handle(zpool, entry->handle);
}
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1454,24 +1427,25 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
sg_init_table(&output, 1);
sg_set_page(&output, page, PAGE_SIZE, 0);
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
- ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait))
+ WARN_ON(1);
mutex_unlock(acomp_ctx->mutex);
- if (zpool_can_sleep_mapped(entry->pool->zpool))
- zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ if (zpool_can_sleep_mapped(zpool))
+ zpool_unmap_handle(zpool, entry->handle);
else
kfree(tmp);
- BUG_ON(ret);
+ ret = true;
stats:
count_vm_event(ZSWPIN);
if (entry->objcg)
count_objcg_event(entry->objcg, ZSWPIN);
freeentry:
spin_lock(&tree->lock);
- if (!ret && zswap_exclusive_loads_enabled) {
+ if (ret && zswap_exclusive_loads_enabled) {
zswap_invalidate_entry(tree, entry);
- *exclusive = true;
+ folio_mark_dirty(folio);
} else if (entry->length) {
spin_lock(&entry->pool->lru_lock);
list_move(&entry->lru, &entry->pool->lru);
@@ -1483,8 +1457,7 @@ freeentry:
return ret;
}
-/* frees an entry in zswap */
-static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
+void zswap_invalidate(int type, pgoff_t offset)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
@@ -1501,8 +1474,22 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
spin_unlock(&tree->lock);
}
-/* frees all zswap entries for the given swap type */
-static void zswap_frontswap_invalidate_area(unsigned type)
+void zswap_swapon(int type)
+{
+ struct zswap_tree *tree;
+
+ tree = kzalloc(sizeof(*tree), GFP_KERNEL);
+ if (!tree) {
+ pr_err("alloc failed, zswap disabled for swap type %d\n", type);
+ return;
+ }
+
+ tree->rbroot = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ zswap_trees[type] = tree;
+}
+
+void zswap_swapoff(int type)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry, *n;
@@ -1520,29 +1507,6 @@ static void zswap_frontswap_invalidate_area(unsigned type)
zswap_trees[type] = NULL;
}
-static void zswap_frontswap_init(unsigned type)
-{
- struct zswap_tree *tree;
-
- tree = kzalloc(sizeof(*tree), GFP_KERNEL);
- if (!tree) {
- pr_err("alloc failed, zswap disabled for swap type %d\n", type);
- return;
- }
-
- tree->rbroot = RB_ROOT;
- spin_lock_init(&tree->lock);
- zswap_trees[type] = tree;
-}
-
-static const struct frontswap_ops zswap_frontswap_ops = {
- .store = zswap_frontswap_store,
- .load = zswap_frontswap_load,
- .invalidate_page = zswap_frontswap_invalidate_page,
- .invalidate_area = zswap_frontswap_invalidate_area,
- .init = zswap_frontswap_init
-};
-
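With the frontswap_ops table removed, the swap core is expected to call zswap directly through the new <linux/zswap.h> header included earlier in this file's diff. Reconstructed from the function definitions in this diff, the header plausibly looks like the sketch below; the !CONFIG_ZSWAP stub branch is an assumption, not quoted from the series.

#ifndef _LINUX_ZSWAP_H
#define _LINUX_ZSWAP_H

#include <linux/types.h>
#include <linux/mm_types.h>

#ifdef CONFIG_ZSWAP
bool zswap_store(struct folio *folio);
bool zswap_load(struct folio *folio);
void zswap_invalidate(int type, pgoff_t offset);
void zswap_swapon(int type);
void zswap_swapoff(int type);
#else
static inline bool zswap_store(struct folio *folio) { return false; }
static inline bool zswap_load(struct folio *folio) { return false; }
static inline void zswap_invalidate(int type, pgoff_t offset) {}
static inline void zswap_swapon(int type) {}
static inline void zswap_swapoff(int type) {}
#endif

#endif /* _LINUX_ZSWAP_H */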
/*********************************
* debugfs functions
**********************************/
@@ -1619,7 +1583,7 @@ static int zswap_setup(void)
pool = __zswap_pool_create_fallback();
if (pool) {
pr_info("loaded using pool %s/%s\n", pool->tfm_name,
- zpool_get_type(pool->zpool));
+ zpool_get_type(pool->zpools[0]));
list_add(&pool->list, &zswap_pools);
zswap_has_pool = true;
} else {
@@ -1631,16 +1595,11 @@ static int zswap_setup(void)
if (!shrink_wq)
goto fallback_fail;
- ret = frontswap_register_ops(&zswap_frontswap_ops);
- if (ret)
- goto destroy_wq;
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
zswap_init_state = ZSWAP_INIT_SUCCEED;
return 0;
-destroy_wq:
- destroy_workqueue(shrink_wq);
fallback_fail:
if (pool)
zswap_pool_destroy(pool);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8ed52e1e3c99..b9d49803e77f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1739,7 +1739,7 @@ void tcp_update_recv_tstamps(struct sk_buff *skb,
}
#ifdef CONFIG_MMU
-const struct vm_operations_struct tcp_vm_ops = {
+static const struct vm_operations_struct tcp_vm_ops = {
};
int tcp_mmap(struct file *file, struct socket *sock,
@@ -2042,13 +2042,10 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
unsigned long address,
bool *mmap_locked)
{
- struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);
-#ifdef CONFIG_PER_VMA_LOCK
- vma = lock_vma_under_rcu(mm, address);
-#endif
if (vma) {
- if (!vma_is_tcp(vma)) {
+ if (vma->vm_ops != &tcp_vm_ops) {
vma_end_read(vma);
return NULL;
}
@@ -2058,7 +2055,7 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
mmap_read_lock(mm);
vma = vma_lookup(mm, address);
- if (!vma || !vma_is_tcp(vma)) {
+ if (!vma || vma->vm_ops != &tcp_vm_ops) {
mmap_read_unlock(mm);
return NULL;
}
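Dropping the CONFIG_PER_VMA_LOCK guard around lock_vma_under_rcu() only works if the function is declared unconditionally; elsewhere in this series the !CONFIG_PER_VMA_LOCK case is expected to be a stub that simply returns NULL, so find_tcp_vma() falls through to the mmap_read_lock() path. A sketch of that assumed stub:

/* Assumed !CONFIG_PER_VMA_LOCK fallback (sketch; the real stub lives in
 * include/linux/mm.h elsewhere in this series). */
#ifndef CONFIG_PER_VMA_LOCK
static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
							 unsigned long address)
{
	return NULL;	/* callers then take the mmap read lock instead */
}
#endif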
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index d06e350fedee..ee8575540a8e 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3762,13 +3762,10 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
if (default_noexec &&
(prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) {
int rc = 0;
- if (vma->vm_start >= vma->vm_mm->start_brk &&
- vma->vm_end <= vma->vm_mm->brk) {
+ if (vma_is_initial_heap(vma)) {
rc = avc_has_perm(sid, sid, SECCLASS_PROCESS,
PROCESS__EXECHEAP, NULL);
- } else if (!vma->vm_file &&
- ((vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack) ||
+ } else if (!vma->vm_file && (vma_is_initial_stack(vma) ||
vma_is_stack_for_current(vma))) {
rc = avc_has_perm(sid, sid, SECCLASS_PROCESS,
PROCESS__EXECSTACK, NULL);
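The vma_is_initial_heap() and vma_is_initial_stack() helpers this hunk switches to are defined elsewhere in the series; reconstructed purely from the open-coded checks deleted here, they behave roughly like the sketch below (the exact upstream definitions may compare the ranges slightly differently, e.g. as an overlap test rather than containment).

/* Rough equivalents of the removed open-coded checks (sketch only). */
static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
{
	/* VMA lies within [start_brk, brk): the process heap. */
	return vma->vm_start >= vma->vm_mm->start_brk &&
	       vma->vm_end <= vma->vm_mm->brk;
}

static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
{
	/* VMA contains the initial stack pointer recorded at exec time. */
	return vma->vm_start <= vma->vm_mm->start_stack &&
	       vma->vm_end >= vma->vm_mm->start_stack;
}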
diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h
index 0eef91daf289..a3778aff4fa9 100644
--- a/tools/include/nolibc/stdio.h
+++ b/tools/include/nolibc/stdio.h
@@ -21,6 +21,11 @@
#define EOF (-1)
#endif
+/* Buffering modes used by setvbuf. */
+#define _IOFBF 0 /* Fully buffered. */
+#define _IOLBF 1 /* Line buffered. */
+#define _IONBF 2 /* No buffering. */
+
/* just define FILE as a non-empty type. The value of the pointer gives
* the FD: FILE=~fd for fd>=0 or NULL for fd<0. This way positive FILE
* are immediately identified as abnormal entries (i.e. possible copies
@@ -350,6 +355,25 @@ void perror(const char *msg)
fprintf(stderr, "%s%serrno=%d\n", (msg && *msg) ? msg : "", (msg && *msg) ? ": " : "", errno);
}
+static __attribute__((unused))
+int setvbuf(FILE *stream, char *buf, int mode, size_t size)
+{
+ /*
+ * nolibc does not support buffering so this is a nop. Just check mode
+ * is valid as required by the spec.
+ */
+ switch (mode) {
+ case _IOFBF:
+ case _IOLBF:
+ case _IONBF:
+ break;
+ default:
+ return EOF;
+ }
+
+ return 0;
+}
+
/* make sure to include all global symbols */
#include "nolibc.h"
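setvbuf() is added to nolibc as a mode-validating no-op so that callers, such as the kselftest header change further down which forces line buffering in ksft_print_header(), link and behave the same whether they are built against glibc or nolibc. A minimal usage sketch:

#include <stdio.h>

int main(void)
{
	/* Line-buffer stdout; under nolibc this only validates the mode. */
	if (setvbuf(stdout, NULL, _IOLBF, 0) != 0)
		return 1;	/* EOF (-1) signals an invalid mode */

	printf("TAP version 13\n");
	return 0;
}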
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 75ea2081a317..e5da1cad70ba 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -45,6 +45,13 @@ struct rcu_test_struct2 {
unsigned long last[RCU_RANGE_COUNT];
};
+struct rcu_test_struct3 {
+ struct maple_tree *mt;
+ unsigned long index;
+ unsigned long last;
+ bool stop;
+};
+
struct rcu_reader_struct {
unsigned int id;
int mod;
@@ -34954,6 +34961,70 @@ void run_check_rcu(struct maple_tree *mt, struct rcu_test_struct *vals)
MT_BUG_ON(mt, !vals->seen_entry2);
}
+static void *rcu_slot_store_reader(void *ptr)
+{
+ struct rcu_test_struct3 *test = ptr;
+ MA_STATE(mas, test->mt, test->index, test->index);
+
+ rcu_register_thread();
+
+ rcu_read_lock();
+ while (!test->stop) {
+ mas_walk(&mas);
+ /* The length of growth to both sides must be equal. */
+ RCU_MT_BUG_ON(test, (test->index - mas.index) !=
+ (mas.last - test->last));
+ }
+ rcu_read_unlock();
+
+ rcu_unregister_thread();
+ return NULL;
+}
+
+static noinline void run_check_rcu_slot_store(struct maple_tree *mt)
+{
+ pthread_t readers[20];
+ int range_cnt = 200, i, limit = 10000;
+ unsigned long len = ULONG_MAX / range_cnt, start, end;
+ struct rcu_test_struct3 test = {.stop = false, .mt = mt};
+
+ start = range_cnt / 2 * len;
+ end = start + len - 1;
+ test.index = start;
+ test.last = end;
+
+ for (i = 0; i < range_cnt; i++) {
+ mtree_store_range(mt, i * len, i * len + len - 1,
+ xa_mk_value(i * 100), GFP_KERNEL);
+ }
+
+ mt_set_in_rcu(mt);
+ MT_BUG_ON(mt, !mt_in_rcu(mt));
+
+ for (i = 0; i < ARRAY_SIZE(readers); i++) {
+ if (pthread_create(&readers[i], NULL, rcu_slot_store_reader,
+ &test)) {
+ perror("creating reader thread");
+ exit(1);
+ }
+ }
+
+ usleep(5);
+
+ while (limit--) {
+ /* Step by step, expand the middle-most range towards both sides. */
+ mtree_store_range(mt, --start, ++end, xa_mk_value(100),
+ GFP_KERNEL);
+ }
+
+ test.stop = true;
+
+ while (i--)
+ pthread_join(readers[i], NULL);
+
+ mt_validate(mt);
+}
+
static noinline
void run_check_rcu_slowread(struct maple_tree *mt, struct rcu_test_struct *vals)
{
@@ -35206,6 +35277,10 @@ static noinline void __init check_rcu_threaded(struct maple_tree *mt)
run_check_rcu(mt, &vals);
mtree_destroy(mt);
+ /* Check expanding range in RCU mode */
+ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+ run_check_rcu_slot_store(mt);
+ mtree_destroy(mt);
/* Forward writer for rcu stress */
mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
@@ -35383,7 +35458,9 @@ static noinline void __init check_prealloc(struct maple_tree *mt)
for (i = 0; i <= max; i++)
mtree_test_store_range(mt, i * 10, i * 10 + 5, &i);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ /* Spanning store */
+ mas_set_range(&mas, 470, 500);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
MT_BUG_ON(mt, allocated == 0);
@@ -35392,105 +35469,108 @@ static noinline void __init check_prealloc(struct maple_tree *mt)
allocated = mas_allocated(&mas);
MT_BUG_ON(mt, allocated != 0);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
MT_BUG_ON(mt, allocated == 0);
MT_BUG_ON(mt, allocated != 1 + height * 3);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
mas_destroy(&mas);
allocated = mas_allocated(&mas);
MT_BUG_ON(mt, allocated != 0);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
- MT_BUG_ON(mt, allocated == 0);
MT_BUG_ON(mt, allocated != 1 + height * 3);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
mas_destroy(&mas);
allocated = mas_allocated(&mas);
MT_BUG_ON(mt, allocated != 0);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
- MT_BUG_ON(mt, allocated == 0);
MT_BUG_ON(mt, allocated != 1 + height * 3);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
mas_destroy(&mas);
allocated = mas_allocated(&mas);
MT_BUG_ON(mt, allocated != 0);
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
- MT_BUG_ON(mt, allocated == 0);
MT_BUG_ON(mt, allocated != 1 + height * 3);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
mas_push_node(&mas, mn);
MT_BUG_ON(mt, mas_allocated(&mas) != allocated);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
mas_destroy(&mas);
allocated = mas_allocated(&mas);
MT_BUG_ON(mt, allocated != 0);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
- MT_BUG_ON(mt, allocated == 0);
MT_BUG_ON(mt, allocated != 1 + height * 3);
mas_store_prealloc(&mas, ptr);
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ /* Slot store does not need allocations */
+ mas_set_range(&mas, 6, 9);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
allocated = mas_allocated(&mas);
- height = mas_mt_height(&mas);
- MT_BUG_ON(mt, allocated == 0);
- MT_BUG_ON(mt, allocated != 1 + height * 3);
+ MT_BUG_ON(mt, allocated != 0);
mas_store_prealloc(&mas, ptr);
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+
+ mas_set_range(&mas, 6, 10);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
- MT_BUG_ON(mt, allocated == 0);
- MT_BUG_ON(mt, allocated != 1 + height * 3);
+ MT_BUG_ON(mt, allocated != 1);
mas_store_prealloc(&mas, ptr);
+ MT_BUG_ON(mt, mas_allocated(&mas) != 0);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ /* Split */
+ mas_set_range(&mas, 54, 54);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
- MT_BUG_ON(mt, allocated == 0);
- MT_BUG_ON(mt, allocated != 1 + height * 3);
+ MT_BUG_ON(mt, allocated != 1 + height * 2);
mas_store_prealloc(&mas, ptr);
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
mt_set_non_kernel(1);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0);
+ /* Spanning store */
+ mas_set_range(&mas, 1, 100);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
MT_BUG_ON(mt, allocated != 0);
mas_destroy(&mas);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
+ /* Spanning store */
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
MT_BUG_ON(mt, allocated == 0);
MT_BUG_ON(mt, allocated != 1 + height * 3);
mas_store_prealloc(&mas, ptr);
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
+ mas_set_range(&mas, 0, 200);
mt_set_non_kernel(1);
- MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0);
+ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0);
allocated = mas_allocated(&mas);
height = mas_mt_height(&mas);
MT_BUG_ON(mt, allocated != 0);
diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore
index c4a57e69f749..4d556df4f77b 100644
--- a/tools/testing/selftests/cgroup/.gitignore
+++ b/tools/testing/selftests/cgroup/.gitignore
@@ -5,4 +5,5 @@ test_freezer
test_kmem
test_kill
test_cpu
+test_zswap
wait_inotify
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile
index 3d263747d2ad..27dbdd7bb4bb 100644
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -12,6 +12,7 @@ TEST_GEN_PROGS += test_core
TEST_GEN_PROGS += test_freezer
TEST_GEN_PROGS += test_kill
TEST_GEN_PROGS += test_cpu
+TEST_GEN_PROGS += test_zswap
LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h
@@ -23,3 +24,4 @@ $(OUTPUT)/test_core: cgroup_util.c
$(OUTPUT)/test_freezer: cgroup_util.c
$(OUTPUT)/test_kill: cgroup_util.c
$(OUTPUT)/test_cpu: cgroup_util.c
+$(OUTPUT)/test_zswap: cgroup_util.c
diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
new file mode 100644
index 000000000000..49def87a909b
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_zswap.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <linux/limits.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/sysinfo.h>
+#include <string.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+static int read_int(const char *path, size_t *value)
+{
+ FILE *file;
+ int ret = 0;
+
+ file = fopen(path, "r");
+ if (!file)
+ return -1;
+ if (fscanf(file, "%zu", value) != 1)
+ ret = -1;
+ fclose(file);
+ return ret;
+}
+
+static int set_min_free_kb(size_t value)
+{
+ FILE *file;
+ int ret;
+
+ file = fopen("/proc/sys/vm/min_free_kbytes", "w");
+ if (!file)
+ return -1;
+ ret = fprintf(file, "%zu\n", value);
+ fclose(file);
+ return ret;
+}
+
+static int read_min_free_kb(size_t *value)
+{
+ return read_int("/proc/sys/vm/min_free_kbytes", value);
+}
+
+static int get_zswap_stored_pages(size_t *value)
+{
+ return read_int("/sys/kernel/debug/zswap/stored_pages", value);
+}
+
+static int get_zswap_written_back_pages(size_t *value)
+{
+ return read_int("/sys/kernel/debug/zswap/written_back_pages", value);
+}
+
+static int allocate_bytes(const char *cgroup, void *arg)
+{
+ size_t size = (size_t)arg;
+ char *mem = (char *)malloc(size);
+
+ if (!mem)
+ return -1;
+ for (int i = 0; i < size; i += 4095)
+ mem[i] = 'a';
+ free(mem);
+ return 0;
+}
+
+/*
+ * When trying to store a memcg page in zswap, if the memcg hits its zswap
+ * limit, writeback should not be triggered.
+ *
+ * This was fixed with commit 0bdf0efa180a("zswap: do not shrink if cgroup may
+ * not zswap"). Needs to be revised when a per memcg writeback mechanism is
+ * implemented.
+ */
+static int test_no_invasive_cgroup_shrink(const char *root)
+{
+ size_t written_back_before, written_back_after;
+ int ret = KSFT_FAIL;
+ char *test_group;
+
+ /* Set up */
+ test_group = cg_name(root, "no_shrink_test");
+ if (!test_group)
+ goto out;
+ if (cg_create(test_group))
+ goto out;
+ if (cg_write(test_group, "memory.max", "1M"))
+ goto out;
+ if (cg_write(test_group, "memory.zswap.max", "10K"))
+ goto out;
+ if (get_zswap_written_back_pages(&written_back_before))
+ goto out;
+
+ /* Allocate 10x memory.max to push memory into zswap */
+ if (cg_run(test_group, allocate_bytes, (void *)MB(10)))
+ goto out;
+
+ /* Verify that no writeback happened because of the memcg allocation */
+ if (get_zswap_written_back_pages(&written_back_after))
+ goto out;
+ if (written_back_after == written_back_before)
+ ret = KSFT_PASS;
+out:
+ cg_destroy(test_group);
+ free(test_group);
+ return ret;
+}
+
+struct no_kmem_bypass_child_args {
+ size_t target_alloc_bytes;
+ size_t child_allocated;
+};
+
+static int no_kmem_bypass_child(const char *cgroup, void *arg)
+{
+ struct no_kmem_bypass_child_args *values = arg;
+ void *allocation;
+
+ allocation = malloc(values->target_alloc_bytes);
+ if (!allocation) {
+ values->child_allocated = true;
+ return -1;
+ }
+ for (long i = 0; i < values->target_alloc_bytes; i += 4095)
+ ((char *)allocation)[i] = 'a';
+ values->child_allocated = true;
+ pause();
+ free(allocation);
+ return 0;
+}
+
+/*
+ * When pages owned by a memcg are pushed to zswap by kswapd, they should be
+ * charged to that cgroup. This wasn't the case before commit
+ * cd08d80ecdac("mm: correctly charge compressed memory to its memcg").
+ *
+ * The test first allocates memory in a memcg, then raises min_free_kbytes to
+ * a very high value so that free memory falls below the low watermark, then makes
+ * another allocation to trigger kswapd that should push the memcg-owned pages
+ * to zswap and verifies that the zswap pages are correctly charged.
+ *
+ * To be run on a VM with at most 4G of memory.
+ */
+static int test_no_kmem_bypass(const char *root)
+{
+ size_t min_free_kb_high, min_free_kb_low, min_free_kb_original;
+ struct no_kmem_bypass_child_args *values;
+ size_t trigger_allocation_size;
+ int wait_child_iteration = 0;
+ long stored_pages_threshold;
+ struct sysinfo sys_info;
+ int ret = KSFT_FAIL;
+ int child_status;
+ char *test_group;
+ pid_t child_pid;
+
+ /* Read sys info and compute test values accordingly */
+ if (sysinfo(&sys_info) != 0)
+ return KSFT_FAIL;
+ if (sys_info.totalram > 5000000000)
+ return KSFT_SKIP;
+ values = mmap(0, sizeof(struct no_kmem_bypass_child_args), PROT_READ |
+ PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ if (values == MAP_FAILED)
+ return KSFT_FAIL;
+ if (read_min_free_kb(&min_free_kb_original))
+ return KSFT_FAIL;
+ min_free_kb_high = sys_info.totalram / 2000;
+ min_free_kb_low = sys_info.totalram / 500000;
+ values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) +
+ sys_info.totalram * 5 / 100;
+ stored_pages_threshold = sys_info.totalram / 5 / 4096;
+ trigger_allocation_size = sys_info.totalram / 20;
+
+ /* Set up test memcg */
+ if (cg_write(root, "cgroup.subtree_control", "+memory"))
+ goto out;
+ test_group = cg_name(root, "kmem_bypass_test");
+ if (!test_group)
+ goto out;
+
+ /* Spawn memcg child and wait for it to allocate */
+ set_min_free_kb(min_free_kb_low);
+ if (cg_create(test_group))
+ goto out;
+ values->child_allocated = false;
+ child_pid = cg_run_nowait(test_group, no_kmem_bypass_child, values);
+ if (child_pid < 0)
+ goto out;
+ while (!values->child_allocated && wait_child_iteration++ < 10000)
+ usleep(1000);
+
+ /* Try to wakeup kswapd and let it push child memory to zswap */
+ set_min_free_kb(min_free_kb_high);
+ for (int i = 0; i < 20; i++) {
+ size_t stored_pages;
+ char *trigger_allocation = malloc(trigger_allocation_size);
+
+ if (!trigger_allocation)
+ break;
+ for (int i = 0; i < trigger_allocation_size; i += 4095)
+ trigger_allocation[i] = 'b';
+ usleep(100000);
+ free(trigger_allocation);
+ if (get_zswap_stored_pages(&stored_pages))
+ break;
+ if (stored_pages < 0)
+ break;
+ /* If memory was pushed to zswap, verify it belongs to memcg */
+ if (stored_pages > stored_pages_threshold) {
+ int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped ");
+ int delta = stored_pages * 4096 - zswapped;
+ int result_ok = delta < stored_pages * 4096 / 4;
+
+ ret = result_ok ? KSFT_PASS : KSFT_FAIL;
+ break;
+ }
+ }
+
+ kill(child_pid, SIGTERM);
+ waitpid(child_pid, &child_status, 0);
+out:
+ set_min_free_kb(min_free_kb_original);
+ cg_destroy(test_group);
+ free(test_group);
+ return ret;
+}
+
+#define T(x) { x, #x }
+struct zswap_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_no_kmem_bypass),
+ T(test_no_invasive_cgroup_shrink),
+};
+#undef T
+
+static bool zswap_configured(void)
+{
+ return access("/sys/module/zswap", F_OK) == 0;
+}
+
+int main(int argc, char **argv)
+{
+ char root[PATH_MAX];
+ int i, ret = EXIT_SUCCESS;
+
+ if (cg_find_unified_root(root, sizeof(root)))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ if (!zswap_configured())
+ ksft_exit_skip("zswap isn't configured\n");
+
+ /*
+ * Check that memory controller is available:
+ * memory is listed in cgroup.controllers
+ */
+ if (cg_read_strstr(root, "cgroup.controllers", "memory"))
+ ksft_exit_skip("memory controller isn't available\n");
+
+ if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
+ if (cg_write(root, "cgroup.subtree_control", "+memory"))
+ ksft_exit_skip("Failed to set memory controller\n");
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ ret = EXIT_FAILURE;
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh
index bcd4734ca094..60a9a305aef0 100644
--- a/tools/testing/selftests/damon/sysfs.sh
+++ b/tools/testing/selftests/damon/sysfs.sh
@@ -84,6 +84,7 @@ test_tried_regions()
{
tried_regions_dir=$1
ensure_dir "$tried_regions_dir" "exist"
+ ensure_file "$tried_regions_dir/total_bytes" "exist" "400"
}
test_stats()
@@ -102,9 +103,14 @@ test_filter()
ensure_file "$filter_dir/type" "exist" "600"
ensure_write_succ "$filter_dir/type" "anon" "valid input"
ensure_write_succ "$filter_dir/type" "memcg" "valid input"
+ ensure_write_succ "$filter_dir/type" "addr" "valid input"
+ ensure_write_succ "$filter_dir/type" "target" "valid input"
ensure_write_fail "$filter_dir/type" "foo" "invalid input"
ensure_file "$filter_dir/matching" "exist" "600"
ensure_file "$filter_dir/memcg_path" "exist" "600"
+ ensure_file "$filter_dir/addr_start" "exist" "600"
+ ensure_file "$filter_dir/addr_end" "exist" "600"
+ ensure_file "$filter_dir/damon_target_idx" "exist" "600"
}
test_filters()
diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h
index 829be379545a..529d29a35900 100644
--- a/tools/testing/selftests/kselftest.h
+++ b/tools/testing/selftests/kselftest.h
@@ -113,6 +113,15 @@ static inline int ksft_get_error_cnt(void) { return ksft_cnt.ksft_error; }
static inline void ksft_print_header(void)
{
+ /*
+ * Force line buffering. If stdout is not connected to a terminal, it
+ * will otherwise default to fully buffered, which can cause output
+ * duplication if there is content in the buffer when fork()ing. If
+ * there is a crash, line buffering also means the most recent output
+ * line will be visible.
+ */
+ setvbuf(stdout, NULL, _IOLBF, 0);
+
if (!(getenv("KSFT_TAP_LEVEL")))
printf("TAP version 13\n");
}
diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh
index 1c952d1401d4..261c73cab41b 100644
--- a/tools/testing/selftests/kselftest/runner.sh
+++ b/tools/testing/selftests/kselftest/runner.sh
@@ -105,15 +105,18 @@ run_one()
echo "# Warning: file $TEST is missing!"
echo "not ok $test_num $TEST_HDR_MSG"
else
+ if [ -x /usr/bin/stdbuf ]; then
+ stdbuf="/usr/bin/stdbuf --output=L "
+ fi
eval kselftest_cmd_args="\$${kselftest_cmd_args_ref:-}"
- cmd="./$BASENAME_TEST $kselftest_cmd_args"
+ cmd="$stdbuf ./$BASENAME_TEST $kselftest_cmd_args"
if [ ! -x "$TEST" ]; then
echo "# Warning: file $TEST is not executable"
if [ $(head -n 1 "$TEST" | cut -c -2) = "#!" ]
then
interpreter=$(head -n 1 "$TEST" | cut -c 3-)
- cmd="$interpreter ./$BASENAME_TEST"
+ cmd="$stdbuf $interpreter ./$BASENAME_TEST"
else
echo "not ok $test_num $TEST_HDR_MSG"
return
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index dba0e8ba002f..3df008677239 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -18,6 +18,7 @@
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
+#include <ctype.h>
#include "common.h"
@@ -43,7 +44,6 @@
*/
static size_t mfd_def_size = MFD_DEF_SIZE;
static const char *memfd_str = MEMFD_STR;
-static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *));
static int newpid_thread_fn2(void *arg);
static void join_newpid_thread(pid_t pid);
@@ -96,12 +96,12 @@ static void sysctl_assert_write(const char *val)
int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC);
if (fd < 0) {
- printf("open sysctl failed\n");
+ printf("open sysctl failed: %m\n");
abort();
}
if (write(fd, val, strlen(val)) < 0) {
- printf("write sysctl failed\n");
+ printf("write sysctl %s failed: %m\n", val);
abort();
}
}
@@ -111,7 +111,7 @@ static void sysctl_fail_write(const char *val)
int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC);
if (fd < 0) {
- printf("open sysctl failed\n");
+ printf("open sysctl failed: %m\n");
abort();
}
@@ -122,6 +122,33 @@ static void sysctl_fail_write(const char *val)
}
}
+static void sysctl_assert_equal(const char *val)
+{
+ char *p, buf[128] = {};
+ int fd = open("/proc/sys/vm/memfd_noexec", O_RDONLY | O_CLOEXEC);
+
+ if (fd < 0) {
+ printf("open sysctl failed: %m\n");
+ abort();
+ }
+
+ if (read(fd, buf, sizeof(buf) - 1) < 0) {
+ printf("read sysctl failed: %m\n");
+ abort();
+ }
+
+ /* Truncate at the first whitespace (the value's trailing newline). */
+ p = buf;
+ while (*p && !isspace(*p))
+ p++;
+ *p = '\0';
+
+ if (strcmp(buf, val) != 0) {
+ printf("unexpected sysctl value: expected %s, got %s\n", val, buf);
+ abort();
+ }
+}
+
static int mfd_assert_reopen_fd(int fd_in)
{
int fd;
@@ -736,7 +763,7 @@ static int idle_thread_fn(void *arg)
return 0;
}
-static pid_t spawn_idle_thread(unsigned int flags)
+static pid_t spawn_thread(unsigned int flags, int (*fn)(void *), void *arg)
{
uint8_t *stack;
pid_t pid;
@@ -747,10 +774,7 @@ static pid_t spawn_idle_thread(unsigned int flags)
abort();
}
- pid = clone(idle_thread_fn,
- stack + STACK_SIZE,
- SIGCHLD | flags,
- NULL);
+ pid = clone(fn, stack + STACK_SIZE, SIGCHLD | flags, arg);
if (pid < 0) {
printf("clone() failed: %m\n");
abort();
@@ -759,6 +783,33 @@ static pid_t spawn_idle_thread(unsigned int flags)
return pid;
}
+static void join_thread(pid_t pid)
+{
+ int wstatus;
+
+ if (waitpid(pid, &wstatus, 0) < 0) {
+ printf("newpid thread: waitpid() failed: %m\n");
+ abort();
+ }
+
+ if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != 0) {
+ printf("newpid thread: exited with non-zero error code %d\n",
+ WEXITSTATUS(wstatus));
+ abort();
+ }
+
+ if (WIFSIGNALED(wstatus)) {
+ printf("newpid thread: killed by signal %d\n",
+ WTERMSIG(wstatus));
+ abort();
+ }
+}
+
+static pid_t spawn_idle_thread(unsigned int flags)
+{
+ return spawn_thread(flags, idle_thread_fn, NULL);
+}
+
static void join_idle_thread(pid_t pid)
{
kill(pid, SIGTERM);
@@ -1111,109 +1162,260 @@ static void test_noexec_seal(void)
close(fd);
}
-static void test_sysctl_child(void)
+static void test_sysctl_sysctl0(void)
{
int fd;
- int pid;
- printf("%s sysctl 0\n", memfd_str);
- sysctl_assert_write("0");
- fd = mfd_assert_new("kern_memfd_sysctl_0",
+ sysctl_assert_equal("0");
+
+ fd = mfd_assert_new("kern_memfd_sysctl_0_dfl",
mfd_def_size,
MFD_CLOEXEC | MFD_ALLOW_SEALING);
-
mfd_assert_mode(fd, 0777);
mfd_assert_has_seals(fd, 0);
mfd_assert_chmod(fd, 0644);
close(fd);
+}
- printf("%s sysctl 1\n", memfd_str);
- sysctl_assert_write("1");
- fd = mfd_assert_new("kern_memfd_sysctl_1",
+static void test_sysctl_set_sysctl0(void)
+{
+ sysctl_assert_write("0");
+ test_sysctl_sysctl0();
+}
+
+static void test_sysctl_sysctl1(void)
+{
+ int fd;
+
+ sysctl_assert_equal("1");
+
+ fd = mfd_assert_new("kern_memfd_sysctl_1_dfl",
mfd_def_size,
MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ mfd_assert_mode(fd, 0666);
+ mfd_assert_has_seals(fd, F_SEAL_EXEC);
+ mfd_fail_chmod(fd, 0777);
+ close(fd);
- printf("%s child ns\n", memfd_str);
- pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2);
- join_newpid_thread(pid);
+ fd = mfd_assert_new("kern_memfd_sysctl_1_exec",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_EXEC | MFD_ALLOW_SEALING);
+ mfd_assert_mode(fd, 0777);
+ mfd_assert_has_seals(fd, 0);
+ mfd_assert_chmod(fd, 0644);
+ close(fd);
+ fd = mfd_assert_new("kern_memfd_sysctl_1_noexec",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_NOEXEC_SEAL | MFD_ALLOW_SEALING);
mfd_assert_mode(fd, 0666);
mfd_assert_has_seals(fd, F_SEAL_EXEC);
mfd_fail_chmod(fd, 0777);
- sysctl_fail_write("0");
close(fd);
-
- printf("%s sysctl 2\n", memfd_str);
- sysctl_assert_write("2");
- mfd_fail_new("kern_memfd_sysctl_2",
- MFD_CLOEXEC | MFD_ALLOW_SEALING);
- sysctl_fail_write("0");
- sysctl_fail_write("1");
}
-static int newpid_thread_fn(void *arg)
+static void test_sysctl_set_sysctl1(void)
{
- test_sysctl_child();
- return 0;
+ sysctl_assert_write("1");
+ test_sysctl_sysctl1();
}
-static void test_sysctl_child2(void)
+static void test_sysctl_sysctl2(void)
{
int fd;
- sysctl_fail_write("0");
- fd = mfd_assert_new("kern_memfd_sysctl_1",
+ sysctl_assert_equal("2");
+
+ fd = mfd_assert_new("kern_memfd_sysctl_2_dfl",
mfd_def_size,
MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ mfd_assert_mode(fd, 0666);
+ mfd_assert_has_seals(fd, F_SEAL_EXEC);
+ mfd_fail_chmod(fd, 0777);
+ close(fd);
+
+ mfd_fail_new("kern_memfd_sysctl_2_exec",
+ MFD_CLOEXEC | MFD_EXEC | MFD_ALLOW_SEALING);
+ fd = mfd_assert_new("kern_memfd_sysctl_2_noexec",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_NOEXEC_SEAL | MFD_ALLOW_SEALING);
mfd_assert_mode(fd, 0666);
mfd_assert_has_seals(fd, F_SEAL_EXEC);
mfd_fail_chmod(fd, 0777);
close(fd);
}
-static int newpid_thread_fn2(void *arg)
+static void test_sysctl_set_sysctl2(void)
+{
+ sysctl_assert_write("2");
+ test_sysctl_sysctl2();
+}
+
+static int sysctl_simple_child(void *arg)
+{
+ printf("%s sysctl 0\n", memfd_str);
+ test_sysctl_set_sysctl0();
+
+ printf("%s sysctl 1\n", memfd_str);
+ test_sysctl_set_sysctl1();
+
+ printf("%s sysctl 0\n", memfd_str);
+ test_sysctl_set_sysctl0();
+
+ printf("%s sysctl 2\n", memfd_str);
+ test_sysctl_set_sysctl2();
+
+ printf("%s sysctl 1\n", memfd_str);
+ test_sysctl_set_sysctl1();
+
+ printf("%s sysctl 0\n", memfd_str);
+ test_sysctl_set_sysctl0();
+
+ return 0;
+}
+
+/*
+ * Test sysctl
+ * A very basic test to make sure the core sysctl semantics work.
+ */
+static void test_sysctl_simple(void)
+{
+ int pid = spawn_thread(CLONE_NEWPID, sysctl_simple_child, NULL);
+
+ join_thread(pid);
+}
+
+static int sysctl_nested(void *arg)
{
- test_sysctl_child2();
+ void (*fn)(void) = arg;
+
+ fn();
return 0;
}
-static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *))
+
+static int sysctl_nested_wait(void *arg)
{
- uint8_t *stack;
- pid_t pid;
+ /* Wait for a SIGCONT. */
+ kill(getpid(), SIGSTOP);
+ return sysctl_nested(arg);
+}
- stack = malloc(STACK_SIZE);
- if (!stack) {
- printf("malloc(STACK_SIZE) failed: %m\n");
- abort();
- }
+static void test_sysctl_sysctl1_failset(void)
+{
+ sysctl_fail_write("0");
+ test_sysctl_sysctl1();
+}
- pid = clone(fn,
- stack + STACK_SIZE,
- SIGCHLD | flags,
- NULL);
- if (pid < 0) {
- printf("clone() failed: %m\n");
- abort();
- }
+static void test_sysctl_sysctl2_failset(void)
+{
+ sysctl_fail_write("1");
+ test_sysctl_sysctl2();
- return pid;
+ sysctl_fail_write("0");
+ test_sysctl_sysctl2();
}
-static void join_newpid_thread(pid_t pid)
+static int sysctl_nested_child(void *arg)
{
- waitpid(pid, NULL, 0);
+ int pid;
+
+ printf("%s nested sysctl 0\n", memfd_str);
+ sysctl_assert_write("0");
+ /* A further nested pidns works the same. */
+ pid = spawn_thread(CLONE_NEWPID, sysctl_simple_child, NULL);
+ join_thread(pid);
+
+ printf("%s nested sysctl 1\n", memfd_str);
+ sysctl_assert_write("1");
+ /* Child inherits our setting. */
+ pid = spawn_thread(CLONE_NEWPID, sysctl_nested, test_sysctl_sysctl1);
+ join_thread(pid);
+ /* Child cannot write a lower (more permissive) value. */
+ pid = spawn_thread(CLONE_NEWPID, sysctl_nested,
+ test_sysctl_sysctl1_failset);
+ join_thread(pid);
+ /* Child can write a higher (stricter) value. */
+ pid = spawn_thread(CLONE_NEWPID, sysctl_nested,
+ test_sysctl_set_sysctl2);
+ join_thread(pid);
+ /* The child's stricter setting has no effect on our own. */
+ test_sysctl_sysctl1();
+
+ printf("%s nested sysctl 2\n", memfd_str);
+ sysctl_assert_write("2");
+ /* Child inherits our setting. */
+ pid = spawn_thread(CLONE_NEWPID, sysctl_nested, test_sysctl_sysctl2);
+ join_thread(pid);
+ /* Child cannot write a lower (more permissive) value. */
+ pid = spawn_thread(CLONE_NEWPID, sysctl_nested,
+ test_sysctl_sysctl2_failset);
+ join_thread(pid);
+
+ /* Verify that the rules are actually inherited after fork. */
+ printf("%s nested sysctl 0 -> 1 after fork\n", memfd_str);
+ sysctl_assert_write("0");
+
+ pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait,
+ test_sysctl_sysctl1_failset);
+ sysctl_assert_write("1");
+ kill(pid, SIGCONT);
+ join_thread(pid);
+
+ printf("%s nested sysctl 0 -> 2 after fork\n", memfd_str);
+ sysctl_assert_write("0");
+
+ pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait,
+ test_sysctl_sysctl2_failset);
+ sysctl_assert_write("2");
+ kill(pid, SIGCONT);
+ join_thread(pid);
+
+ /*
+ * Verify that the current effective setting is saved on fork, meaning
+ * that the parent lowering the sysctl doesn't affect already-forked
+ * children.
+ */
+ printf("%s nested sysctl 2 -> 1 after fork\n", memfd_str);
+ sysctl_assert_write("2");
+ pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait,
+ test_sysctl_sysctl2);
+ sysctl_assert_write("1");
+ kill(pid, SIGCONT);
+ join_thread(pid);
+
+ printf("%s nested sysctl 2 -> 0 after fork\n", memfd_str);
+ sysctl_assert_write("2");
+ pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait,
+ test_sysctl_sysctl2);
+ sysctl_assert_write("0");
+ kill(pid, SIGCONT);
+ join_thread(pid);
+
+ printf("%s nested sysctl 1 -> 0 after fork\n", memfd_str);
+ sysctl_assert_write("1");
+ pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait,
+ test_sysctl_sysctl1);
+ sysctl_assert_write("0");
+ kill(pid, SIGCONT);
+ join_thread(pid);
+
+ return 0;
}
/*
- * Test sysctl
- * A very basic sealing test to see whether setting/retrieving seals works.
+ * Test sysctl with nested pid namespaces
+ * Make sure that the sysctl nesting semantics work correctly.
*/
-static void test_sysctl(void)
+static void test_sysctl_nested(void)
{
- int pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn);
+ int pid = spawn_thread(CLONE_NEWPID, sysctl_nested_child, NULL);
- join_newpid_thread(pid);
+ join_thread(pid);
}
/*
@@ -1399,6 +1601,9 @@ int main(int argc, char **argv)
test_seal_grow();
test_seal_resize();
+ test_sysctl_simple();
+ test_sysctl_nested();
+
test_share_dup("SHARE-DUP", "");
test_share_mmap("SHARE-MMAP", "");
test_share_open("SHARE-OPEN", "");
@@ -1413,8 +1618,6 @@ int main(int argc, char **argv)
test_share_fork("SHARE-FORK", SHARED_FT_STR);
join_idle_thread(pid);
- test_sysctl();
-
printf("memfd: DONE\n");
return 0;
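For reference while reading the reworked tests: the three vm.memfd_noexec values checked above can be probed with a small stand-alone program. This is a sketch only; the probe() helper and its output format are made up here, and MFD_EXEC/MFD_NOEXEC_SEAL need a 6.3+ kernel (the fallback defines simply mirror the uapi values):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#ifndef MFD_NOEXEC_SEAL
#define MFD_NOEXEC_SEAL	0x0008U
#endif
#ifndef MFD_EXEC
#define MFD_EXEC	0x0010U
#endif

static void probe(const char *what, unsigned int flags)
{
	int fd = memfd_create("probe", MFD_CLOEXEC | MFD_ALLOW_SEALING | flags);
	struct stat st;

	if (fd < 0) {
		printf("%-16s refused: %m\n", what);
		return;
	}
	fstat(fd, &st);
	printf("%-16s mode %04o seals %#x\n", what,
	       st.st_mode & 07777, fcntl(fd, F_GET_SEALS));
	close(fd);
}

int main(void)
{
	probe("default", 0);			/* 0777 under sysctl 0; 0666 + F_SEAL_EXEC under 1 and 2 */
	probe("MFD_EXEC", MFD_EXEC);		/* refused when the sysctl is 2 */
	probe("MFD_NOEXEC_SEAL", MFD_NOEXEC_SEAL);	/* always 0666 with F_SEAL_EXEC */
	return 0;
}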
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 7e2a982383c0..cdc9ce4426b9 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -5,6 +5,7 @@ hugepage-mremap
hugepage-shm
hugepage-vmemmap
hugetlb-madvise
+hugetlb-read-hwpoison
khugepaged
map_hugetlb
map_populate
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 66d7c07dc177..6a9fc5693145 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -35,39 +35,43 @@ MAKEFLAGS += --no-builtin-rules
CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
LDLIBS = -lrt -lpthread
-TEST_GEN_PROGS = cow
-TEST_GEN_PROGS += compaction_test
-TEST_GEN_PROGS += gup_longterm
-TEST_GEN_PROGS += gup_test
-TEST_GEN_PROGS += hmm-tests
-TEST_GEN_PROGS += hugetlb-madvise
-TEST_GEN_PROGS += hugepage-mmap
-TEST_GEN_PROGS += hugepage-mremap
-TEST_GEN_PROGS += hugepage-shm
-TEST_GEN_PROGS += hugepage-vmemmap
-TEST_GEN_PROGS += khugepaged
-TEST_GEN_PROGS += madv_populate
-TEST_GEN_PROGS += map_fixed_noreplace
-TEST_GEN_PROGS += map_hugetlb
-TEST_GEN_PROGS += map_populate
-TEST_GEN_PROGS += memfd_secret
-TEST_GEN_PROGS += migration
-TEST_GEN_PROGS += mkdirty
-TEST_GEN_PROGS += mlock-random-test
-TEST_GEN_PROGS += mlock2-tests
-TEST_GEN_PROGS += mrelease_test
-TEST_GEN_PROGS += mremap_dontunmap
-TEST_GEN_PROGS += mremap_test
-TEST_GEN_PROGS += on-fault-limit
-TEST_GEN_PROGS += thuge-gen
-TEST_GEN_PROGS += transhuge-stress
-TEST_GEN_PROGS += uffd-stress
-TEST_GEN_PROGS += uffd-unit-tests
+TEST_GEN_FILES = cow
+TEST_GEN_FILES += compaction_test
+TEST_GEN_FILES += gup_longterm
+TEST_GEN_FILES += gup_test
+TEST_GEN_FILES += hmm-tests
+TEST_GEN_FILES += hugetlb-madvise
+TEST_GEN_FILES += hugetlb-read-hwpoison
+TEST_GEN_FILES += hugepage-mmap
+TEST_GEN_FILES += hugepage-mremap
+TEST_GEN_FILES += hugepage-shm
+TEST_GEN_FILES += hugepage-vmemmap
+TEST_GEN_FILES += khugepaged
+TEST_GEN_FILES += madv_populate
+TEST_GEN_FILES += map_fixed_noreplace
+TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += map_populate
+TEST_GEN_FILES += memfd_secret
+TEST_GEN_FILES += migration
+TEST_GEN_FILES += mkdirty
+TEST_GEN_FILES += mlock-random-test
+TEST_GEN_FILES += mlock2-tests
+TEST_GEN_FILES += mrelease_test
+TEST_GEN_FILES += mremap_dontunmap
+TEST_GEN_FILES += mremap_test
+TEST_GEN_FILES += on-fault-limit
+TEST_GEN_FILES += thuge-gen
+TEST_GEN_FILES += transhuge-stress
+TEST_GEN_FILES += uffd-stress
+TEST_GEN_FILES += uffd-unit-tests
+TEST_GEN_FILES += split_huge_page_test
+TEST_GEN_FILES += ksm_tests
+TEST_GEN_FILES += ksm_functional_tests
+TEST_GEN_FILES += mdwe_test
+
+ifneq ($(ARCH),arm64)
TEST_GEN_PROGS += soft-dirty
-TEST_GEN_PROGS += split_huge_page_test
-TEST_GEN_PROGS += ksm_tests
-TEST_GEN_PROGS += ksm_functional_tests
-TEST_GEN_PROGS += mdwe_test
+endif
ifeq ($(ARCH),x86_64)
CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
@@ -83,24 +87,24 @@ CFLAGS += -no-pie
endif
ifeq ($(CAN_BUILD_I386),1)
-TEST_GEN_PROGS += $(BINARIES_32)
+TEST_GEN_FILES += $(BINARIES_32)
endif
ifeq ($(CAN_BUILD_X86_64),1)
-TEST_GEN_PROGS += $(BINARIES_64)
+TEST_GEN_FILES += $(BINARIES_64)
endif
else
ifneq (,$(findstring $(ARCH),ppc64))
-TEST_GEN_PROGS += protection_keys
+TEST_GEN_FILES += protection_keys
endif
endif
ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64))
-TEST_GEN_PROGS += va_high_addr_switch
-TEST_GEN_PROGS += virtual_address_range
-TEST_GEN_PROGS += write_to_hugetlbfs
+TEST_GEN_FILES += va_high_addr_switch
+TEST_GEN_FILES += virtual_address_range
+TEST_GEN_FILES += write_to_hugetlbfs
endif
TEST_PROGS := run_vmtests.sh
@@ -112,6 +116,7 @@ TEST_FILES += va_high_addr_switch.sh
include ../lib.mk
$(TEST_GEN_PROGS): vm_util.c
+$(TEST_GEN_FILES): vm_util.c
$(OUTPUT)/uffd-stress: uffd-common.c
$(OUTPUT)/uffd-unit-tests: uffd-common.c
diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c
new file mode 100644
index 000000000000..ba6cc6f9cabc
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <linux/magic.h>
+#include <sys/mman.h>
+#include <sys/statfs.h>
+#include <errno.h>
+#include <stdbool.h>
+
+#include "../kselftest.h"
+
+#define PREFIX " ... "
+#define ERROR_PREFIX " !!! "
+
+#define MAX_WRITE_READ_CHUNK_SIZE (getpagesize() * 16)
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+enum test_status {
+ TEST_PASSED = 0,
+ TEST_FAILED = 1,
+ TEST_SKIPPED = 2,
+};
+
+static char *status_to_str(enum test_status status)
+{
+ switch (status) {
+ case TEST_PASSED:
+ return "TEST_PASSED";
+ case TEST_FAILED:
+ return "TEST_FAILED";
+ case TEST_SKIPPED:
+ return "TEST_SKIPPED";
+ default:
+ return "TEST_???";
+ }
+}
+
+static int setup_filemap(char *filemap, size_t len, size_t wr_chunk_size)
+{
+ char iter = 0;
+
+ for (size_t offset = 0; offset < len;
+ offset += wr_chunk_size) {
+ iter++;
+ memset(filemap + offset, iter, wr_chunk_size);
+ }
+
+ return 0;
+}
+
+static bool verify_chunk(char *buf, size_t len, char val)
+{
+ size_t i;
+
+ for (i = 0; i < len; ++i) {
+ if (buf[i] != val) {
+ printf(PREFIX ERROR_PREFIX "check fail: buf[%lu] = %u != %u\n",
+ i, buf[i], val);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size,
+ off_t offset, size_t expected)
+{
+ char buf[MAX_WRITE_READ_CHUNK_SIZE];
+ ssize_t ret_count = 0;
+ ssize_t total_ret_count = 0;
+ char val = offset / wr_chunk_size + offset % wr_chunk_size;
+
+ printf(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset);
+ printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n",
+ expected);
+ if (lseek(fd, offset, SEEK_SET) < 0) {
+ perror(PREFIX ERROR_PREFIX "seek failed");
+ return false;
+ }
+
+ while (offset + total_ret_count < len) {
+ ret_count = read(fd, buf, wr_chunk_size);
+ if (ret_count == 0) {
+ printf(PREFIX PREFIX "read reach end of the file\n");
+ break;
+ } else if (ret_count < 0) {
+ perror(PREFIX ERROR_PREFIX "read failed");
+ break;
+ }
+ ++val;
+ if (!verify_chunk(buf, ret_count, val))
+ return false;
+
+ total_ret_count += ret_count;
+ }
+ printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n",
+ total_ret_count);
+
+ return total_ret_count == expected;
+}
+
+static bool read_hugepage_filemap(int fd, size_t len,
+ size_t wr_chunk_size, size_t expected)
+{
+ char buf[MAX_WRITE_READ_CHUNK_SIZE];
+ ssize_t ret_count = 0;
+ ssize_t total_ret_count = 0;
+ char val = 0;
+
+ printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n",
+ expected);
+ while (total_ret_count < len) {
+ ret_count = read(fd, buf, wr_chunk_size);
+ if (ret_count == 0) {
+ printf(PREFIX PREFIX "read reach end of the file\n");
+ break;
+ } else if (ret_count < 0) {
+ perror(PREFIX ERROR_PREFIX "read failed");
+ break;
+ }
+ ++val;
+ if (!verify_chunk(buf, ret_count, val))
+ return false;
+
+ total_ret_count += ret_count;
+ }
+ printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n",
+ total_ret_count);
+
+ return total_ret_count == expected;
+}
+
+static enum test_status
+test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size)
+{
+ enum test_status status = TEST_SKIPPED;
+ char *filemap = NULL;
+
+ if (ftruncate(fd, len) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate failed");
+ return status;
+ }
+
+ filemap = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+ if (filemap == MAP_FAILED) {
+ perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed");
+ goto done;
+ }
+
+ setup_filemap(filemap, len, wr_chunk_size);
+ status = TEST_FAILED;
+
+ if (read_hugepage_filemap(fd, len, wr_chunk_size, len))
+ status = TEST_PASSED;
+
+ munmap(filemap, len);
+done:
+ if (ftruncate(fd, 0) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed");
+ status = TEST_FAILED;
+ }
+
+ return status;
+}
+
+static enum test_status
+test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size,
+ bool skip_hwpoison_page)
+{
+ enum test_status status = TEST_SKIPPED;
+ char *filemap = NULL;
+ char *hwp_addr = NULL;
+ const unsigned long pagesize = getpagesize();
+
+ if (ftruncate(fd, len) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate failed");
+ return status;
+ }
+
+ filemap = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+ if (filemap == MAP_FAILED) {
+ perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed");
+ goto done;
+ }
+
+ setup_filemap(filemap, len, wr_chunk_size);
+ status = TEST_FAILED;
+
+ /*
+ * Poisoned hugetlb page layout (assume hugepagesize=2MB):
+ * |<---------------------- 1MB ---------------------->|
+ * |<---- healthy page ---->|<---- HWPOISON page ----->|
+ * |<------------------- (1MB - 8KB) ----------------->|
+ */
+ hwp_addr = filemap + len / 2 + pagesize;
+ if (madvise(hwp_addr, pagesize, MADV_HWPOISON) < 0) {
+ perror(PREFIX ERROR_PREFIX "MADV_HWPOISON failed");
+ goto unmap;
+ }
+
+ if (!skip_hwpoison_page) {
+ /*
+ * Userspace should be able to read (1MB + 1 page) from
+ * the beginning of the HWPOISONed hugepage.
+ */
+ if (read_hugepage_filemap(fd, len, wr_chunk_size,
+ len / 2 + pagesize))
+ status = TEST_PASSED;
+ } else {
+ /*
+ * Userspace should be able to read (1MB - 2 pages) from
+ * HWPOISONed hugepage.
+ */
+ if (seek_read_hugepage_filemap(fd, len, wr_chunk_size,
+ len / 2 + MAX(2 * pagesize, wr_chunk_size),
+ len / 2 - MAX(2 * pagesize, wr_chunk_size)))
+ status = TEST_PASSED;
+ }
+
+unmap:
+ munmap(filemap, len);
+done:
+ if (ftruncate(fd, 0) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed");
+ status = TEST_FAILED;
+ }
+
+ return status;
+}
+
+static int create_hugetlbfs_file(struct statfs *file_stat)
+{
+ int fd;
+
+ fd = memfd_create("hugetlb_tmp", MFD_HUGETLB);
+ if (fd < 0) {
+ perror(PREFIX ERROR_PREFIX "could not open hugetlbfs file");
+ return -1;
+ }
+
+ memset(file_stat, 0, sizeof(*file_stat));
+ if (fstatfs(fd, file_stat)) {
+ perror(PREFIX ERROR_PREFIX "fstatfs failed");
+ goto close;
+ }
+ if (file_stat->f_type != HUGETLBFS_MAGIC) {
+ printf(PREFIX ERROR_PREFIX "not hugetlbfs file\n");
+ goto close;
+ }
+
+ return fd;
+close:
+ close(fd);
+ return -1;
+}
+
+int main(void)
+{
+ int fd;
+ struct statfs file_stat;
+ enum test_status status;
+ /* Test read() in different granularity. */
+ size_t wr_chunk_sizes[] = {
+ getpagesize() / 2, getpagesize(),
+ getpagesize() * 2, getpagesize() * 4
+ };
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(wr_chunk_sizes); ++i) {
+ printf("Write/read chunk size=0x%lx\n",
+ wr_chunk_sizes[i]);
+
+ fd = create_hugetlbfs_file(&file_stat);
+ if (fd < 0)
+ goto create_failure;
+ printf(PREFIX "HugeTLB read regression test...\n");
+ status = test_hugetlb_read(fd, file_stat.f_bsize,
+ wr_chunk_sizes[i]);
+ printf(PREFIX "HugeTLB read regression test...%s\n",
+ status_to_str(status));
+ close(fd);
+ if (status == TEST_FAILED)
+ return -1;
+
+ fd = create_hugetlbfs_file(&file_stat);
+ if (fd < 0)
+ goto create_failure;
+ printf(PREFIX "HugeTLB read HWPOISON test...\n");
+ status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize,
+ wr_chunk_sizes[i], false);
+ printf(PREFIX "HugeTLB read HWPOISON test...%s\n",
+ status_to_str(status));
+ close(fd);
+ if (status == TEST_FAILED)
+ return -1;
+
+ fd = create_hugetlbfs_file(&file_stat);
+ if (fd < 0)
+ goto create_failure;
+ printf(PREFIX "HugeTLB seek then read HWPOISON test...\n");
+ status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize,
+ wr_chunk_sizes[i], true);
+ printf(PREFIX "HugeTLB seek then read HWPOISON test...%s\n",
+ status_to_str(status));
+ close(fd);
+ if (status == TEST_FAILED)
+ return -1;
+ }
+
+ return 0;
+
+create_failure:
+ printf(ERROR_PREFIX "Abort test: failed to create hugetlbfs file\n");
+ return -1;
+}
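Assuming the default 2 MiB huge page size (so len == file_stat.f_bsize == 2 MiB) and 4 KiB base pages, the expectations above work out as follows: the poisoned raw page covers [1 MiB + 4 KiB, 1 MiB + 8 KiB), so the sequential read is expected to return exactly len / 2 + pagesize = 1 MiB + 4 KiB bytes before failing, while the seek variant starts at len / 2 + max(8 KiB, chunk), i.e. just past the poisoned page or at the next chunk boundary, and should get the remaining len / 2 - max(8 KiB, chunk) bytes.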
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index 26853badae70..0de9d33cd565 100644
--- a/tools/testing/selftests/mm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -29,6 +29,8 @@
static int ksm_fd;
static int ksm_full_scans_fd;
+static int proc_self_ksm_stat_fd;
+static int ksm_use_zero_pages_fd;
static int pagemap_fd;
static size_t pagesize;
@@ -59,6 +61,33 @@ static bool range_maps_duplicates(char *addr, unsigned long size)
return false;
}
+static long get_my_ksm_zero_pages(void)
+{
+ char buf[200];
+ char *substr_ksm_zero;
+ size_t value_pos;
+ ssize_t read_size;
+ unsigned long my_ksm_zero_pages;
+
+ if (!proc_self_ksm_stat_fd)
+ return 0;
+
+ read_size = pread(proc_self_ksm_stat_fd, buf, sizeof(buf) - 1, 0);
+ if (read_size < 0)
+ return -errno;
+
+ buf[read_size] = 0;
+
+ substr_ksm_zero = strstr(buf, "ksm_zero_pages");
+ if (!substr_ksm_zero)
+ return 0;
+
+ value_pos = strcspn(substr_ksm_zero, "0123456789");
+ my_ksm_zero_pages = strtol(substr_ksm_zero + value_pos, NULL, 10);
+
+ return my_ksm_zero_pages;
+}
+
static long ksm_get_full_scans(void)
{
char buf[10];
@@ -159,6 +188,70 @@ unmap:
munmap(map, size);
}
+static void test_unmerge_zero_pages(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+ unsigned int offs;
+ unsigned long pages_expected;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ if (proc_self_ksm_stat_fd < 0) {
+ ksft_test_result_skip("open(\"/proc/self/ksm_stat\") failed\n");
+ return;
+ }
+ if (ksm_use_zero_pages_fd < 0) {
+ ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n");
+ return;
+ }
+ if (write(ksm_use_zero_pages_fd, "1", 1) != 1) {
+ ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n");
+ return;
+ }
+
+ /* Let KSM deduplicate zero pages. */
+ map = mmap_and_merge_range(0x00, size, false);
+ if (map == MAP_FAILED)
+ return;
+
+ /* Check if ksm_zero_pages is updated correctly after KSM merging */
+ pages_expected = size / pagesize;
+ if (pages_expected != get_my_ksm_zero_pages()) {
+ ksft_test_result_fail("'ksm_zero_pages' updated after merging\n");
+ goto unmap;
+ }
+
+ /* Try to unmerge half of the region */
+ if (madvise(map, size / 2, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto unmap;
+ }
+
+ /* Check if ksm_zero_pages is updated correctly after unmerging */
+ pages_expected /= 2;
+ if (pages_expected != get_my_ksm_zero_pages()) {
+ ksft_test_result_fail("'ksm_zero_pages' updated after unmerging\n");
+ goto unmap;
+ }
+
+ /* Trigger unmerging of the other half by writing to the pages. */
+ for (offs = size / 2; offs < size; offs += pagesize)
+ *((unsigned int *)&map[offs]) = offs;
+
+ /* Now we should have no zeropages remaining. */
+ if (get_my_ksm_zero_pages()) {
+ ksft_test_result_fail("'ksm_zero_pages' updated after write fault\n");
+ goto unmap;
+ }
+
+ /* Check if ksm zero pages are really unmerged */
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "KSM zero pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
static void test_unmerge_discarded(void)
{
const unsigned int size = 2 * MiB;
@@ -358,7 +451,7 @@ unmap:
int main(int argc, char **argv)
{
- unsigned int tests = 5;
+ unsigned int tests = 6;
int err;
#ifdef __NR_userfaultfd
@@ -379,8 +472,11 @@ int main(int argc, char **argv)
pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
if (pagemap_fd < 0)
ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
+ proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY);
+ ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR);
test_unmerge();
+ test_unmerge_zero_pages();
test_unmerge_discarded();
#ifdef __NR_userfaultfd
test_unmerge_uffd_wp();
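For the 2 MiB region used by test_unmerge_zero_pages() and a 4 KiB base page, the counter checks above expect: 2 MiB / 4 KiB = 512 entries in ksm_zero_pages once the zero-filled range has been merged, 256 after MADV_UNMERGEABLE on the first half, and 0 once write faults have broken the second half back into private pages; the closing range_maps_duplicates() check then confirms no merged pages remain.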
diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c
index 60547245e479..17bcb07f19f3 100644
--- a/tools/testing/selftests/mm/madv_populate.c
+++ b/tools/testing/selftests/mm/madv_populate.c
@@ -264,14 +264,35 @@ static void test_softdirty(void)
munmap(addr, SIZE);
}
+static int system_has_softdirty(void)
+{
+ /*
+ * There is no way to check if the kernel supports soft-dirty, other
+ * than by writing to a page and seeing if the bit was set. But the
+ * tests are intended to check that the bit gets set when it should, so
+ * doing that check would turn a potentially legitimate fail into a
+ * skip. Fortunately, we know for sure that arm64 does not support
+ * soft-dirty. So for now, let's just use the arch as a coarse guide.
+ */
+#if defined(__aarch64__)
+ return 0;
+#else
+ return 1;
+#endif
+}
+
int main(int argc, char **argv)
{
+ int nr_tests = 16;
int err;
pagesize = getpagesize();
+ if (system_has_softdirty())
+ nr_tests += 5;
+
ksft_print_header();
- ksft_set_plan(21);
+ ksft_set_plan(nr_tests);
sense_support();
test_prot_read();
@@ -279,7 +300,8 @@ int main(int argc, char **argv)
test_holes();
test_populate_read();
test_populate_write();
- test_softdirty();
+ if (system_has_softdirty())
+ test_softdirty();
err = ksft_get_fail_cnt();
if (err)
diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c
index 379581567f27..6908569ef406 100644
--- a/tools/testing/selftests/mm/migration.c
+++ b/tools/testing/selftests/mm/migration.c
@@ -10,12 +10,13 @@
#include <numa.h>
#include <numaif.h>
#include <sys/mman.h>
+#include <sys/prctl.h>
#include <sys/types.h>
#include <signal.h>
#include <time.h>
#define TWOMEG (2<<20)
-#define RUNTIME (60)
+#define RUNTIME (20)
#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
@@ -155,10 +156,15 @@ TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME)
memset(ptr, 0xde, TWOMEG);
for (i = 0; i < self->nthreads - 1; i++) {
pid = fork();
- if (!pid)
+ if (!pid) {
+ prctl(PR_SET_PDEATHSIG, SIGHUP);
+ /* Parent may have died before prctl so check now. */
+ if (getppid() == 1)
+ kill(getpid(), SIGHUP);
access_mem(ptr);
- else
+ } else {
self->pids[i] = pid;
+ }
}
ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
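The prctl() guard added above is a general pattern for keeping forked workers from outliving a dead parent; a minimal sketch of the same idea, not taken from the test (the worker() name and the SIGHUP choice are only for illustration, and like the test it assumes orphans are reparented to PID 1):

#include <signal.h>
#include <sys/prctl.h>
#include <sys/types.h>
#include <unistd.h>

static void worker(void)
{
	prctl(PR_SET_PDEATHSIG, SIGHUP);	/* deliver SIGHUP when the parent dies */
	if (getppid() == 1)			/* parent already exited before the prctl()? */
		raise(SIGHUP);
	for (;;)				/* stand-in for the real work loop */
		pause();
}

int main(void)
{
	pid_t pid = fork();

	if (pid == 0)
		worker();	/* never returns; dies with the parent */
	sleep(1);		/* parent exits here, so the worker gets SIGHUP */
	return 0;
}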
diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c
index dca21042b679..d822004a374e 100644
--- a/tools/testing/selftests/mm/mrelease_test.c
+++ b/tools/testing/selftests/mm/mrelease_test.c
@@ -7,6 +7,7 @@
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
+#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
#include <asm-generic/unistd.h>
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 3f26f6e15b2a..3e2bc818d566 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -12,11 +12,14 @@ exitcode=0
usage() {
cat <<EOF
-usage: ${BASH_SOURCE[0]:-$0} [ -h | -t "<categories>"]
+usage: ${BASH_SOURCE[0]:-$0} [ options ]
+
+ -a: run all tests, including extra ones
-t: specify specific categories of tests to run
-h: display this message
-The default behavior is to run all tests.
+The default behavior is to run required tests only. If -a is specified,
+all tests, including the extra ones, are run.
Alternatively, specific groups tests can be run by passing a string
to the -t argument containing one or more of the following categories
@@ -55,14 +58,27 @@ separated by spaces:
test soft dirty page bit semantics
- cow
test copy-on-write semantics
+- thp
+ test transparent huge pages
+- migration
+ invoke move_pages(2) to exercise the migration entry code
+ paths in the kernel
+- mkdirty
+ test handling of code that might set PTE/PMD dirty in
+ read-only VMAs
+- mdwe
+ test prctl(PR_SET_MDWE, ...)
+
example: ./run_vmtests.sh -t "hmm mmap ksm"
EOF
exit 0
}
+RUN_ALL=false
-while getopts "ht:" OPT; do
+while getopts "aht:" OPT; do
case ${OPT} in
+ "a") RUN_ALL=true ;;
"h") usage ;;
"t") VM_SELFTEST_ITEMS=${OPTARG} ;;
esac
@@ -85,6 +101,30 @@ test_selected() {
fi
}
+run_gup_matrix() {
+ # -t: thp=on, -T: thp=off, -H: hugetlb=on
+ local hugetlb_mb=$(( needmem_KB / 1024 ))
+
+ for huge in -t -T "-H -m $hugetlb_mb"; do
+ # -u: gup-fast, -U: gup-basic, -a: pin-fast, -b: pin-basic, -L: pin-longterm
+ for test_cmd in -u -U -a -b -L; do
+ # -w: write=1, -W: write=0
+ for write in -w -W; do
+ # -S: shared
+ for share in -S " "; do
+ # -n: how many pages to fetch together? 512 is special because
+ # it is the default THP size in base pages (2M on x86); 123 just
+ # tests partial gup when hitting a huge page in whatever form
+ for num in "-n 1" "-n 512" "-n 123"; do
+ CATEGORY="gup_test" run_test ./gup_test \
+ $huge $test_cmd $write $share $num
+ done
+ done
+ done
+ done
+ done
+}
+
# get huge pagesize and freepages from /proc/meminfo
while read -r name size unit; do
if [ "$name" = "HugePages_Free:" ]; then
@@ -189,13 +229,16 @@ fi
CATEGORY="mmap" run_test ./map_fixed_noreplace
-# get_user_pages_fast() benchmark
-CATEGORY="gup_test" run_test ./gup_test -u
-# pin_user_pages_fast() benchmark
-CATEGORY="gup_test" run_test ./gup_test -a
+if $RUN_ALL; then
+ run_gup_matrix
+else
+ # get_user_pages_fast() benchmark
+ CATEGORY="gup_test" run_test ./gup_test -u
+ # pin_user_pages_fast() benchmark
+ CATEGORY="gup_test" run_test ./gup_test -a
+fi
# Dump pages 0, 19, and 4096, using pin_user_pages:
CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000
-
CATEGORY="gup_test" run_test ./gup_longterm
CATEGORY="userfaultfd" run_test ./uffd-unit-tests
@@ -262,6 +305,10 @@ CATEGORY="madv_populate" run_test ./madv_populate
CATEGORY="memfd_secret" run_test ./memfd_secret
+# KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100
+CATEGORY="ksm" run_test ./ksm_tests -H -s 100
+# KSM KSM_MERGE_TIME test with size of 100
+CATEGORY="ksm" run_test ./ksm_tests -P -s 100
# KSM MADV_MERGEABLE test with 10 identical pages
CATEGORY="ksm" run_test ./ksm_tests -M -p 10
# KSM unmerge test
@@ -290,11 +337,26 @@ then
CATEGORY="pkey" run_test ./protection_keys_64
fi
-CATEGORY="soft_dirty" run_test ./soft-dirty
+if [ -x ./soft-dirty ]
+then
+ CATEGORY="soft_dirty" run_test ./soft-dirty
+fi
# COW tests
CATEGORY="cow" run_test ./cow
+CATEGORY="thp" run_test ./khugepaged
+
+CATEGORY="thp" run_test ./transhuge-stress -d 20
+
+CATEGORY="thp" run_test ./split_huge_page_test
+
+CATEGORY="migration" run_test ./migration
+
+CATEGORY="mkdirty" run_test ./mkdirty
+
+CATEGORY="mdwe" run_test ./mdwe_test
+
echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}"
exit $exitcode
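When -a is passed, the run_gup_matrix() helper added above expands to 3 huge-page modes × 5 gup/pin variants × 2 write modes × 2 sharing modes × 3 page counts, i.e. 180 separate ./gup_test invocations, versus the two benchmark runs kept for the default (required-only) mode.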
diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings
index 9abfc60e9e6f..a953c96aa16e 100644
--- a/tools/testing/selftests/mm/settings
+++ b/tools/testing/selftests/mm/settings
@@ -1 +1 @@
-timeout=45
+timeout=180
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index 380ab5f0a534..16ed4dfa7359 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -139,7 +139,7 @@ void test_mmap(unsigned long size, unsigned flags)
before, after, before - after, size);
assert(size == getpagesize() || (before - after) == NUM_PAGES);
show(size);
- err = munmap(map, size);
+ err = munmap(map, size * NUM_PAGES);
assert(!err);
}
@@ -222,7 +222,7 @@ int main(void)
test_mmap(ps, MAP_HUGETLB | arg);
}
printf("Testing default huge mmap\n");
- test_mmap(default_hps, SHM_HUGETLB);
+ test_mmap(default_hps, MAP_HUGETLB);
puts("Testing non-huge shmget");
test_shmget(getpagesize(), 0);
diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c
index ba9d37ad3a89..c61fb9350b8c 100644
--- a/tools/testing/selftests/mm/transhuge-stress.c
+++ b/tools/testing/selftests/mm/transhuge-stress.c
@@ -25,13 +25,14 @@ int main(int argc, char **argv)
{
size_t ram, len;
void *ptr, *p;
- struct timespec a, b;
+ struct timespec start, a, b;
int i = 0;
char *name = NULL;
double s;
uint8_t *map;
size_t map_len;
int pagemap_fd;
+ int duration = 0;
ram = sysconf(_SC_PHYS_PAGES);
if (ram > SIZE_MAX / psize() / 4)
@@ -42,9 +43,11 @@ int main(int argc, char **argv)
while (++i < argc) {
if (!strcmp(argv[i], "-h"))
- errx(1, "usage: %s [size in MiB]", argv[0]);
+ errx(1, "usage: %s [-f <filename>] [-d <duration>] [size in MiB]", argv[0]);
else if (!strcmp(argv[i], "-f"))
name = argv[++i];
+ else if (!strcmp(argv[i], "-d"))
+ duration = atoi(argv[++i]);
else
len = atoll(argv[i]) << 20;
}
@@ -78,6 +81,8 @@ int main(int argc, char **argv)
if (!map)
errx(2, "map malloc");
+ clock_gettime(CLOCK_MONOTONIC, &start);
+
while (1) {
int nr_succeed = 0, nr_failed = 0, nr_pages = 0;
@@ -118,5 +123,8 @@ int main(int argc, char **argv)
"%4d succeed, %4d failed, %4d different pages",
s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
nr_succeed, nr_failed, nr_pages);
+
+ if (duration > 0 && b.tv_sec - start.tv_sec >= duration)
+ return 0;
}
}
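The -d handling above follows the usual pattern of bounding a benchmark loop by wall-clock time; a minimal sketch of that pattern in isolation (the 100 ms nanosleep stands in for one benchmark iteration and is not part of the test):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec start, now;
	struct timespec step = { 0, 100 * 1000 * 1000 };	/* fake work: 100 ms */
	int duration = 2;					/* seconds, as -d would set */

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (;;) {
		nanosleep(&step, NULL);		/* one "iteration" */
		clock_gettime(CLOCK_MONOTONIC, &now);
		if (duration > 0 && now.tv_sec - start.tv_sec >= duration)
			break;
	}
	printf("ran for ~%ld s\n", (long)(now.tv_sec - start.tv_sec));
	return 0;
}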
diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c
index ba20d7504022..02b89860e193 100644
--- a/tools/testing/selftests/mm/uffd-common.c
+++ b/tools/testing/selftests/mm/uffd-common.c
@@ -499,6 +499,9 @@ void *uffd_poll_thread(void *arg)
int ret;
char tmp_chr;
+ if (!args->handle_fault)
+ args->handle_fault = uffd_handle_page_fault;
+
pollfd[0].fd = uffd;
pollfd[0].events = POLLIN;
pollfd[1].fd = pipefd[cpu*2];
@@ -527,7 +530,7 @@ void *uffd_poll_thread(void *arg)
err("unexpected msg event %u\n", msg.event);
break;
case UFFD_EVENT_PAGEFAULT:
- uffd_handle_page_fault(&msg, args);
+ args->handle_fault(&msg, args);
break;
case UFFD_EVENT_FORK:
close(uffd);
diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h
index 197f5262fe0d..7c4fa964c3b0 100644
--- a/tools/testing/selftests/mm/uffd-common.h
+++ b/tools/testing/selftests/mm/uffd-common.h
@@ -77,6 +77,9 @@ struct uffd_args {
unsigned long missing_faults;
unsigned long wp_faults;
unsigned long minor_faults;
+
+ /* A custom fault handler; defaults to uffd_handle_page_fault. */
+ void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args);
};
struct uffd_test_ops {
diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c
index 995ff13e74c7..469e0476af26 100644
--- a/tools/testing/selftests/mm/uffd-stress.c
+++ b/tools/testing/selftests/mm/uffd-stress.c
@@ -53,21 +53,21 @@ pthread_attr_t attr;
do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
const char *examples =
- "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
- "./userfaultfd anon 100 99999\n\n"
- "# Run share memory test on 1GiB region with 99 bounces:\n"
- "./userfaultfd shmem 1000 99\n\n"
- "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
- "./userfaultfd hugetlb 256 50\n\n"
- "# Run the same hugetlb test but using private file:\n"
- "./userfaultfd hugetlb-private 256 50\n\n"
- "# 10MiB-~6GiB 999 bounces anonymous test, "
- "continue forever unless an error triggers\n"
- "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
+ "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
+ "./uffd-stress anon 100 99999\n\n"
+ "# Run share memory test on 1GiB region with 99 bounces:\n"
+ "./uffd-stress shmem 1000 99\n\n"
+ "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
+ "./uffd-stress hugetlb 256 50\n\n"
+ "# Run the same hugetlb test but using private file:\n"
+ "./uffd-stress hugetlb-private 256 50\n\n"
+ "# 10MiB-~6GiB 999 bounces anonymous test, "
+ "continue forever unless an error triggers\n"
+ "while ./uffd-stress anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
static void usage(void)
{
- fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces>\n\n");
+ fprintf(stderr, "\nUsage: ./uffd-stress <test type> <MiB> <bounces>\n\n");
fprintf(stderr, "Supported <test type>: anon, hugetlb, "
"hugetlb-private, shmem, shmem-private\n\n");
fprintf(stderr, "Examples:\n\n");
@@ -189,10 +189,8 @@ static int stress(struct uffd_args *args)
locking_thread, (void *)cpu))
return 1;
if (bounces & BOUNCE_POLL) {
- if (pthread_create(&uffd_threads[cpu], &attr,
- uffd_poll_thread,
- (void *)&args[cpu]))
- return 1;
+ if (pthread_create(&uffd_threads[cpu], &attr, uffd_poll_thread, &args[cpu]))
+ err("uffd_poll_thread create");
} else {
if (pthread_create(&uffd_threads[cpu], &attr,
uffd_read_thread,
@@ -250,6 +248,8 @@ static int userfaultfd_stress(void)
struct uffd_args args[nr_cpus];
uint64_t mem_size = nr_pages * page_size;
+ memset(args, 0, sizeof(struct uffd_args) * nr_cpus);
+
if (uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED, NULL))
err("context init failed");
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index 04d91f144d1c..2709a34a39c5 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -951,6 +951,117 @@ static void uffd_zeropage_test(uffd_test_args_t *args)
uffd_test_pass();
}
+static void uffd_register_poison(int uffd, void *addr, uint64_t len)
+{
+ uint64_t ioctls = 0;
+ uint64_t expected = (1 << _UFFDIO_COPY) | (1 << _UFFDIO_POISON);
+
+ if (uffd_register_with_ioctls(uffd, addr, len, true,
+ false, false, &ioctls))
+ err("poison register fail");
+
+ if ((ioctls & expected) != expected)
+ err("registered area doesn't support COPY and POISON ioctls");
+}
+
+static void do_uffdio_poison(int uffd, unsigned long offset)
+{
+ struct uffdio_poison uffdio_poison = { 0 };
+ int ret;
+ __s64 res;
+
+ uffdio_poison.range.start = (unsigned long) area_dst + offset;
+ uffdio_poison.range.len = page_size;
+ uffdio_poison.mode = 0;
+ ret = ioctl(uffd, UFFDIO_POISON, &uffdio_poison);
+ res = uffdio_poison.updated;
+
+ if (ret)
+ err("UFFDIO_POISON error: %"PRId64, (int64_t)res);
+ else if (res != page_size)
+ err("UFFDIO_POISON unexpected size: %"PRId64, (int64_t)res);
+}
+
+static void uffd_poison_handle_fault(
+ struct uffd_msg *msg, struct uffd_args *args)
+{
+ unsigned long offset;
+
+ if (msg->event != UFFD_EVENT_PAGEFAULT)
+ err("unexpected msg event %u", msg->event);
+
+ if (msg->arg.pagefault.flags &
+ (UFFD_PAGEFAULT_FLAG_WP | UFFD_PAGEFAULT_FLAG_MINOR))
+ err("unexpected fault type %llu", msg->arg.pagefault.flags);
+
+ offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+ offset &= ~(page_size-1);
+
+ /* Odd pages -> copy zeroed page; even pages -> poison. */
+ if (offset & page_size)
+ copy_page(uffd, offset, false);
+ else
+ do_uffdio_poison(uffd, offset);
+}
+
+static void uffd_poison_test(uffd_test_args_t *targs)
+{
+ pthread_t uffd_mon;
+ char c;
+ struct uffd_args args = { 0 };
+ struct sigaction act = { 0 };
+ unsigned long nr_sigbus = 0;
+ unsigned long nr;
+
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+ uffd_register_poison(uffd, area_dst, nr_pages * page_size);
+ memset(area_src, 0, nr_pages * page_size);
+
+ args.handle_fault = uffd_poison_handle_fault;
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+ err("uffd_poll_thread create");
+
+ sigbuf = &jbuf;
+ act.sa_sigaction = sighndl;
+ act.sa_flags = SA_SIGINFO;
+ if (sigaction(SIGBUS, &act, 0))
+ err("sigaction");
+
+ for (nr = 0; nr < nr_pages; ++nr) {
+ unsigned long offset = nr * page_size;
+ const char *bytes = (const char *) area_dst + offset;
+ const char *i;
+
+ if (sigsetjmp(*sigbuf, 1)) {
+ /*
+ * Access below triggered a SIGBUS, which was caught by
+ * sighndl, which then jumped here. Count this SIGBUS,
+ * and move on to next page.
+ */
+ ++nr_sigbus;
+ continue;
+ }
+
+ for (i = bytes; i < bytes + page_size; ++i) {
+ if (*i)
+ err("nonzero byte in area_dst (%p) at %p: %u",
+ area_dst, i, *i);
+ }
+ }
+
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, NULL))
+ err("pthread_join()");
+
+ if (nr_sigbus != nr_pages / 2)
+ err("expected to receive %lu SIGBUS, actually received %lu",
+ nr_pages / 2, nr_sigbus);
+
+ uffd_test_pass();
+}
+
/*
* Test the returned uffdio_register.ioctls with different register modes.
* Note that _UFFDIO_ZEROPAGE is tested separately in the zeropage test.
@@ -1126,6 +1237,12 @@ uffd_test_case_t uffd_tests[] = {
UFFD_FEATURE_PAGEFAULT_FLAG_WP |
UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
},
+ {
+ .name = "poison",
+ .uffd_fn = uffd_poison_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_POISON,
+ },
};
static void usage(const char *prog)
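Tying the new pieces together: uffd_poison_handle_fault() resolves odd-indexed pages with a zeroed UFFDIO_COPY and installs poison markers on even-indexed ones, so the read loop in uffd_poison_test() should see only zero bytes on the copied pages and take a SIGBUS on each poisoned one, for exactly nr_pages / 2 SIGBUS deliveries, which is what the final check asserts.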
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c
index 7cfaf4a74c57..cfbc501290d3 100644
--- a/tools/testing/selftests/mm/va_high_addr_switch.c
+++ b/tools/testing/selftests/mm/va_high_addr_switch.c
@@ -292,7 +292,7 @@ static int supported_arch(void)
#elif defined(__x86_64__)
return 1;
#elif defined(__aarch64__)
- return 1;
+ return getpagesize() == PAGE_SIZE;
#else
return 0;
#endif
diff --git a/tools/testing/selftests/proc/proc-empty-vm.c b/tools/testing/selftests/proc/proc-empty-vm.c
index 7588428b8fcd..e1052443d070 100644
--- a/tools/testing/selftests/proc/proc-empty-vm.c
+++ b/tools/testing/selftests/proc/proc-empty-vm.c
@@ -77,7 +77,7 @@ static const char proc_pid_smaps_vsyscall_1[] =
"Swap: 0 kB\n"
"SwapPss: 0 kB\n"
"Locked: 0 kB\n"
-"THPeligible: 0\n"
+"THPeligible: 0\n"
/*
* "ProtectionKey:" field is conditional. It is possible to check it as well,
* but I don't have such machine.
@@ -107,7 +107,7 @@ static const char proc_pid_smaps_vsyscall_2[] =
"Swap: 0 kB\n"
"SwapPss: 0 kB\n"
"Locked: 0 kB\n"
-"THPeligible: 0\n"
+"THPeligible: 0\n"
/*
* "ProtectionKey:" field is conditional. It is possible to check it as well,
* but I'm too tired.