summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt21
-rw-r--r--Documentation/admin-guide/mm/hugetlbpage.rst11
-rw-r--r--Documentation/admin-guide/mm/memory-hotplug.rst13
-rw-r--r--Documentation/admin-guide/mm/pagemap.rst2
-rw-r--r--Documentation/admin-guide/mm/userfaultfd.rst3
-rw-r--r--Documentation/core-api/kernel-api.rst7
-rw-r--r--Documentation/filesystems/proc.rst48
-rw-r--r--Documentation/vm/hmm.rst19
-rw-r--r--Documentation/vm/unevictable-lru.rst33
-rw-r--r--MAINTAINERS10
-rw-r--r--arch/alpha/Kconfig5
-rw-r--r--arch/alpha/include/asm/pgalloc.h1
-rw-r--r--arch/alpha/include/asm/pgtable.h1
-rw-r--r--arch/alpha/include/uapi/asm/mman.h3
-rw-r--r--arch/alpha/kernel/setup.c2
-rw-r--r--arch/arc/include/asm/pgalloc.h2
-rw-r--r--arch/arc/include/asm/pgtable.h8
-rw-r--r--arch/arm/Kconfig3
-rw-r--r--arch/arm/include/asm/pgalloc.h1
-rw-r--r--arch/arm64/Kconfig13
-rw-r--r--arch/arm64/include/asm/hugetlb.h3
-rw-r--r--arch/arm64/include/asm/memory.h2
-rw-r--r--arch/arm64/include/asm/page.h2
-rw-r--r--arch/arm64/include/asm/pgalloc.h1
-rw-r--r--arch/arm64/include/asm/pgtable.h2
-rw-r--r--arch/arm64/kernel/setup.c1
-rw-r--r--arch/arm64/kvm/mmu.c2
-rw-r--r--arch/arm64/mm/hugetlbpage.c5
-rw-r--r--arch/arm64/mm/init.c31
-rw-r--r--arch/arm64/mm/ioremap.c4
-rw-r--r--arch/arm64/mm/mmu.c22
-rw-r--r--arch/csky/include/asm/pgalloc.h2
-rw-r--r--arch/csky/include/asm/pgtable.h1
-rw-r--r--arch/hexagon/include/asm/pgtable.h4
-rw-r--r--arch/ia64/Kconfig7
-rw-r--r--arch/ia64/include/asm/pal.h1
-rw-r--r--arch/ia64/include/asm/pgalloc.h1
-rw-r--r--arch/ia64/include/asm/pgtable.h1
-rw-r--r--arch/m68k/Kconfig5
-rw-r--r--arch/m68k/include/asm/mcf_pgalloc.h2
-rw-r--r--arch/m68k/include/asm/mcf_pgtable.h2
-rw-r--r--arch/m68k/include/asm/motorola_pgalloc.h1
-rw-r--r--arch/m68k/include/asm/motorola_pgtable.h2
-rw-r--r--arch/m68k/include/asm/pgtable_mm.h1
-rw-r--r--arch/m68k/include/asm/sun3_pgalloc.h1
-rw-r--r--arch/microblaze/Kconfig4
-rw-r--r--arch/microblaze/include/asm/pgalloc.h2
-rw-r--r--arch/microblaze/include/asm/pgtable.h2
-rw-r--r--arch/mips/Kconfig7
-rw-r--r--arch/mips/include/asm/pgalloc.h1
-rw-r--r--arch/mips/include/asm/pgtable-32.h1
-rw-r--r--arch/mips/include/asm/pgtable-64.h1
-rw-r--r--arch/mips/include/uapi/asm/mman.h3
-rw-r--r--arch/mips/kernel/relocate.c1
-rw-r--r--arch/mips/sgi-ip22/ip22-reset.c1
-rw-r--r--arch/mips/sgi-ip32/ip32-reset.c1
-rw-r--r--arch/nds32/include/asm/pgalloc.h5
-rw-r--r--arch/nios2/include/asm/pgalloc.h1
-rw-r--r--arch/nios2/include/asm/pgtable.h2
-rw-r--r--arch/openrisc/include/asm/pgalloc.h2
-rw-r--r--arch/openrisc/include/asm/pgtable.h1
-rw-r--r--arch/parisc/include/asm/pgalloc.h1
-rw-r--r--arch/parisc/include/asm/pgtable.h2
-rw-r--r--arch/parisc/include/uapi/asm/mman.h3
-rw-r--r--arch/parisc/kernel/pdc_chassis.c1
-rw-r--r--arch/powerpc/Kconfig6
-rw-r--r--arch/powerpc/include/asm/book3s/pgtable.h1
-rw-r--r--arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h5
-rw-r--r--arch/powerpc/include/asm/nohash/32/mmu-8xx.h43
-rw-r--r--arch/powerpc/include/asm/nohash/32/pgtable.h1
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h2
-rw-r--r--arch/powerpc/include/asm/pgalloc.h5
-rw-r--r--arch/powerpc/include/asm/pgtable.h6
-rw-r--r--arch/powerpc/kernel/setup-common.c1
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype1
-rw-r--r--arch/riscv/Kconfig5
-rw-r--r--arch/riscv/include/asm/pgalloc.h2
-rw-r--r--arch/riscv/include/asm/pgtable.h2
-rw-r--r--arch/s390/Kconfig6
-rw-r--r--arch/s390/include/asm/pgalloc.h3
-rw-r--r--arch/s390/include/asm/pgtable.h5
-rw-r--r--arch/s390/kernel/ipl.c1
-rw-r--r--arch/s390/kernel/kprobes.c5
-rw-r--r--arch/s390/mm/pgtable.c2
-rw-r--r--arch/sh/include/asm/pgalloc.h1
-rw-r--r--arch/sh/include/asm/pgtable.h2
-rw-r--r--arch/sparc/Kconfig5
-rw-r--r--arch/sparc/include/asm/pgalloc_32.h1
-rw-r--r--arch/sparc/include/asm/pgalloc_64.h1
-rw-r--r--arch/sparc/include/asm/pgtable_32.h3
-rw-r--r--arch/sparc/include/asm/pgtable_64.h8
-rw-r--r--arch/sparc/kernel/sstate.c1
-rw-r--r--arch/sparc/mm/hugetlbpage.c6
-rw-r--r--arch/sparc/mm/init_64.c1
-rw-r--r--arch/um/drivers/mconsole_kern.c1
-rw-r--r--arch/um/include/asm/pgalloc.h1
-rw-r--r--arch/um/include/asm/pgtable-2level.h1
-rw-r--r--arch/um/include/asm/pgtable-3level.h1
-rw-r--r--arch/um/kernel/um_arch.c1
-rw-r--r--arch/x86/Kconfig17
-rw-r--r--arch/x86/include/asm/desc.h1
-rw-r--r--arch/x86/include/asm/pgalloc.h2
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c1
-rw-r--r--arch/x86/kernel/kprobes/core.c6
-rw-r--r--arch/x86/kernel/setup.c1
-rw-r--r--arch/x86/mm/init_64.c5
-rw-r--r--arch/x86/mm/pgtable.c34
-rw-r--r--arch/x86/purgatory/purgatory.c2
-rw-r--r--arch/x86/xen/enlighten.c1
-rw-r--r--arch/xtensa/include/asm/pgalloc.h2
-rw-r--r--arch/xtensa/include/asm/pgtable.h1
-rw-r--r--arch/xtensa/include/uapi/asm/mman.h3
-rw-r--r--arch/xtensa/platforms/iss/setup.c1
-rw-r--r--drivers/block/zram/zram_drv.h2
-rw-r--r--drivers/bus/brcmstb_gisb.c1
-rw-r--r--drivers/char/ipmi/ipmi_msghandler.c1
-rw-r--r--drivers/clk/analogbits/wrpll-cln28hpc.c4
-rw-r--r--drivers/edac/altera_edac.c1
-rw-r--r--drivers/firmware/google/gsmi.c1
-rw-r--r--drivers/gpu/drm/nouveau/include/nvif/if000c.h1
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_svm.c156
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h1
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c6
-rw-r--r--drivers/hv/vmbus_drv.c1
-rw-r--r--drivers/hwtracing/coresight/coresight-cpu-debug.c1
-rw-r--r--drivers/leds/trigger/ledtrig-activity.c1
-rw-r--r--drivers/leds/trigger/ledtrig-heartbeat.c1
-rw-r--r--drivers/leds/trigger/ledtrig-panic.c1
-rw-r--r--drivers/misc/bcm-vk/bcm_vk_dev.c1
-rw-r--r--drivers/misc/ibmasm/heartbeat.c1
-rw-r--r--drivers/misc/pvpanic/pvpanic.c1
-rw-r--r--drivers/net/ipa/ipa_smp2p.c1
-rw-r--r--drivers/parisc/power.c1
-rw-r--r--drivers/power/reset/ltc2952-poweroff.c1
-rw-r--r--drivers/remoteproc/remoteproc_core.c1
-rw-r--r--drivers/s390/char/con3215.c1
-rw-r--r--drivers/s390/char/con3270.c1
-rw-r--r--drivers/s390/char/sclp.c1
-rw-r--r--drivers/s390/char/sclp_con.c1
-rw-r--r--drivers/s390/char/sclp_vt220.c1
-rw-r--r--drivers/s390/char/zcore.c1
-rw-r--r--drivers/soc/bcm/brcmstb/pm/pm-arm.c1
-rw-r--r--drivers/staging/olpc_dcon/olpc_dcon.c1
-rw-r--r--drivers/video/fbdev/hyperv_fb.c1
-rw-r--r--drivers/virtio/virtio_mem.c2
-rw-r--r--fs/Kconfig15
-rw-r--r--fs/exec.c3
-rw-r--r--fs/hfsplus/inode.c5
-rw-r--r--fs/hfsplus/xattr.c1
-rw-r--r--fs/nfsd/nfs4state.c2
-rw-r--r--fs/nilfs2/btree.c1
-rw-r--r--fs/open.c13
-rw-r--r--fs/proc/base.c6
-rw-r--r--fs/proc/fd.c20
-rw-r--r--fs/proc/kcore.c67
-rw-r--r--fs/proc/task_mmu.c34
-rw-r--r--fs/seq_file.c43
-rw-r--r--fs/userfaultfd.c15
-rw-r--r--include/asm-generic/bug.h3
-rw-r--r--include/linux/ascii85.h3
-rw-r--r--include/linux/bootmem_info.h66
-rw-r--r--include/linux/compat.h2
-rw-r--r--include/linux/compiler-clang.h17
-rw-r--r--include/linux/compiler-gcc.h6
-rw-r--r--include/linux/compiler_types.h2
-rw-r--r--include/linux/huge_mm.h70
-rw-r--r--include/linux/hugetlb.h42
-rw-r--r--include/linux/hugetlb_cgroup.h19
-rw-r--r--include/linux/kcore.h3
-rw-r--r--include/linux/kernel.h227
-rw-r--r--include/linux/kprobes.h1
-rw-r--r--include/linux/kstrtox.h155
-rw-r--r--include/linux/memblock.h4
-rw-r--r--include/linux/memory_hotplug.h27
-rw-r--r--include/linux/mempolicy.h9
-rw-r--r--include/linux/memremap.h2
-rw-r--r--include/linux/migrate.h27
-rw-r--r--include/linux/mm.h14
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/mmu_notifier.h26
-rw-r--r--include/linux/mmzone.h27
-rw-r--r--include/linux/mpi.h4
-rw-r--r--include/linux/page-flags.h22
-rw-r--r--include/linux/panic.h98
-rw-r--r--include/linux/panic_notifier.h12
-rw-r--r--include/linux/pgtable.h44
-rw-r--r--include/linux/rmap.h13
-rw-r--r--include/linux/seq_file.h10
-rw-r--r--include/linux/shmem_fs.h19
-rw-r--r--include/linux/signal.h2
-rw-r--r--include/linux/string.h7
-rw-r--r--include/linux/string_helpers.h31
-rw-r--r--include/linux/sunrpc/cache.h1
-rw-r--r--include/linux/swap.h19
-rw-r--r--include/linux/swapops.h125
-rw-r--r--include/linux/thread_info.h1
-rw-r--r--include/linux/userfaultfd_k.h5
-rw-r--r--include/linux/vmalloc.h15
-rw-r--r--include/linux/zbud.h23
-rw-r--r--include/trace/events/vmscan.h41
-rw-r--r--include/uapi/asm-generic/mman-common.h3
-rw-r--r--include/uapi/linux/mempolicy.h1
-rw-r--r--include/uapi/linux/userfaultfd.h7
-rw-r--r--init/main.c42
-rw-r--r--ipc/msg.c6
-rw-r--r--ipc/sem.c25
-rw-r--r--ipc/shm.c6
-rw-r--r--ipc/util.c44
-rw-r--r--ipc/util.h3
-rw-r--r--kernel/hung_task.c1
-rw-r--r--kernel/kexec_core.c1
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/signal.c14
-rw-r--r--kernel/sysctl.c4
-rw-r--r--kernel/trace/trace.c1
-rw-r--r--lib/Kconfig.debug12
-rw-r--r--lib/decompress_bunzip2.c6
-rw-r--r--lib/decompress_unlz4.c8
-rw-r--r--lib/decompress_unlzo.c3
-rw-r--r--lib/decompress_unxz.c2
-rw-r--r--lib/decompress_unzstd.c4
-rw-r--r--lib/kstrtox.c5
-rw-r--r--lib/lz4/lz4_decompress.c2
-rw-r--r--lib/math/Makefile1
-rw-r--r--lib/math/rational-test.c56
-rw-r--r--lib/math/rational.c16
-rw-r--r--lib/mpi/longlong.h4
-rw-r--r--lib/mpi/mpicoder.c6
-rw-r--r--lib/mpi/mpiutil.c2
-rw-r--r--lib/parser.c1
-rw-r--r--lib/string.c2
-rw-r--r--lib/string_helpers.c102
-rw-r--r--lib/test-string_helpers.c157
-rw-r--r--lib/test_hmm.c127
-rw-r--r--lib/test_hmm_uapi.h2
-rw-r--r--lib/test_string.c5
-rw-r--r--lib/vsprintf.c1
-rw-r--r--lib/xz/xz_dec_bcj.c2
-rw-r--r--lib/xz/xz_dec_lzma2.c8
-rw-r--r--lib/zlib_inflate/inffast.c2
-rw-r--r--lib/zstd/huf.h2
-rw-r--r--mm/Kconfig16
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bootmem_info.c127
-rw-r--r--mm/compaction.c20
-rw-r--r--mm/debug_vm_pgtable.c109
-rw-r--r--mm/gup.c58
-rw-r--r--mm/hmm.c12
-rw-r--r--mm/huge_memory.c265
-rw-r--r--mm/hugetlb.c361
-rw-r--r--mm/hugetlb_vmemmap.c298
-rw-r--r--mm/hugetlb_vmemmap.h45
-rw-r--r--mm/internal.h29
-rw-r--r--mm/kfence/core.c4
-rw-r--r--mm/khugepaged.c20
-rw-r--r--mm/madvise.c66
-rw-r--r--mm/mapping_dirty_helpers.c2
-rw-r--r--mm/memblock.c28
-rw-r--r--mm/memcontrol.c4
-rw-r--r--mm/memory-failure.c38
-rw-r--r--mm/memory.c235
-rw-r--r--mm/memory_hotplug.c159
-rw-r--r--mm/mempolicy.c303
-rw-r--r--mm/migrate.c268
-rw-r--r--mm/mlock.c12
-rw-r--r--mm/mmap_lock.c59
-rw-r--r--mm/mprotect.c18
-rw-r--r--mm/nommu.c5
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page_alloc.c5
-rw-r--r--mm/page_vma_mapped.c15
-rw-r--r--mm/rmap.c628
-rw-r--r--mm/shmem.c123
-rw-r--r--mm/sparse-vmemmap.c354
-rw-r--r--mm/sparse.c1
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/userfaultfd.c225
-rw-r--r--mm/util.c40
-rw-r--r--mm/vmalloc.c37
-rw-r--r--mm/vmscan.c20
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/z3fold.c39
-rw-r--r--mm/zbud.c235
-rw-r--r--mm/zsmalloc.c3
-rw-r--r--mm/zswap.c26
-rwxr-xr-xscripts/checkpatch.pl16
-rw-r--r--tools/testing/selftests/vm/.gitignore3
-rw-r--r--tools/testing/selftests/vm/Makefile5
-rw-r--r--tools/testing/selftests/vm/hmm-tests.c158
-rw-r--r--tools/testing/selftests/vm/khugepaged.c4
-rw-r--r--tools/testing/selftests/vm/madv_populate.c342
-rw-r--r--tools/testing/selftests/vm/pkey-x86.h1
-rw-r--r--tools/testing/selftests/vm/protection_keys.c85
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests.sh16
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c1054
299 files changed, 5991 insertions, 2961 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0315407f5f57..13f13fdd4731 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1594,6 +1594,23 @@
Documentation/admin-guide/mm/hugetlbpage.rst.
Format: size[KMG]
+ hugetlb_free_vmemmap=
+ [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+ enabled.
+ Allows heavy hugetlb users to free up some more
+ memory (6 * PAGE_SIZE for each 2MB hugetlb page).
+ Format: { on | off (default) }
+
+ on: enable the feature
+ off: disable the feature
+
+ Built with CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON=y,
+ the default is on.
+
+ This is not compatible with memory_hotplug.memmap_on_memory.
+ If both parameters are enabled, hugetlb_free_vmemmap takes
+ precedence over memory_hotplug.memmap_on_memory.
+
hung_task_panic=
[KNL] Should the hung task detector generate panics.
Format: 0 | 1
@@ -2860,6 +2877,10 @@
Note that even when enabled, there are a few cases where
the feature is not effective.
+ This is not compatible with hugetlb_free_vmemmap. If
+ both parameters are enabled, hugetlb_free_vmemmap takes
+ precedence over memory_hotplug.memmap_on_memory.
+
memtest= [KNL,X86,ARM,PPC,RISCV] Enable memtest
Format: <integer>
default : 0 <disable>
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index f7b1c7462991..8abaeb144e44 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -60,6 +60,10 @@ HugePages_Surp
the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
maximum number of surplus huge pages is controlled by
``/proc/sys/vm/nr_overcommit_hugepages``.
+ Note: When the feature of freeing unused vmemmap pages associated
+ with each hugetlb page is enabled, the number of surplus huge pages
+ may be temporarily larger than the maximum number of surplus huge
+ pages when the system is under memory pressure.
Hugepagesize
is the default hugepage size (in Kb).
Hugetlb
@@ -80,6 +84,10 @@ returned to the huge page pool when freed by a task. A user with root
privileges can dynamically allocate more or free some persistent huge pages
by increasing or decreasing the value of ``nr_hugepages``.
+Note: When the feature of freeing unused vmemmap pages associated with each
+hugetlb page is enabled, we can fail to free the huge pages triggered by
+the user when ths system is under memory pressure. Please try again later.
+
Pages that are used as huge pages are reserved inside the kernel and cannot
be used for other purposes. Huge pages cannot be swapped out under
memory pressure.
@@ -145,6 +153,9 @@ default_hugepagesz
will all result in 256 2M huge pages being allocated. Valid default
huge page size is architecture dependent.
+hugetlb_free_vmemmap
+ When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing
+ unused vmemmap pages associated with each HugeTLB page.
When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
indicates the current number of pre-allocated huge pages of the default size.
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 05d51d2d8beb..c6bae2d77160 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -357,6 +357,19 @@ creates ZONE_MOVABLE as following.
Unfortunately, there is no information to show which memory block belongs
to ZONE_MOVABLE. This is TBD.
+ Memory offlining can fail when dissolving a free huge page on ZONE_MOVABLE
+ and the feature of freeing unused vmemmap pages associated with each hugetlb
+ page is enabled.
+
+ This can happen when we have plenty of ZONE_MOVABLE memory, but not enough
+ kernel memory to allocate vmemmmap pages. We may even be able to migrate
+ huge page contents, but will not be able to dissolve the source huge page.
+ This will prevent an offline operation and is unfortunate as memory offlining
+ is expected to succeed on movable zones. Users that depend on memory hotplug
+ to succeed for movable zones should carefully consider whether the memory
+ savings gained from this feature are worth the risk of possibly not being
+ able to offline memory in certain situations.
+
.. note::
Techniques that rely on long-term pinnings of memory (especially, RDMA and
vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 340a5aee9b80..fb578fbbb76c 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -21,6 +21,8 @@ There are four components to pagemap:
* Bit 55 pte is soft-dirty (see
:ref:`Documentation/admin-guide/mm/soft-dirty.rst <soft_dirty>`)
* Bit 56 page exclusively mapped (since 4.2)
+ * Bit 57 pte is uffd-wp write-protected (since 5.13) (see
+ :ref:`Documentation/admin-guide/mm/userfaultfd.rst <userfaultfd>`)
* Bits 57-60 zero
* Bit 61 page is file-page or shared-anon (since 3.5)
* Bit 62 page swapped
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 3aa38e8b8361..6528036093e1 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -77,7 +77,8 @@ events, except page fault notifications, may be generated:
- ``UFFD_FEATURE_MINOR_HUGETLBFS`` indicates that the kernel supports
``UFFDIO_REGISTER_MODE_MINOR`` registration for hugetlbfs virtual memory
- areas.
+ areas. ``UFFD_FEATURE_MINOR_SHMEM`` is the analogous feature indicating
+ support for shmem virtual memory areas.
The userland application should set the feature flags it intends to use
when invoking the ``UFFDIO_API`` ioctl, to request that those features be
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 741aa37dc181..2a7444e3a4c2 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -24,11 +24,8 @@ String Conversions
.. kernel-doc:: lib/vsprintf.c
:export:
-.. kernel-doc:: include/linux/kernel.h
- :functions: kstrtol
-
-.. kernel-doc:: include/linux/kernel.h
- :functions: kstrtoul
+.. kernel-doc:: include/linux/kstrtox.h
+ :functions: kstrtol kstrtoul
.. kernel-doc:: lib/kstrtox.c
:export:
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 81bfe3c800cc..042c418f4090 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -933,8 +933,15 @@ meminfo
~~~~~~~
Provides information about distribution and utilization of memory. This
-varies by architecture and compile options. The following is from a
-16GB PIII, which has highmem enabled. You may not have all of these fields.
+varies by architecture and compile options. Some of the counters reported
+here overlap. The memory reported by the non overlapping counters may not
+add up to the overall memory usage and the difference for some workloads
+can be substantial. In many cases there are other means to find out
+additional memory using subsystem specific interfaces, for instance
+/proc/net/sockstat for TCP memory allocations.
+
+The following is from a 16GB PIII, which has highmem enabled.
+You may not have all of these fields.
::
@@ -1913,18 +1920,20 @@ if precise results are needed.
3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file
---------------------------------------------------------------
This file provides information associated with an opened file. The regular
-files have at least three fields -- 'pos', 'flags' and 'mnt_id'. The 'pos'
-represents the current offset of the opened file in decimal form [see lseek(2)
-for details], 'flags' denotes the octal O_xxx mask the file has been
-created with [see open(2) for details] and 'mnt_id' represents mount ID of
-the file system containing the opened file [see 3.5 /proc/<pid>/mountinfo
-for details].
+files have at least four fields -- 'pos', 'flags', 'mnt_id' and 'ino'.
+The 'pos' represents the current offset of the opened file in decimal
+form [see lseek(2) for details], 'flags' denotes the octal O_xxx mask the
+file has been created with [see open(2) for details] and 'mnt_id' represents
+mount ID of the file system containing the opened file [see 3.5
+/proc/<pid>/mountinfo for details]. 'ino' represents the inode number of
+the file.
A typical output is::
pos: 0
flags: 0100002
mnt_id: 19
+ ino: 63107
All locks associated with a file descriptor are shown in its fdinfo too::
@@ -1941,6 +1950,7 @@ Eventfd files
pos: 0
flags: 04002
mnt_id: 9
+ ino: 63107
eventfd-count: 5a
where 'eventfd-count' is hex value of a counter.
@@ -1953,6 +1963,7 @@ Signalfd files
pos: 0
flags: 04002
mnt_id: 9
+ ino: 63107
sigmask: 0000000000000200
where 'sigmask' is hex value of the signal mask associated
@@ -1966,6 +1977,7 @@ Epoll files
pos: 0
flags: 02
mnt_id: 9
+ ino: 63107
tfd: 5 events: 1d data: ffffffffffffffff pos:0 ino:61af sdev:7
where 'tfd' is a target file descriptor number in decimal form,
@@ -1982,6 +1994,8 @@ For inotify files the format is the following::
pos: 0
flags: 02000000
+ mnt_id: 9
+ ino: 63107
inotify wd:3 ino:9e7e sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:7e9e0000640d1b6d
where 'wd' is a watch descriptor in decimal form, i.e. a target file
@@ -2004,6 +2018,7 @@ For fanotify files the format is::
pos: 0
flags: 02
mnt_id: 9
+ ino: 63107
fanotify flags:10 event-flags:0
fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003
fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4
@@ -2028,6 +2043,7 @@ Timerfd files
pos: 0
flags: 02
mnt_id: 9
+ ino: 63107
clockid: 0
ticks: 0
settime flags: 01
@@ -2042,6 +2058,22 @@ details]. 'it_value' is remaining time until the timer expiration.
with TIMER_ABSTIME option which will be shown in 'settime flags', but 'it_value'
still exhibits timer's remaining time.
+DMA Buffer files
+~~~~~~~~~~~~~~~~
+
+::
+
+ pos: 0
+ flags: 04002
+ mnt_id: 9
+ ino: 63107
+ size: 32768
+ count: 2
+ exp_name: system-heap
+
+where 'size' is the size of the DMA buffer in bytes. 'count' is the file count of
+the DMA buffer file. 'exp_name' is the name of the DMA buffer exporter.
+
3.9 /proc/<pid>/map_files - Information about memory mapped files
---------------------------------------------------------------------
This directory contains symbolic links which represent memory mapped files
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 09e28507f5b2..a14c2938e7af 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -332,7 +332,7 @@ between device driver specific code and shared common code:
walks to fill in the ``args->src`` array with PFNs to be migrated.
The ``invalidate_range_start()`` callback is passed a
``struct mmu_notifier_range`` with the ``event`` field set to
- ``MMU_NOTIFY_MIGRATE`` and the ``migrate_pgmap_owner`` field set to
+ ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to
the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This is
allows the device driver to skip the invalidation callback and only
invalidate device private MMU mappings that are actually migrating.
@@ -405,6 +405,23 @@ between device driver specific code and shared common code:
The lock can now be released.
+Exclusive access memory
+=======================
+
+Some devices have features such as atomic PTE bits that can be used to implement
+atomic access to system memory. To support atomic operations to a shared virtual
+memory page such a device needs access to that page which is exclusive of any
+userspace access from the CPU. The ``make_device_exclusive_range()`` function
+can be used to make a memory range inaccessible from userspace.
+
+This replaces all mappings for pages in the given range with special swap
+entries. Any attempt to access the swap entry results in a fault which is
+resovled by replacing the entry with the original mapping. A driver gets
+notified that the mapping has been changed by MMU notifiers, after which point
+it will no longer have exclusive access to the page. Exclusive access is
+guranteed to last until the driver drops the page lock and page reference, at
+which point any CPU faults on the page may proceed as described.
+
Memory cgroup (memcg) and rss accounting
========================================
diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst
index 0e1490524f53..eae3af17f2d9 100644
--- a/Documentation/vm/unevictable-lru.rst
+++ b/Documentation/vm/unevictable-lru.rst
@@ -389,14 +389,14 @@ mlocked, munlock_vma_page() updates that zone statistics for the number of
mlocked pages. Note, however, that at this point we haven't checked whether
the page is mapped by other VM_LOCKED VMAs.
-We can't call try_to_munlock(), the function that walks the reverse map to
+We can't call page_mlock(), the function that walks the reverse map to
check for other VM_LOCKED VMAs, without first isolating the page from the LRU.
-try_to_munlock() is a variant of try_to_unmap() and thus requires that the page
+page_mlock() is a variant of try_to_unmap() and thus requires that the page
not be on an LRU list [more on these below]. However, the call to
-isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). So,
+isolate_lru_page() could fail, in which case we can't call page_mlock(). So,
we go ahead and clear PG_mlocked up front, as this might be the only chance we
-have. If we can successfully isolate the page, we go ahead and
-try_to_munlock(), which will restore the PG_mlocked flag and update the zone
+have. If we can successfully isolate the page, we go ahead and call
+page_mlock(), which will restore the PG_mlocked flag and update the zone
page statistics if it finds another VMA holding the page mlocked. If we fail
to isolate the page, we'll have left a potentially mlocked page on the LRU.
This is fine, because we'll catch it later if and if vmscan tries to reclaim
@@ -545,31 +545,24 @@ munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
holepunching, and truncation of file pages and their anonymous COWed pages.
-try_to_munlock() Reverse Map Scan
+page_mlock() Reverse Map Scan
---------------------------------
-.. warning::
- [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
- page_referenced() reverse map walker.
-
When munlock_vma_page() [see section :ref:`munlock()/munlockall() System Call
Handling <munlock_munlockall_handling>` above] tries to munlock a
page, it needs to determine whether or not the page is mapped by any
VM_LOCKED VMA without actually attempting to unmap all PTEs from the
page. For this purpose, the unevictable/mlock infrastructure
-introduced a variant of try_to_unmap() called try_to_munlock().
+introduced a variant of try_to_unmap() called page_mlock().
-try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
-mapped file and KSM pages with a flag argument specifying unlock versus unmap
-processing. Again, these functions walk the respective reverse maps looking
-for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case,
-the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK. This
-undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page.
+page_mlock() walks the respective reverse maps looking for VM_LOCKED VMAs. When
+such a VMA is found the page is mlocked via mlock_vma_page(). This undoes the
+pre-clearing of the page's PG_mlocked done by munlock_vma_page.
-Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
+Note that page_mlock()'s reverse map walk must visit every VMA in a page's
reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
However, the scan can terminate when it encounters a VM_LOCKED VMA.
-Although try_to_munlock() might be called a great many times when munlocking a
+Although page_mlock() might be called a great many times when munlocking a
large region or tearing down a large address space that has been mlocked via
mlockall(), overall this is a fairly rare event.
@@ -602,7 +595,7 @@ inactive lists to the appropriate node's unevictable list.
shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
after shrink_active_list() had moved them to the inactive list, or pages mapped
into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
-recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter,
+recheck via page_mlock(). shrink_inactive_list() won't notice the latter,
but will pass on to shrink_page_list().
shrink_page_list() again culls obviously unevictable pages that it could
diff --git a/MAINTAINERS b/MAINTAINERS
index e5891bbf5443..0d9d8fbd9e92 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7704,6 +7704,14 @@ L: linux-input@vger.kernel.org
S: Maintained
F: drivers/input/touchscreen/resistive-adc-touch.c
+GENERIC STRING LIBRARY
+R: Andy Shevchenko <andy@kernel.org>
+S: Maintained
+F: lib/string.c
+F: lib/string_helpers.c
+F: lib/test_string.c
+F: lib/test-string_helpers.c
+
GENERIC UIO DRIVER FOR PCI DEVICES
M: "Michael S. Tsirkin" <mst@redhat.com>
L: kvm@vger.kernel.org
@@ -11900,6 +11908,7 @@ F: include/linux/mmzone.h
F: include/linux/pagewalk.h
F: include/linux/vmalloc.h
F: mm/
+F: tools/testing/selftests/vm/
MEMORY TECHNOLOGY DEVICES (MTD)
M: Miquel Raynal <miquel.raynal@bootlin.com>
@@ -20307,7 +20316,6 @@ M: Seth Jennings <sjenning@redhat.com>
M: Dan Streetman <ddstreet@ieee.org>
L: linux-mm@kvack.org
S: Maintained
-F: include/linux/zbud.h
F: mm/zbud.c
ZD1211RW WIRELESS DRIVER
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 8954216b9956..77d3280dc678 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -40,6 +40,7 @@ config ALPHA
select MMU_GATHER_NO_RANGE
select SET_FS
select SPARSEMEM_EXTREME if SPARSEMEM
+ select ZONE_DMA
help
The Alpha is a 64-bit general-purpose processor designed and
marketed by the Digital Equipment Corporation of blessed memory,
@@ -65,10 +66,6 @@ config GENERIC_CALIBRATE_DELAY
bool
default y
-config ZONE_DMA
- bool
- default y
-
config GENERIC_ISA_DMA
bool
default y
diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h
index 9c6a24fe493d..68be7adbfe58 100644
--- a/arch/alpha/include/asm/pgalloc.h
+++ b/arch/alpha/include/asm/pgalloc.h
@@ -18,7 +18,6 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte)
{
pmd_set(pmd, (pte_t *)(page_to_pa(pte) + PAGE_OFFSET));
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
static inline void
pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index e1757b7cfe3d..ff690846465e 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -46,7 +46,6 @@ struct vm_area_struct;
#define PTRS_PER_PMD (1UL << (PAGE_SHIFT-3))
#define PTRS_PER_PGD (1UL << (PAGE_SHIFT-3))
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
-#define FIRST_USER_ADDRESS 0UL
/* Number of pointers that fit on a page: this will go away. */
#define PTRS_PER_PAGE (1UL << (PAGE_SHIFT-3))
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index a18ec7f63888..56b4ee5a6c9e 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -71,6 +71,9 @@
#define MADV_COLD 20 /* deactivate these pages */
#define MADV_PAGEOUT 21 /* reclaim these pages */
+#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */
+#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 5f6858e9dc28..7d56c217b235 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -28,6 +28,7 @@
#include <linux/init.h>
#include <linux/string.h>
#include <linux/ioport.h>
+#include <linux/panic_notifier.h>
#include <linux/platform_device.h>
#include <linux/memblock.h>
#include <linux/pci.h>
@@ -46,7 +47,6 @@
#include <linux/log2.h>
#include <linux/export.h>
-extern struct atomic_notifier_head panic_notifier_list;
static int alpha_panic_event(struct notifier_block *, unsigned long, void *);
static struct notifier_block alpha_panic_block = {
alpha_panic_event,
diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h
index 6147db925248..a32ca3104ced 100644
--- a/arch/arc/include/asm/pgalloc.h
+++ b/arch/arc/include/asm/pgalloc.h
@@ -129,6 +129,4 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptep)
#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte)
-#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd))
-
#endif /* _ASM_ARC_PGALLOC_H */
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 5878846f00cf..320cc0ae8a08 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -222,12 +222,6 @@
*/
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
-/*
- * No special requirements for lowest virtual address we permit any user space
- * mapping to be mapped at.
- */
-#define FIRST_USER_ADDRESS 0UL
-
/****************************************************************
* Bucket load of VM Helpers
@@ -356,6 +350,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
#define kern_addr_valid(addr) (1)
+#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd))
+
/*
* remap a physical page `pfn' of size `size' with page protection `prot'
* into virtual address `from'
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 809317b5a6c6..06b6187b67af 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -218,9 +218,6 @@ config GENERIC_CALIBRATE_DELAY
config ARCH_MAY_HAVE_PC_FDC
bool
-config ZONE_DMA
- bool
-
config ARCH_SUPPORTS_UPROBES
def_bool y
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index fdee1f04f4f3..a17f01235c29 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -143,7 +143,6 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
__pmd_populate(pmdp, page_to_phys(ptep), prot);
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
#endif /* CONFIG_MMU */
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index be9083882f97..e07e7de9ac49 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -42,6 +42,7 @@ config ARM64
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
+ select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_ELF_PROT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_INLINE_READ_LOCK if !PREEMPTION
@@ -155,7 +156,6 @@ config ARM64
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
- select HAVE_ARCH_PFN_VALID
select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
@@ -308,14 +308,6 @@ config GENERIC_CSUM
config GENERIC_CALIBRATE_DELAY
def_bool y
-config ZONE_DMA
- bool "Support DMA zone" if EXPERT
- default y
-
-config ZONE_DMA32
- bool "Support DMA32 zone" if EXPERT
- default y
-
config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
def_bool y
@@ -1053,9 +1045,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK
def_bool y
depends on NUMA
-config HOLES_IN_ZONE
- def_bool y
-
source "kernel/Kconfig.hz"
config ARCH_SPARSEMEM_ENABLE
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 5abf91e3494c..1242f71937f8 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -23,8 +23,7 @@ static inline void arch_clear_hugepage_flags(struct page *page)
}
#define arch_clear_hugepage_flags arch_clear_hugepage_flags
-extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
- struct page *page, int writable);
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
#define arch_make_huge_pte arch_make_huge_pte
#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 1a35a4473598..824a3655dd93 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -351,7 +351,7 @@ static inline void *phys_to_virt(phys_addr_t x)
#define virt_addr_valid(addr) ({ \
__typeof__(addr) __addr = __tag_reset(addr); \
- __is_lm_address(__addr) && pfn_valid(virt_to_pfn(__addr)); \
+ __is_lm_address(__addr) && pfn_is_map_memory(virt_to_pfn(__addr)); \
})
void dump_mem_limit(void);
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index ed1b9dcf12b2..993a27ea6f54 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -41,7 +41,7 @@ void tag_clear_highpage(struct page *to);
typedef struct page *pgtable_t;
-extern int pfn_valid(unsigned long);
+int pfn_is_map_memory(unsigned long pfn);
#include <asm/memory.h>
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 31fbab3d6f99..8433a2058eb1 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -86,6 +86,5 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
VM_BUG_ON(mm == &init_mm);
__pmd_populate(pmdp, page_to_phys(ptep), PMD_TYPE_TABLE | PMD_TABLE_PXN);
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
#endif
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index c0ba8cdfa10a..508c7ffad515 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -26,8 +26,6 @@
#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))
-#define FIRST_USER_ADDRESS 0UL
-
#ifndef __ASSEMBLY__
#include <asm/cmpxchg.h>
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 8ed66142b088..880f40bae60e 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -23,6 +23,7 @@
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <linux/fs.h>
+#include <linux/panic_notifier.h>
#include <linux/proc_fs.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index f23dfa06433b..3155c9e778f0 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -85,7 +85,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
static bool kvm_is_device_pfn(unsigned long pfn)
{
- return !pfn_valid(pfn);
+ return !pfn_is_map_memory(pfn);
}
static void *stage2_memcache_zalloc_page(void *arg)
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 58987a98e179..23505fc35324 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -339,10 +339,9 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return NULL;
}
-pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
- struct page *page, int writable)
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
- size_t pagesize = huge_page_size(hstate_vma(vma));
+ size_t pagesize = 1UL << shift;
if (pagesize == CONT_PTE_SIZE) {
entry = pte_mkcont(entry);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 6e1ca044ca90..8490ed2917ff 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -219,42 +219,17 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
free_area_init(max_zone_pfns);
}
-int pfn_valid(unsigned long pfn)
+int pfn_is_map_memory(unsigned long pfn)
{
phys_addr_t addr = PFN_PHYS(pfn);
- struct mem_section *ms;
- /*
- * Ensure the upper PAGE_SHIFT bits are clear in the
- * pfn. Else it might lead to false positives when
- * some of the upper bits are set, but the lower bits
- * match a valid pfn.
- */
+ /* avoid false positives for bogus PFNs, see comment in pfn_valid() */
if (PHYS_PFN(addr) != pfn)
return 0;
- if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
- return 0;
-
- ms = __pfn_to_section(pfn);
- if (!valid_section(ms))
- return 0;
-
- /*
- * ZONE_DEVICE memory does not have the memblock entries.
- * memblock_is_map_memory() check for ZONE_DEVICE based
- * addresses will always fail. Even the normal hotplugged
- * memory will never have MEMBLOCK_NOMAP flag set in their
- * memblock entries. Skip memblock search for all non early
- * memory sections covering all of hotplug memory including
- * both normal and ZONE_DEVICE based.
- */
- if (!early_section(ms))
- return pfn_section_valid(ms, pfn);
-
return memblock_is_map_memory(addr);
}
-EXPORT_SYMBOL(pfn_valid);
+EXPORT_SYMBOL(pfn_is_map_memory);
static phys_addr_t memory_limit = PHYS_ADDR_MAX;
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index b5e83c46b23e..b7c81dacabf0 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -43,7 +43,7 @@ static void __iomem *__ioremap_caller(phys_addr_t phys_addr, size_t size,
/*
* Don't allow RAM to be mapped.
*/
- if (WARN_ON(pfn_valid(__phys_to_pfn(phys_addr))))
+ if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr))))
return NULL;
area = get_vm_area_caller(size, VM_IOREMAP, caller);
@@ -84,7 +84,7 @@ EXPORT_SYMBOL(iounmap);
void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size)
{
/* For normal memory we already have a cacheable mapping. */
- if (pfn_valid(__phys_to_pfn(phys_addr)))
+ if (pfn_is_map_memory(__phys_to_pfn(phys_addr)))
return (void __iomem *)__phys_to_virt(phys_addr);
return __ioremap_caller(phys_addr, size, __pgprot(PROT_NORMAL),
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 0b28cc218091..595fde9a47dd 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -82,7 +82,7 @@ void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
- if (!pfn_valid(pfn))
+ if (!pfn_is_map_memory(pfn))
return pgprot_noncached(vma_prot);
else if (file->f_flags & O_SYNC)
return pgprot_writecombine(vma_prot);
@@ -1339,6 +1339,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
return dt_virt;
}
+#if CONFIG_PGTABLE_LEVELS > 3
int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
{
pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
@@ -1353,6 +1354,16 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
return 1;
}
+int pud_clear_huge(pud_t *pudp)
+{
+ if (!pud_sect(READ_ONCE(*pudp)))
+ return 0;
+ pud_clear(pudp);
+ return 1;
+}
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 2
int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
{
pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));
@@ -1367,14 +1378,6 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
return 1;
}
-int pud_clear_huge(pud_t *pudp)
-{
- if (!pud_sect(READ_ONCE(*pudp)))
- return 0;
- pud_clear(pudp);
- return 1;
-}
-
int pmd_clear_huge(pmd_t *pmdp)
{
if (!pmd_sect(READ_ONCE(*pmdp)))
@@ -1382,6 +1385,7 @@ int pmd_clear_huge(pmd_t *pmdp)
pmd_clear(pmdp);
return 1;
}
+#endif
int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
{
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index cd211aabbefd..bbbd0698b397 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -22,8 +22,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
set_pmd(pmd, __pmd(__pa(page_address(pte))));
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
extern void pgd_init(unsigned long *p);
static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index 0d60367b6bfa..151607ed5158 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -14,7 +14,6 @@
#define PGDIR_MASK (~(PGDIR_SIZE-1))
#define USER_PTRS_PER_PGD (PAGE_OFFSET/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS 0UL
/*
* C-SKY is two-level paging structure:
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index dbb22b80b8c4..18cd6ea9ab23 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -155,9 +155,6 @@ extern unsigned long _dflt_cache_att;
extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* located in head.S */
-/* Seems to be zero even in architectures where the zero page is firewalled? */
-#define FIRST_USER_ADDRESS 0UL
-
/* HUGETLB not working currently */
#ifdef CONFIG_HUGETLB_PAGE
#define pte_mkhuge(pte) __pte((pte_val(pte) & ~0x3) | HVM_HUGEPAGE_SIZE)
@@ -242,7 +239,6 @@ static inline int pmd_bad(pmd_t pmd)
* pmd_page - converts a PMD entry to a page pointer
*/
#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
-#define pmd_pgtable(pmd) pmd_page(pmd)
/**
* pte_none - check if pte is mapped
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index da22a35e6f03..cf425c2c63af 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -60,6 +60,7 @@ config IA64
select NUMA if !FLATMEM
select PCI_MSI_ARCH_FALLBACKS if PCI_MSI
select SET_FS
+ select ZONE_DMA32
default y
help
The Itanium Processor Family is Intel's 64-bit successor to
@@ -72,9 +73,6 @@ config 64BIT
select ATA_NONSTANDARD if ATA
default y
-config ZONE_DMA32
- def_bool y
-
config MMU
bool
default y
@@ -308,9 +306,6 @@ config NODES_SHIFT
MAX_NUMNODES will be 2^(This value).
If in doubt, use the default.
-config HOLES_IN_ZONE
- bool
-
config HAVE_ARCH_NODEDATA_EXTENSION
def_bool y
depends on NUMA
diff --git a/arch/ia64/include/asm/pal.h b/arch/ia64/include/asm/pal.h
index 5c51fceedaf9..e6b652f9e45e 100644
--- a/arch/ia64/include/asm/pal.h
+++ b/arch/ia64/include/asm/pal.h
@@ -99,6 +99,7 @@
#include <linux/types.h>
#include <asm/fpu.h>
+#include <asm/intrinsics.h>
/*
* Data types needed to pass information into PAL procedures and
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index 9601cfe83c94..0fb2b6291d58 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -52,7 +52,6 @@ pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, pgtable_t pte)
{
pmd_val(*pmd_entry) = page_to_phys(pte);
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
static inline void
pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte)
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index d765fd948fae..3f5dbbd8b9d8 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -128,7 +128,6 @@
#define PTRS_PER_PGD_SHIFT PTRS_PER_PTD_SHIFT
#define PTRS_PER_PGD (1UL << PTRS_PER_PGD_SHIFT)
#define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */
-#define FIRST_USER_ADDRESS 0UL
/*
* All the normal masks have the "page accessed" bits on, as any time
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 372e4e69c43a..05a729c6ad7f 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -34,6 +34,7 @@ config M68K
select SET_FS
select UACCESS_MEMCPY if !MMU
select VIRT_TO_BUS
+ select ZONE_DMA
config CPU_BIG_ENDIAN
def_bool y
@@ -62,10 +63,6 @@ config TIME_LOW_RES
config NO_IOPORT_MAP
def_bool y
-config ZONE_DMA
- bool
- default y
-
config HZ
int
default 1000 if CLEOPATRA
diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h
index bc1228e00518..5c2c0a864524 100644
--- a/arch/m68k/include/asm/mcf_pgalloc.h
+++ b/arch/m68k/include/asm/mcf_pgalloc.h
@@ -32,8 +32,6 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address)
#define pmd_populate_kernel pmd_populate
-#define pmd_pgtable(pmd) pfn_to_virt(pmd_val(pmd) >> PAGE_SHIFT)
-
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable,
unsigned long address)
{
diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h
index 8d4ec05996c5..6f2b87d7a50d 100644
--- a/arch/m68k/include/asm/mcf_pgtable.h
+++ b/arch/m68k/include/asm/mcf_pgtable.h
@@ -150,6 +150,8 @@
#ifndef __ASSEMBLY__
+#define pmd_pgtable(pmd) pfn_to_virt(pmd_val(pmd) >> PAGE_SHIFT)
+
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h
index b4fc3b4f6bb3..74a817d9387f 100644
--- a/arch/m68k/include/asm/motorola_pgalloc.h
+++ b/arch/m68k/include/asm/motorola_pgalloc.h
@@ -88,7 +88,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t page
{
pmd_set(pmd, page);
}
-#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h
index 8076467eff4b..a2908164ee6f 100644
--- a/arch/m68k/include/asm/motorola_pgtable.h
+++ b/arch/m68k/include/asm/motorola_pgtable.h
@@ -105,6 +105,8 @@ extern unsigned long mm_cachebits;
#define __S110 PAGE_SHARED_C
#define __S111 PAGE_SHARED_C
+#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
+
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index aca22c2c1ee2..143ba7de9bda 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -72,7 +72,6 @@
#define PTRS_PER_PGD 128
#endif
#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS 0UL
/* Virtual address region for use by kernel_map() */
#ifdef CONFIG_SUN3
diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h
index 000f64869b91..198036aff519 100644
--- a/arch/m68k/include/asm/sun3_pgalloc.h
+++ b/arch/m68k/include/asm/sun3_pgalloc.h
@@ -32,7 +32,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t page
{
pmd_val(*pmd) = __pa((unsigned long)page_address(page));
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
/*
* allocating and freeing a pmd is trivial: the 1-entry pmd is
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 0660f47012bc..14a67a42fcae 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -43,6 +43,7 @@ config MICROBLAZE
select MMU_GATHER_NO_RANGE
select SPARSE_IRQ
select SET_FS
+ select ZONE_DMA
# Endianness selection
choice
@@ -60,9 +61,6 @@ config CPU_LITTLE_ENDIAN
endchoice
-config ZONE_DMA
- def_bool y
-
config ARCH_HAS_ILOG2_U32
def_bool n
diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h
index d56b9f670ad1..6c33b05f730f 100644
--- a/arch/microblaze/include/asm/pgalloc.h
+++ b/arch/microblaze/include/asm/pgalloc.h
@@ -28,8 +28,6 @@ static inline pgd_t *get_pgd(void)
#define pgd_alloc(mm) get_pgd()
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte))
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index 9ae8d2c17dd5..71cd547655d9 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -25,8 +25,6 @@ extern int mem_init_done;
#include <asm/mmu.h>
#include <asm/page.h>
-#define FIRST_USER_ADDRESS 0UL
-
extern unsigned long va_to_phys(unsigned long address);
extern pte_t *va_to_pte(unsigned long address);
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index e039e7d542c4..cee6087cd686 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -3274,13 +3274,6 @@ config I8253
select CLKSRC_I8253
select CLKEVT_I8253
select MIPS_EXTERNAL_TIMER
-
-config ZONE_DMA
- bool
-
-config ZONE_DMA32
- bool
-
endmenu
config TRAD_SIGNALS
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index d0cf997b4ba8..4b2567d6b2df 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -28,7 +28,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
{
set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
/*
* Initialize a new pmd table with invalid pointers.
diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h
index 6c0532d7b211..95df9c293d8d 100644
--- a/arch/mips/include/asm/pgtable-32.h
+++ b/arch/mips/include/asm/pgtable-32.h
@@ -93,7 +93,6 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1,
#endif
#define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS 0UL
#define VMALLOC_START MAP_BASE
diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 1e7d6ce9d8d6..046465906c82 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -137,7 +137,6 @@
#define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t))
#define USER_PTRS_PER_PGD ((TASK_SIZE64 / PGDIR_SIZE)?(TASK_SIZE64 / PGDIR_SIZE):1)
-#define FIRST_USER_ADDRESS 0UL
/*
* TLB refill handlers also map the vmalloc area into xuseg. Avoid
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 57dc2ac4f8bd..40b210c65a5a 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -98,6 +98,9 @@
#define MADV_COLD 20 /* deactivate these pages */
#define MADV_PAGEOUT 21 /* reclaim these pages */
+#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */
+#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c
index 499a5357c09f..56b51de2dc51 100644
--- a/arch/mips/kernel/relocate.c
+++ b/arch/mips/kernel/relocate.c
@@ -18,6 +18,7 @@
#include <linux/kernel.h>
#include <linux/libfdt.h>
#include <linux/of_fdt.h>
+#include <linux/panic_notifier.h>
#include <linux/sched/task.h>
#include <linux/start_kernel.h>
#include <linux/string.h>
diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c
index c374f3ceec38..9028dbbb45dd 100644
--- a/arch/mips/sgi-ip22/ip22-reset.c
+++ b/arch/mips/sgi-ip22/ip22-reset.c
@@ -12,6 +12,7 @@
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
#include <linux/pm.h>
#include <linux/timer.h>
diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c
index 20d8637340be..18d1c115cd53 100644
--- a/arch/mips/sgi-ip32/ip32-reset.c
+++ b/arch/mips/sgi-ip32/ip32-reset.c
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/panic_notifier.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/notifier.h>
diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h
index 85c117347c86..a08e1ebca70e 100644
--- a/arch/nds32/include/asm/pgalloc.h
+++ b/arch/nds32/include/asm/pgalloc.h
@@ -12,11 +12,6 @@
#define __HAVE_ARCH_PTE_ALLOC_ONE
#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
-/*
- * Since we have only two-level page tables, these are trivial
- */
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
extern pgd_t *pgd_alloc(struct mm_struct *mm);
extern void pgd_free(struct mm_struct *mm, pgd_t * pgd);
diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index e6600d2a5ae0..3c4ae74d5798 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -25,7 +25,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
{
set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
/*
* Initialize a new pmd table with invalid pointers.
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index 2600d76c310c..4a995fa628ee 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -24,8 +24,6 @@
#include <asm/pgtable-bits.h>
#include <asm-generic/pgtable-nopmd.h>
-#define FIRST_USER_ADDRESS 0UL
-
#define VMALLOC_START CONFIG_NIOS2_KERNEL_MMU_REGION_BASE
#define VMALLOC_END (CONFIG_NIOS2_KERNEL_REGION_BASE - 1)
diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h
index 88820299ecc4..b7b2b8d16fad 100644
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@@ -72,6 +72,4 @@ do { \
tlb_remove_page((tlb), (pte)); \
} while (0)
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
#endif
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 9425bedab4fc..4ac591c9ca33 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -73,7 +73,6 @@ extern void paging_init(void);
*/
#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS 0UL
/*
* Kernels own virtual memory area.
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h
index dda557085311..6a7e98e71f1d 100644
--- a/arch/parisc/include/asm/pgalloc.h
+++ b/arch/parisc/include/asm/pgalloc.h
@@ -69,6 +69,5 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
#define pmd_populate(mm, pmd, pte_page) \
pmd_populate_kernel(mm, pmd, page_address(pte_page))
-#define pmd_pgtable(pmd) pmd_page(pmd)
#endif
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 39017210dbf0..7f33c29764cc 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -171,8 +171,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
* pgd entries used up by user/kernel:
*/
-#define FIRST_USER_ADDRESS 0UL
-
/* NB: The tlb miss handlers make certain assumptions about the order */
/* of the following bits, so be careful (One example, bits 25-31 */
/* are moved together in one instruction). */
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index ab78cba446ed..9e3c010c0f61 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -52,6 +52,9 @@
#define MADV_COLD 20 /* deactivate these pages */
#define MADV_PAGEOUT 21 /* reclaim these pages */
+#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */
+#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */
+
#define MADV_MERGEABLE 65 /* KSM may merge identical pages */
#define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */
diff --git a/arch/parisc/kernel/pdc_chassis.c b/arch/parisc/kernel/pdc_chassis.c
index 75ae88d13909..da154406d368 100644
--- a/arch/parisc/kernel/pdc_chassis.c
+++ b/arch/parisc/kernel/pdc_chassis.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/cache.h>
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 14b132cf95e2..df46324d5090 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -187,7 +187,7 @@ config PPC
select GENERIC_VDSO_TIME_NS
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP
- select HAVE_ARCH_HUGE_VMAP if PPC_BOOK3S_64 && PPC_RADIX_MMU
+ select HAVE_ARCH_HUGE_VMAP if PPC_RADIX_MMU || PPC_8xx
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN if PPC32 && PPC_PAGE_SHIFT <= 14
@@ -403,10 +403,6 @@ config PPC_ADV_DEBUG_DAC_RANGE
config PPC_DAWR
bool
-config ZONE_DMA
- bool
- default y if PPC_BOOK3E_64
-
config PGTABLE_LEVELS
int
default 2 if !PPC64
diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index 0e1263455d73..ad130e15a126 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -8,7 +8,6 @@
#include <asm/book3s/32/pgtable.h>
#endif
-#define FIRST_USER_ADDRESS 0UL
#ifndef __ASSEMBLY__
/* Insert a PTE, top-level function is out of line. It uses an inline
* low level function in the respective pgtable-* files
diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index 39be9aea86db..64b6c608eca4 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -66,10 +66,9 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
}
#ifdef CONFIG_PPC_4K_PAGES
-static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
- struct page *page, int writable)
+static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
- size_t size = huge_page_size(hstate_vma(vma));
+ size_t size = 1UL << shift;
if (size == SZ_16K)
return __pte(pte_val(entry) & ~_PAGE_HUGE);
diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
index 6e4faa0a9b35..997cec973406 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
@@ -178,6 +178,7 @@
#ifndef __ASSEMBLY__
#include <linux/mmdebug.h>
+#include <linux/sizes.h>
void mmu_pin_tlb(unsigned long top, bool readonly);
@@ -225,6 +226,48 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
BUG();
}
+static inline bool arch_vmap_try_size(unsigned long addr, unsigned long end, u64 pfn,
+ unsigned int max_page_shift, unsigned long size)
+{
+ if (end - addr < size)
+ return false;
+
+ if ((1UL << max_page_shift) < size)
+ return false;
+
+ if (!IS_ALIGNED(addr, size))
+ return false;
+
+ if (!IS_ALIGNED(PFN_PHYS(pfn), size))
+ return false;
+
+ return true;
+}
+
+static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end,
+ u64 pfn, unsigned int max_page_shift)
+{
+ if (arch_vmap_try_size(addr, end, pfn, max_page_shift, SZ_512K))
+ return SZ_512K;
+ if (PAGE_SIZE == SZ_16K)
+ return SZ_16K;
+ if (arch_vmap_try_size(addr, end, pfn, max_page_shift, SZ_16K))
+ return SZ_16K;
+ return PAGE_SIZE;
+}
+#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
+
+static inline int arch_vmap_pte_supported_shift(unsigned long size)
+{
+ if (size >= SZ_512K)
+ return 19;
+ else if (size >= SZ_16K)
+ return 14;
+ else
+ return PAGE_SHIFT;
+}
+#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift
+
/* patch sites */
extern s32 patch__itlbmiss_exit_1, patch__dtlbmiss_exit_1;
extern s32 patch__itlbmiss_perf, patch__dtlbmiss_perf;
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 96522f7f0618..f06ae00f2a65 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -54,7 +54,6 @@ extern int icache_44x_need_flush;
#define PGD_MASKED_BITS 0
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
-#define FIRST_USER_ADDRESS 0UL
#define pte_ERROR(e) \
pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 57cd3892bfe0..53fbfdfac93d 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -12,8 +12,6 @@
#include <asm/barrier.h>
#include <asm/asm-const.h>
-#define FIRST_USER_ADDRESS 0UL
-
/*
* Size of EA range mapped by our pagetables.
*/
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index 6dd78a2dc03a..3360cad78ace 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -70,9 +70,4 @@ extern struct kmem_cache *pgtable_cache[];
#include <asm/nohash/pgalloc.h>
#endif
-static inline pgtable_t pmd_pgtable(pmd_t pmd)
-{
- return (pgtable_t)pmd_page_vaddr(pmd);
-}
-
#endif /* _ASM_POWERPC_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index c6a676714f04..5969743719bc 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -152,6 +152,12 @@ static inline bool p4d_is_leaf(p4d_t p4d)
}
#endif
+#define pmd_pgtable pmd_pgtable
+static inline pgtable_t pmd_pgtable(pmd_t pmd)
+{
+ return (pgtable_t)pmd_page_vaddr(pmd);
+}
+
#ifdef CONFIG_PPC64
#define is_ioremap_addr is_ioremap_addr
static inline bool is_ioremap_addr(const void *x)
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 74a98fff2c2f..046fe21b5c3b 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -9,6 +9,7 @@
#undef DEBUG
#include <linux/export.h>
+#include <linux/panic_notifier.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/init.h>
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index f998e655b570..7d271de8fcbd 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -111,6 +111,7 @@ config PPC_BOOK3E_64
select PPC_FPU # Make it a choice ?
select PPC_SMP_MUXED_IPI
select PPC_DOORBELL
+ select ZONE_DMA
endchoice
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 15f9490a7aad..469a70bd8da6 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -104,6 +104,7 @@ config RISCV
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
select UACCESS_MEMCPY if !MMU
+ select ZONE_DMA32 if 64BIT
config ARCH_MMAP_RND_BITS_MIN
default 18 if 64BIT
@@ -133,10 +134,6 @@ config MMU
Select if you want MMU-based virtualised addressing space
support by paged memory management. If unsure, say 'Y'.
-config ZONE_DMA32
- bool
- default y if 64BIT
-
config VA_BITS
int
default 32 if 32BIT
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 23b1544e0ca5..0af6933a7100 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -38,8 +38,6 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 380cd3a7e548..62f3fe7368f3 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -536,8 +536,6 @@ void setup_bootmem(void);
void paging_init(void);
void misc_mem_init(void);
-#define FIRST_USER_ADDRESS 0
-
/*
* ZERO_PAGE is a global shared page that is always zero,
* used for zero-mapped memory areas, etc.
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a49971647f81..c448567b18ca 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -2,9 +2,6 @@
config MMU
def_bool y
-config ZONE_DMA
- def_bool y
-
config CPU_BIG_ENDIAN
def_bool y
@@ -62,7 +59,7 @@ config S390
select ARCH_BINFMT_ELF_STATE
select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM
select ARCH_ENABLE_MEMORY_HOTREMOVE
- select ARCH_ENABLE_SPLIT_PMD_PTLOCK
+ select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_DEVMEM_IS_ALLOWED
@@ -211,6 +208,7 @@ config S390
select THREAD_INFO_IN_TASK
select TTY
select VIRT_CPU_ACCOUNTING
+ select ZONE_DMA
# Note: keep the above list sorted alphabetically
config SCHED_OMIT_FRAME_POINTER
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 6b187cd72251..f14a555eff74 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -134,9 +134,6 @@ static inline void pmd_populate(struct mm_struct *mm,
#define pmd_populate_kernel(mm, pmd, pte) pmd_populate(mm, pmd, pte)
-#define pmd_pgtable(pmd) \
- ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE))
-
/*
* page table entry allocation/free routines.
*/
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index b38f7b781564..1f8f5da53262 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -65,8 +65,6 @@ extern unsigned long zero_page_mask;
/* TODO: s390 cannot support io_remap_pfn_range... */
-#define FIRST_USER_ADDRESS 0UL
-
#define pte_ERROR(e) \
printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e))
#define pmd_ERROR(e) \
@@ -1711,4 +1709,7 @@ extern void s390_reset_cmma(struct mm_struct *mm);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+#define pmd_pgtable(pmd) \
+ ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE))
+
#endif /* _S390_PAGE_H */
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index dba04fbc37a2..36f870dc944f 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/device.h>
#include <linux/delay.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/ctype.h>
#include <linux/fs.h>
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 74b0bd2c24d4..528bb31815c3 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -44,11 +44,6 @@ void *alloc_insn_page(void)
return page;
}
-void free_insn_page(void *page)
-{
- module_memfree(page);
-}
-
static void *alloc_s390_insn_page(void)
{
if (xchg(&insn_page_in_use, 1) == 1)
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 18205f851c24..eec3a9d7176e 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -691,7 +691,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
if (!non_swap_entry(entry))
dec_mm_counter(mm, MM_SWAPENTS);
else if (is_migration_entry(entry)) {
- struct page *page = migration_entry_to_page(entry);
+ struct page *page = pfn_swap_entry_to_page(entry);
dec_mm_counter(mm, mm_counter(page));
}
diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h
index 0e6b0be25e33..a9e98233c4d4 100644
--- a/arch/sh/include/asm/pgalloc.h
+++ b/arch/sh/include/asm/pgalloc.h
@@ -30,7 +30,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
{
set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
#define __pte_free_tlb(tlb,pte,addr) \
do { \
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 27751e9470df..d7ddb1ec86a0 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -59,8 +59,6 @@ static inline unsigned long long neff_sign_extend(unsigned long val)
/* Entries per level */
#define PTRS_PER_PTE (PAGE_SIZE / (1 << PTE_MAGNITUDE))
-#define FIRST_USER_ADDRESS 0UL
-
#define PHYS_ADDR_MASK29 0x1fffffff
#define PHYS_ADDR_MASK32 0xffffffff
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index c72f52c704cd..c5fa7932b550 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -59,6 +59,7 @@ config SPARC32
select CLZ_TAB
select HAVE_UID16
select OLD_SIGACTION
+ select ZONE_DMA
config SPARC64
def_bool 64BIT
@@ -141,10 +142,6 @@ config HIGHMEM
default y if SPARC32
select KMAP_LOCAL
-config ZONE_DMA
- bool
- default y if SPARC32
-
config GENERIC_ISA_DMA
bool
default y if SPARC32
diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h
index 9d353e6dc5a9..4f73e87b22a3 100644
--- a/arch/sparc/include/asm/pgalloc_32.h
+++ b/arch/sparc/include/asm/pgalloc_32.h
@@ -51,7 +51,6 @@ static inline void free_pmd_fast(pmd_t * pmd)
#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd)
#define pmd_populate(mm, pmd, pte) pmd_set(pmd, pte)
-#define pmd_pgtable(pmd) (pgtable_t)__pmd_page(pmd)
void pmd_set(pmd_t *pmdp, pte_t *ptep);
#define pmd_populate_kernel pmd_populate
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h
index a8dafc550985..7b5561d17ab1 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -67,7 +67,6 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage);
#define pmd_populate_kernel(MM, PMD, PTE) pmd_set(MM, PMD, PTE)
#define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE)
-#define pmd_pgtable(PMD) ((pte_t *)pmd_page_vaddr(PMD))
void pgtable_free(void *table, bool is_page);
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index a5cf79c149fe..ebaf374b55ab 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -48,7 +48,6 @@ unsigned long __init bootmem_init(unsigned long *pages_avail);
#define PTRS_PER_PMD 64
#define PTRS_PER_PGD 256
#define USER_PTRS_PER_PGD PAGE_OFFSET / PGDIR_SIZE
-#define FIRST_USER_ADDRESS 0UL
#define PTE_SIZE (PTRS_PER_PTE*4)
#define PAGE_NONE SRMMU_PAGE_NONE
@@ -433,4 +432,6 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
/* We provide our own get_unmapped_area to cope with VA holes for userland */
#define HAVE_ARCH_UNMAPPED_AREA
+#define pmd_pgtable(pmd) ((pgtable_t)__pmd_page(pmd))
+
#endif /* !(_SPARC_PGTABLE_H) */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 550d3904de65..e0ee48ec3903 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -95,9 +95,6 @@ bool kern_addr_valid(unsigned long addr);
#define PTRS_PER_PUD (1UL << PUD_BITS)
#define PTRS_PER_PGD (1UL << PGDIR_BITS)
-/* Kernel has a separate 44bit address space. */
-#define FIRST_USER_ADDRESS 0UL
-
#define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %p(%016lx) seen at (%pS)\n", \
__FILE__, __LINE__, &(e), pmd_val(e), __builtin_return_address(0))
@@ -377,8 +374,7 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot)
#define pgprot_noncached pgprot_noncached
#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
- struct page *page, int writable);
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
#define arch_make_huge_pte arch_make_huge_pte
static inline unsigned long __pte_default_huge_mask(void)
{
@@ -1121,6 +1117,8 @@ extern unsigned long cmdline_memory_size;
asmlinkage void do_sparc64_fault(struct pt_regs *regs);
+#define pmd_pgtable(PMD) ((pte_t *)pmd_page_vaddr(PMD))
+
#ifdef CONFIG_HUGETLB_PAGE
#define pud_leaf_size pud_leaf_size
diff --git a/arch/sparc/kernel/sstate.c b/arch/sparc/kernel/sstate.c
index ac8677c3841e..3bcc4ddc6911 100644
--- a/arch/sparc/kernel/sstate.c
+++ b/arch/sparc/kernel/sstate.c
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/init.h>
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 04d8790f6c32..0f49fada2093 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -177,10 +177,8 @@ static pte_t hugepage_shift_to_tte(pte_t entry, unsigned int shift)
return sun4u_hugepage_shift_to_tte(entry, shift);
}
-pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
- struct page *page, int writeable)
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
- unsigned int shift = huge_page_shift(hstate_vma(vma));
pte_t pte;
pte = hugepage_shift_to_tte(entry, shift);
@@ -188,7 +186,7 @@ pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
#ifdef CONFIG_SPARC64
/* If this vma has ADI enabled on it, turn on TTE.mcd
*/
- if (vma->vm_flags & VM_SPARC_ADI)
+ if (flags & VM_SPARC_ADI)
return pte_mkmcd(pte);
else
return pte_mknotmcd(pte);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 06e938d03f3b..1b23639e2fcd 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -27,6 +27,7 @@
#include <linux/percpu.h>
#include <linux/mmzone.h>
#include <linux/gfp.h>
+#include <linux/bootmem_info.h>
#include <asm/head.h>
#include <asm/page.h>
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 6d00af25ec6b..328b16f99b30 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/sched/debug.h>
#include <linux/proc_fs.h>
diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index 2bbf28cf3aa9..8ec7cd46dd96 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -19,7 +19,6 @@
set_pmd(pmd, __pmd(_PAGE_TABLE + \
((unsigned long long)page_to_pfn(pte) << \
(unsigned long long) PAGE_SHIFT)))
-#define pmd_pgtable(pmd) pmd_page(pmd)
/*
* Allocate and free page tables.
diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h
index 32106d31e4ab..8256ecc5b919 100644
--- a/arch/um/include/asm/pgtable-2level.h
+++ b/arch/um/include/asm/pgtable-2level.h
@@ -23,7 +23,6 @@
#define PTRS_PER_PTE 1024
#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE)
#define PTRS_PER_PGD 1024
-#define FIRST_USER_ADDRESS 0UL
#define pte_ERROR(e) \
printk("%s:%d: bad pte %p(%08lx).\n", __FILE__, __LINE__, &(e), \
diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h
index 7e6a4180db9d..9289a86643a9 100644
--- a/arch/um/include/asm/pgtable-3level.h
+++ b/arch/um/include/asm/pgtable-3level.h
@@ -41,7 +41,6 @@
#endif
#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE)
-#define FIRST_USER_ADDRESS 0UL
#define pte_ERROR(e) \
printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 74e07e748a9b..9512253947d5 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/panic_notifier.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/utsname.h>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 867e7936dbc5..49270655e827 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -33,6 +33,7 @@ config X86_64
select NEED_DMA_MAP_STATE
select SWIOTLB
select ARCH_HAS_ELFCORE_COMPAT
+ select ZONE_DMA32
config FORCE_DYNAMIC_FTRACE
def_bool y
@@ -63,7 +64,7 @@ config X86
select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
- select ARCH_ENABLE_SPLIT_PMD_PTLOCK if X86_64 || X86_PAE
+ select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_CACHE_LINE_SIZE
@@ -93,6 +94,7 @@ config X86
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_DEBUG_WX
+ select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
@@ -344,9 +346,6 @@ config ARCH_SUSPEND_POSSIBLE
config ARCH_WANT_GENERAL_HUGETLB
def_bool y
-config ZONE_DMA32
- def_bool y if X86_64
-
config AUDIT_ARCH
def_bool y if X86_64
@@ -394,16 +393,6 @@ config CC_HAS_SANE_STACKPROTECTOR
menu "Processor type and features"
-config ZONE_DMA
- bool "DMA memory allocation support" if EXPERT
- default y
- help
- DMA memory allocation support allows devices with less than 32-bit
- addressing to allocate within the first 16MB of address space.
- Disable if no such devices will be used.
-
- If unsure, say Y.
-
config SMP
bool "Symmetric multi-processing support"
help
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e63cf582201f..ab97b22ac04a 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -9,6 +9,7 @@
#include <asm/irq_vectors.h>
#include <asm/cpu_entry_area.h>
+#include <linux/debug_locks.h>
#include <linux/smp.h>
#include <linux/percpu.h>
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 62ad61d6fefc..c7ec5bb88334 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -84,8 +84,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
#if CONFIG_PGTABLE_LEVELS > 2
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f24d7ef8fffa..40497a9020c6 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -7,8 +7,6 @@
#include <asm/page_types.h>
-#define FIRST_USER_ADDRESS 0UL
-
#define _PAGE_BIT_PRESENT 0 /* is present */
#define _PAGE_BIT_RW 1 /* writeable */
#define _PAGE_BIT_USER 2 /* userspace addressable */
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 01ca94f42e4e..cc8f1773deca 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -17,6 +17,7 @@
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/i8253.h>
+#include <linux/panic_notifier.h>
#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index c492ad3001ca..b6e046e4b289 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -422,12 +422,6 @@ void *alloc_insn_page(void)
return page;
}
-/* Recover page to RW mode before releasing it */
-void free_insn_page(void *page)
-{
- module_memfree(page);
-}
-
/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 85acd22f8022..9f1d9215a9fb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -14,6 +14,7 @@
#include <linux/initrd.h>
#include <linux/iscsi_ibft.h>
#include <linux/memblock.h>
+#include <linux/panic_notifier.h>
#include <linux/pci.h>
#include <linux/root_dev.h>
#include <linux/hugetlb.h>
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e527d829e1ed..65ea58527176 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -33,6 +33,7 @@
#include <linux/nmi.h>
#include <linux/gfp.h>
#include <linux/kcore.h>
+#include <linux/bootmem_info.h>
#include <asm/processor.h>
#include <asm/bios_ebda.h>
@@ -1269,7 +1270,7 @@ static struct kcore_list kcore_vsyscall;
static void __init register_page_bootmem_info(void)
{
-#ifdef CONFIG_NUMA
+#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)
int i;
for_each_online_node(i)
@@ -1623,7 +1624,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
return err;
}
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long nr_pages)
{
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d27cf69e811d..1303ff6ef7be 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -682,6 +682,7 @@ int p4d_clear_huge(p4d_t *p4d)
}
#endif
+#if CONFIG_PGTABLE_LEVELS > 3
/**
* pud_set_huge - setup kernel PUD mapping
*
@@ -721,6 +722,23 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
}
/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
+int pud_clear_huge(pud_t *pud)
+{
+ if (pud_large(*pud)) {
+ pud_clear(pud);
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 2
+/**
* pmd_set_huge - setup kernel PMD mapping
*
* See text over pud_set_huge() above.
@@ -751,21 +769,6 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
}
/**
- * pud_clear_huge - clear kernel PUD mapping when it is set
- *
- * Returns 1 on success and 0 on failure (no PUD map is found).
- */
-int pud_clear_huge(pud_t *pud)
-{
- if (pud_large(*pud)) {
- pud_clear(pud);
- return 1;
- }
-
- return 0;
-}
-
-/**
* pmd_clear_huge - clear kernel PMD mapping when it is set
*
* Returns 1 on success and 0 on failure (no PMD map is found).
@@ -779,6 +782,7 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
+#endif
#ifdef CONFIG_X86_64
/**
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index f03b64d9cb51..7558139920f8 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -9,6 +9,8 @@
*/
#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
#include <crypto/sha2.h>
#include <asm/purgatory.h>
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index aa9f50fccc5d..c79bd0af2e8c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -6,6 +6,7 @@
#include <linux/cpu.h>
#include <linux/kexec.h>
#include <linux/slab.h>
+#include <linux/panic_notifier.h>
#include <xen/xen.h>
#include <xen/features.h>
diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h
index d3a22da4d2c9..eeb2de3a89e5 100644
--- a/arch/xtensa/include/asm/pgalloc.h
+++ b/arch/xtensa/include/asm/pgalloc.h
@@ -25,7 +25,6 @@
(pmd_val(*(pmdp)) = ((unsigned long)ptep))
#define pmd_populate(mm, pmdp, page) \
(pmd_val(*(pmdp)) = ((unsigned long)page_to_virt(page)))
-#define pmd_pgtable(pmd) pmd_page(pmd)
static inline pgd_t*
pgd_alloc(struct mm_struct *mm)
@@ -63,7 +62,6 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
return page;
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
#endif /* CONFIG_MMU */
#endif /* _XTENSA_PGALLOC_H */
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index d7fc45c920c2..bd5aeb795567 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -59,7 +59,6 @@
#define PTRS_PER_PGD 1024
#define PGD_ORDER 0
#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS 0UL
#define FIRST_USER_PGD_NR (FIRST_USER_ADDRESS >> PGDIR_SHIFT)
#ifdef CONFIG_MMU
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index e5e643752947..b3a22095371b 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -106,6 +106,9 @@
#define MADV_COLD 20 /* deactivate these pages */
#define MADV_PAGEOUT 21 /* reclaim these pages */
+#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */
+#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/xtensa/platforms/iss/setup.c b/arch/xtensa/platforms/iss/setup.c
index ed519aee0ec8..d3433e1bb94e 100644
--- a/arch/xtensa/platforms/iss/setup.c
+++ b/arch/xtensa/platforms/iss/setup.c
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
#include <linux/printk.h>
#include <linux/string.h>
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 74c411911b6e..80c3b43b4828 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -113,8 +113,8 @@ struct zram {
* zram is claimed so open request will be failed
*/
bool claim; /* Protected by disk->open_mutex */
- struct file *backing_dev;
#ifdef CONFIG_ZRAM_WRITEBACK
+ struct file *backing_dev;
spinlock_t wb_limit_lock;
bool wb_limit_enable;
u64 bd_wb_limit;
diff --git a/drivers/bus/brcmstb_gisb.c b/drivers/bus/brcmstb_gisb.c
index 7355fa2cb439..6551286a60cc 100644
--- a/drivers/bus/brcmstb_gisb.c
+++ b/drivers/bus/brcmstb_gisb.c
@@ -6,6 +6,7 @@
#include <linux/init.h>
#include <linux/types.h>
#include <linux/module.h>
+#include <linux/panic_notifier.h>
#include <linux/platform_device.h>
#include <linux/interrupt.h>
#include <linux/sysfs.h>
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 8a0e97b33cae..e96cb5c4f97a 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/errno.h>
+#include <linux/panic_notifier.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
diff --git a/drivers/clk/analogbits/wrpll-cln28hpc.c b/drivers/clk/analogbits/wrpll-cln28hpc.c
index 3b1947575dcc..09ca82356399 100644
--- a/drivers/clk/analogbits/wrpll-cln28hpc.c
+++ b/drivers/clk/analogbits/wrpll-cln28hpc.c
@@ -23,8 +23,12 @@
#include <linux/bug.h>
#include <linux/err.h>
+#include <linux/limits.h>
#include <linux/log2.h>
#include <linux/math64.h>
+#include <linux/math.h>
+#include <linux/minmax.h>
+
#include <linux/clk/analogbits-wrpll-cln28hpc.h>
/* MIN_INPUT_FREQ: minimum input clock frequency, in Hz (Fref_min) */
diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
index 5f7fd79ec82f..61c21bd880a4 100644
--- a/drivers/edac/altera_edac.c
+++ b/drivers/edac/altera_edac.c
@@ -20,6 +20,7 @@
#include <linux/of_address.h>
#include <linux/of_irq.h>
#include <linux/of_platform.h>
+#include <linux/panic_notifier.h>
#include <linux/platform_device.h>
#include <linux/regmap.h>
#include <linux/types.h>
diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c
index bb6e77ee3898..adaa492c3d2d 100644
--- a/drivers/firmware/google/gsmi.c
+++ b/drivers/firmware/google/gsmi.c
@@ -19,6 +19,7 @@
#include <linux/dma-mapping.h>
#include <linux/fs.h>
#include <linux/slab.h>
+#include <linux/panic_notifier.h>
#include <linux/ioctl.h>
#include <linux/acpi.h>
#include <linux/io.h>
diff --git a/drivers/gpu/drm/nouveau/include/nvif/if000c.h b/drivers/gpu/drm/nouveau/include/nvif/if000c.h
index d6dd40f21eed..9c7ff56831c5 100644
--- a/drivers/gpu/drm/nouveau/include/nvif/if000c.h
+++ b/drivers/gpu/drm/nouveau/include/nvif/if000c.h
@@ -77,6 +77,7 @@ struct nvif_vmm_pfnmap_v0 {
#define NVIF_VMM_PFNMAP_V0_APER 0x00000000000000f0ULL
#define NVIF_VMM_PFNMAP_V0_HOST 0x0000000000000000ULL
#define NVIF_VMM_PFNMAP_V0_VRAM 0x0000000000000010ULL
+#define NVIF_VMM_PFNMAP_V0_A 0x0000000000000004ULL
#define NVIF_VMM_PFNMAP_V0_W 0x0000000000000002ULL
#define NVIF_VMM_PFNMAP_V0_V 0x0000000000000001ULL
#define NVIF_VMM_PFNMAP_V0_NONE 0x0000000000000000ULL
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 1c3f890377d2..82b583f5fca8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -35,6 +35,7 @@
#include <linux/sched/mm.h>
#include <linux/sort.h>
#include <linux/hmm.h>
+#include <linux/rmap.h>
struct nouveau_svm {
struct nouveau_drm *drm;
@@ -67,6 +68,11 @@ struct nouveau_svm {
} buffer[1];
};
+#define FAULT_ACCESS_READ 0
+#define FAULT_ACCESS_WRITE 1
+#define FAULT_ACCESS_ATOMIC 2
+#define FAULT_ACCESS_PREFETCH 3
+
#define SVM_DBG(s,f,a...) NV_DEBUG((s)->drm, "svm: "f"\n", ##a)
#define SVM_ERR(s,f,a...) NV_WARN((s)->drm, "svm: "f"\n", ##a)
@@ -265,7 +271,7 @@ nouveau_svmm_invalidate_range_start(struct mmu_notifier *mn,
* the invalidation is handled as part of the migration process.
*/
if (update->event == MMU_NOTIFY_MIGRATE &&
- update->migrate_pgmap_owner == svmm->vmm->cli->drm->dev)
+ update->owner == svmm->vmm->cli->drm->dev)
goto out;
if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) {
@@ -412,6 +418,24 @@ nouveau_svm_fault_cancel_fault(struct nouveau_svm *svm,
}
static int
+nouveau_svm_fault_priority(u8 fault)
+{
+ switch (fault) {
+ case FAULT_ACCESS_PREFETCH:
+ return 0;
+ case FAULT_ACCESS_READ:
+ return 1;
+ case FAULT_ACCESS_WRITE:
+ return 2;
+ case FAULT_ACCESS_ATOMIC:
+ return 3;
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+}
+
+static int
nouveau_svm_fault_cmp(const void *a, const void *b)
{
const struct nouveau_svm_fault *fa = *(struct nouveau_svm_fault **)a;
@@ -421,9 +445,8 @@ nouveau_svm_fault_cmp(const void *a, const void *b)
return ret;
if ((ret = (s64)fa->addr - fb->addr))
return ret;
- /*XXX: atomic? */
- return (fa->access == 0 || fa->access == 3) -
- (fb->access == 0 || fb->access == 3);
+ return nouveau_svm_fault_priority(fa->access) -
+ nouveau_svm_fault_priority(fb->access);
}
static void
@@ -487,6 +510,10 @@ static bool nouveau_svm_range_invalidate(struct mmu_interval_notifier *mni,
struct svm_notifier *sn =
container_of(mni, struct svm_notifier, notifier);
+ if (range->event == MMU_NOTIFY_EXCLUSIVE &&
+ range->owner == sn->svmm->vmm->cli->drm->dev)
+ return true;
+
/*
* serializes the update to mni->invalidate_seq done by caller and
* prevents invalidation of the PTE from progressing while HW is being
@@ -555,6 +582,71 @@ static void nouveau_hmm_convert_pfn(struct nouveau_drm *drm,
args->p.phys[0] |= NVIF_VMM_PFNMAP_V0_W;
}
+static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm,
+ struct nouveau_drm *drm,
+ struct nouveau_pfnmap_args *args, u32 size,
+ struct svm_notifier *notifier)
+{
+ unsigned long timeout =
+ jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+ struct mm_struct *mm = svmm->notifier.mm;
+ struct page *page;
+ unsigned long start = args->p.addr;
+ unsigned long notifier_seq;
+ int ret = 0;
+
+ ret = mmu_interval_notifier_insert(&notifier->notifier, mm,
+ args->p.addr, args->p.size,
+ &nouveau_svm_mni_ops);
+ if (ret)
+ return ret;
+
+ while (true) {
+ if (time_after(jiffies, timeout)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ notifier_seq = mmu_interval_read_begin(&notifier->notifier);
+ mmap_read_lock(mm);
+ ret = make_device_exclusive_range(mm, start, start + PAGE_SIZE,
+ &page, drm->dev);
+ mmap_read_unlock(mm);
+ if (ret <= 0 || !page) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mutex_lock(&svmm->mutex);
+ if (!mmu_interval_read_retry(&notifier->notifier,
+ notifier_seq))
+ break;
+ mutex_unlock(&svmm->mutex);
+ }
+
+ /* Map the page on the GPU. */
+ args->p.page = 12;
+ args->p.size = PAGE_SIZE;
+ args->p.addr = start;
+ args->p.phys[0] = page_to_phys(page) |
+ NVIF_VMM_PFNMAP_V0_V |
+ NVIF_VMM_PFNMAP_V0_W |
+ NVIF_VMM_PFNMAP_V0_A |
+ NVIF_VMM_PFNMAP_V0_HOST;
+
+ svmm->vmm->vmm.object.client->super = true;
+ ret = nvif_object_ioctl(&svmm->vmm->vmm.object, args, size, NULL);
+ svmm->vmm->vmm.object.client->super = false;
+ mutex_unlock(&svmm->mutex);
+
+ unlock_page(page);
+ put_page(page);
+
+out:
+ mmu_interval_notifier_remove(&notifier->notifier);
+ return ret;
+}
+
static int nouveau_range_fault(struct nouveau_svmm *svmm,
struct nouveau_drm *drm,
struct nouveau_pfnmap_args *args, u32 size,
@@ -567,18 +659,27 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
unsigned long hmm_pfns[1];
struct hmm_range range = {
.notifier = &notifier->notifier,
- .start = notifier->notifier.interval_tree.start,
- .end = notifier->notifier.interval_tree.last + 1,
.default_flags = hmm_flags,
.hmm_pfns = hmm_pfns,
.dev_private_owner = drm->dev,
};
- struct mm_struct *mm = notifier->notifier.mm;
+ struct mm_struct *mm = svmm->notifier.mm;
int ret;
+ ret = mmu_interval_notifier_insert(&notifier->notifier, mm,
+ args->p.addr, args->p.size,
+ &nouveau_svm_mni_ops);
+ if (ret)
+ return ret;
+
+ range.start = notifier->notifier.interval_tree.start;
+ range.end = notifier->notifier.interval_tree.last + 1;
+
while (true) {
- if (time_after(jiffies, timeout))
- return -EBUSY;
+ if (time_after(jiffies, timeout)) {
+ ret = -EBUSY;
+ goto out;
+ }
range.notifier_seq = mmu_interval_read_begin(range.notifier);
mmap_read_lock(mm);
@@ -587,7 +688,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
if (ret) {
if (ret == -EBUSY)
continue;
- return ret;
+ goto out;
}
mutex_lock(&svmm->mutex);
@@ -606,6 +707,9 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
svmm->vmm->vmm.object.client->super = false;
mutex_unlock(&svmm->mutex);
+out:
+ mmu_interval_notifier_remove(&notifier->notifier);
+
return ret;
}
@@ -625,7 +729,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
unsigned long hmm_flags;
u64 inst, start, limit;
int fi, fn;
- int replay = 0, ret;
+ int replay = 0, atomic = 0, ret;
/* Parse available fault buffer entries into a cache, and update
* the GET pointer so HW can reuse the entries.
@@ -706,12 +810,14 @@ nouveau_svm_fault(struct nvif_notify *notify)
/*
* Determine required permissions based on GPU fault
* access flags.
- * XXX: atomic?
*/
switch (buffer->fault[fi]->access) {
case 0: /* READ. */
hmm_flags = HMM_PFN_REQ_FAULT;
break;
+ case 2: /* ATOMIC. */
+ atomic = true;
+ break;
case 3: /* PREFETCH. */
hmm_flags = 0;
break;
@@ -727,14 +833,14 @@ nouveau_svm_fault(struct nvif_notify *notify)
}
notifier.svmm = svmm;
- ret = mmu_interval_notifier_insert(&notifier.notifier, mm,
- args.i.p.addr, args.i.p.size,
- &nouveau_svm_mni_ops);
- if (!ret) {
+ if (atomic)
+ ret = nouveau_atomic_range_fault(svmm, svm->drm,
+ &args.i, sizeof(args),
+ &notifier);
+ else
ret = nouveau_range_fault(svmm, svm->drm, &args.i,
- sizeof(args), hmm_flags, &notifier);
- mmu_interval_notifier_remove(&notifier.notifier);
- }
+ sizeof(args), hmm_flags,
+ &notifier);
mmput(mm);
limit = args.i.p.addr + args.i.p.size;
@@ -750,11 +856,15 @@ nouveau_svm_fault(struct nvif_notify *notify)
*/
if (buffer->fault[fn]->svmm != svmm ||
buffer->fault[fn]->addr >= limit ||
- (buffer->fault[fi]->access == 0 /* READ. */ &&
+ (buffer->fault[fi]->access == FAULT_ACCESS_READ &&
!(args.phys[0] & NVIF_VMM_PFNMAP_V0_V)) ||
- (buffer->fault[fi]->access != 0 /* READ. */ &&
- buffer->fault[fi]->access != 3 /* PREFETCH. */ &&
- !(args.phys[0] & NVIF_VMM_PFNMAP_V0_W)))
+ (buffer->fault[fi]->access != FAULT_ACCESS_READ &&
+ buffer->fault[fi]->access != FAULT_ACCESS_PREFETCH &&
+ !(args.phys[0] & NVIF_VMM_PFNMAP_V0_W)) ||
+ (buffer->fault[fi]->access != FAULT_ACCESS_READ &&
+ buffer->fault[fi]->access != FAULT_ACCESS_WRITE &&
+ buffer->fault[fi]->access != FAULT_ACCESS_PREFETCH &&
+ !(args.phys[0] & NVIF_VMM_PFNMAP_V0_A)))
break;
}
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h
index a2b179568970..f6188aa9171c 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h
@@ -178,6 +178,7 @@ void nvkm_vmm_unmap_region(struct nvkm_vmm *, struct nvkm_vma *);
#define NVKM_VMM_PFN_APER 0x00000000000000f0ULL
#define NVKM_VMM_PFN_HOST 0x0000000000000000ULL
#define NVKM_VMM_PFN_VRAM 0x0000000000000010ULL
+#define NVKM_VMM_PFN_A 0x0000000000000004ULL
#define NVKM_VMM_PFN_W 0x0000000000000002ULL
#define NVKM_VMM_PFN_V 0x0000000000000001ULL
#define NVKM_VMM_PFN_NONE 0x0000000000000000ULL
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c
index 236db5570771..f02abd9cb4dd 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c
@@ -88,6 +88,9 @@ gp100_vmm_pgt_pfn(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt,
if (!(*map->pfn & NVKM_VMM_PFN_W))
data |= BIT_ULL(6); /* RO. */
+ if (!(*map->pfn & NVKM_VMM_PFN_A))
+ data |= BIT_ULL(7); /* Atomic disable. */
+
if (!(*map->pfn & NVKM_VMM_PFN_VRAM)) {
addr = *map->pfn >> NVKM_VMM_PFN_ADDR_SHIFT;
addr = dma_map_page(dev, pfn_to_page(addr), 0,
@@ -322,6 +325,9 @@ gp100_vmm_pd0_pfn(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt,
if (!(*map->pfn & NVKM_VMM_PFN_W))
data |= BIT_ULL(6); /* RO. */
+ if (!(*map->pfn & NVKM_VMM_PFN_A))
+ data |= BIT_ULL(7); /* Atomic disable. */
+
if (!(*map->pfn & NVKM_VMM_PFN_VRAM)) {
addr = *map->pfn >> NVKM_VMM_PFN_ADDR_SHIFT;
addr = dma_map_page(dev, pfn_to_page(addr), 0,
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 92cb3f7d21d9..57bbbaa4e8f7 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -25,6 +25,7 @@
#include <linux/delay.h>
#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
#include <linux/ptrace.h>
#include <linux/screen_info.h>
#include <linux/kdebug.h>
diff --git a/drivers/hwtracing/coresight/coresight-cpu-debug.c b/drivers/hwtracing/coresight/coresight-cpu-debug.c
index 2dcf13de751f..9731d3a96073 100644
--- a/drivers/hwtracing/coresight/coresight-cpu-debug.c
+++ b/drivers/hwtracing/coresight/coresight-cpu-debug.c
@@ -17,6 +17,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
+#include <linux/panic_notifier.h>
#include <linux/pm_qos.h>
#include <linux/slab.h>
#include <linux/smp.h>
diff --git a/drivers/leds/trigger/ledtrig-activity.c b/drivers/leds/trigger/ledtrig-activity.c
index 14ba7faaed9e..30bc9df03636 100644
--- a/drivers/leds/trigger/ledtrig-activity.c
+++ b/drivers/leds/trigger/ledtrig-activity.c
@@ -11,6 +11,7 @@
#include <linux/kernel_stat.h>
#include <linux/leds.h>
#include <linux/module.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/sched.h>
#include <linux/slab.h>
diff --git a/drivers/leds/trigger/ledtrig-heartbeat.c b/drivers/leds/trigger/ledtrig-heartbeat.c
index 36b6709afe9f..7fe0a05574d2 100644
--- a/drivers/leds/trigger/ledtrig-heartbeat.c
+++ b/drivers/leds/trigger/ledtrig-heartbeat.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/panic_notifier.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/sched.h>
diff --git a/drivers/leds/trigger/ledtrig-panic.c b/drivers/leds/trigger/ledtrig-panic.c
index 5751cd032f9d..64abf2e91608 100644
--- a/drivers/leds/trigger/ledtrig-panic.c
+++ b/drivers/leds/trigger/ledtrig-panic.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
#include <linux/leds.h>
#include "../leds.h"
diff --git a/drivers/misc/bcm-vk/bcm_vk_dev.c b/drivers/misc/bcm-vk/bcm_vk_dev.c
index 6bfea3210389..ad639ee85b2a 100644
--- a/drivers/misc/bcm-vk/bcm_vk_dev.c
+++ b/drivers/misc/bcm-vk/bcm_vk_dev.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/interrupt.h>
+#include <linux/panic_notifier.h>
#include <linux/kref.h>
#include <linux/module.h>
#include <linux/mutex.h>
diff --git a/drivers/misc/ibmasm/heartbeat.c b/drivers/misc/ibmasm/heartbeat.c
index 4f5f3bdc814d..59c9a0d95659 100644
--- a/drivers/misc/ibmasm/heartbeat.c
+++ b/drivers/misc/ibmasm/heartbeat.c
@@ -9,6 +9,7 @@
*/
#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
#include "ibmasm.h"
#include "dot_command.h"
#include "lowlevel.h"
diff --git a/drivers/misc/pvpanic/pvpanic.c b/drivers/misc/pvpanic/pvpanic.c
index 65f70a4da8c0..793ea0c01193 100644
--- a/drivers/misc/pvpanic/pvpanic.c
+++ b/drivers/misc/pvpanic/pvpanic.c
@@ -13,6 +13,7 @@
#include <linux/mod_devicetable.h>
#include <linux/module.h>
#include <linux/platform_device.h>
+#include <linux/panic_notifier.h>
#include <linux/types.h>
#include <linux/cdev.h>
#include <linux/list.h>
diff --git a/drivers/net/ipa/ipa_smp2p.c b/drivers/net/ipa/ipa_smp2p.c
index cf709df70d28..93270e50b6b3 100644
--- a/drivers/net/ipa/ipa_smp2p.c
+++ b/drivers/net/ipa/ipa_smp2p.c
@@ -8,6 +8,7 @@
#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
#include <linux/soc/qcom/smem.h>
#include <linux/soc/qcom/smem_state.h>
diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c
index ebaf6867b457..456776bd8ee6 100644
--- a/drivers/parisc/power.c
+++ b/drivers/parisc/power.c
@@ -38,6 +38,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
diff --git a/drivers/power/reset/ltc2952-poweroff.c b/drivers/power/reset/ltc2952-poweroff.c
index d1495af30081..8688c8ba8894 100644
--- a/drivers/power/reset/ltc2952-poweroff.c
+++ b/drivers/power/reset/ltc2952-poweroff.c
@@ -52,6 +52,7 @@
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/module.h>
+#include <linux/panic_notifier.h>
#include <linux/mod_devicetable.h>
#include <linux/gpio/consumer.h>
#include <linux/reboot.h>
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 626a6b90fba2..76dd8e2b1e7e 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -20,6 +20,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/device.h>
+#include <linux/panic_notifier.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/dma-map-ops.h>
diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c
index 1fd5bca9fa20..02523f4e29f4 100644
--- a/drivers/s390/char/con3215.c
+++ b/drivers/s390/char/con3215.c
@@ -19,6 +19,7 @@
#include <linux/console.h>
#include <linux/interrupt.h>
#include <linux/err.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/serial.h> /* ASYNC_* flags */
#include <linux/slab.h>
diff --git a/drivers/s390/char/con3270.c b/drivers/s390/char/con3270.c
index e21962c0fd94..87cdbace1453 100644
--- a/drivers/s390/char/con3270.c
+++ b/drivers/s390/char/con3270.c
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/list.h>
+#include <linux/panic_notifier.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/err.h>
diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
index 986bbbc23d0a..6627820a5eb9 100644
--- a/drivers/s390/char/sclp.c
+++ b/drivers/s390/char/sclp.c
@@ -11,6 +11,7 @@
#include <linux/kernel_stat.h>
#include <linux/module.h>
#include <linux/err.h>
+#include <linux/panic_notifier.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/timer.h>
diff --git a/drivers/s390/char/sclp_con.c b/drivers/s390/char/sclp_con.c
index 9b852a47ccc1..cc01a7b8595d 100644
--- a/drivers/s390/char/sclp_con.c
+++ b/drivers/s390/char/sclp_con.c
@@ -10,6 +10,7 @@
#include <linux/kmod.h>
#include <linux/console.h>
#include <linux/init.h>
+#include <linux/panic_notifier.h>
#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/termios.h>
diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c
index 7f4445b0f819..5b8a7b090a97 100644
--- a/drivers/s390/char/sclp_vt220.c
+++ b/drivers/s390/char/sclp_vt220.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
+#include <linux/panic_notifier.h>
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/timer.h>
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index bd3c724bf695..b5b0848da93b 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <asm/asm-offsets.h>
diff --git a/drivers/soc/bcm/brcmstb/pm/pm-arm.c b/drivers/soc/bcm/brcmstb/pm/pm-arm.c
index a673fdffe216..3cbb165d6e30 100644
--- a/drivers/soc/bcm/brcmstb/pm/pm-arm.c
+++ b/drivers/soc/bcm/brcmstb/pm/pm-arm.c
@@ -28,6 +28,7 @@
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/of_address.h>
+#include <linux/panic_notifier.h>
#include <linux/platform_device.h>
#include <linux/pm.h>
#include <linux/printk.h>
diff --git a/drivers/staging/olpc_dcon/olpc_dcon.c b/drivers/staging/olpc_dcon/olpc_dcon.c
index 6d8e9a481786..7284cb4ac395 100644
--- a/drivers/staging/olpc_dcon/olpc_dcon.c
+++ b/drivers/staging/olpc_dcon/olpc_dcon.c
@@ -22,6 +22,7 @@
#include <linux/device.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/olpc-ec.h>
#include <asm/tsc.h>
diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
index a7e6eea2c4a1..23999df52739 100644
--- a/drivers/video/fbdev/hyperv_fb.c
+++ b/drivers/video/fbdev/hyperv_fb.c
@@ -52,6 +52,7 @@
#include <linux/completion.h>
#include <linux/fb.h>
#include <linux/pci.h>
+#include <linux/panic_notifier.h>
#include <linux/efi.h>
#include <linux/console.h>
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 10ec60d81e84..dc2a2e2b2ff8 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1065,6 +1065,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
static void virtio_mem_set_fake_offline(unsigned long pfn,
unsigned long nr_pages, bool onlined)
{
+ page_offline_begin();
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -1075,6 +1076,7 @@ static void virtio_mem_set_fake_offline(unsigned long pfn,
ClearPageReserved(page);
}
}
+ page_offline_end();
}
/*
diff --git a/fs/Kconfig b/fs/Kconfig
index 141a856c50e7..a7749c126b8e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -240,6 +240,21 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
+config HUGETLB_PAGE_FREE_VMEMMAP
+ def_bool HUGETLB_PAGE
+ depends on X86_64
+ depends on SPARSEMEM_VMEMMAP
+
+config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
+ bool "Default freeing vmemmap pages of HugeTLB to on"
+ default n
+ depends on HUGETLB_PAGE_FREE_VMEMMAP
+ help
+ When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap
+ pages associated with each HugeTLB page is default off. Say Y here
+ to enable freeing vmemmap pages of HugeTLB by default. It can then
+ be disabled on the command line via hugetlb_free_vmemmap=off.
+
config MEMFD_CREATE
def_bool TMPFS || HUGETLBFS
diff --git a/fs/exec.c b/fs/exec.c
index f2bcdbeb3afb..38f63451b928 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -84,9 +84,6 @@ static DEFINE_RWLOCK(binfmt_lock);
void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
- BUG_ON(!fmt);
- if (WARN_ON(!fmt->load_binary))
- return;
write_lock(&binfmt_lock);
insert ? list_add(&fmt->lh, &formats) :
list_add_tail(&fmt->lh, &formats);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 70e8374ddac4..6fef67c2a9f0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -281,6 +281,11 @@ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct inode *inode = d_inode(path->dentry);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+ if (request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime = hfsp_mt2ut(hip->create_date);
+ }
+
if (inode->i_flags & S_APPEND)
stat->attributes |= STATX_ATTR_APPEND;
if (inode->i_flags & S_IMMUTABLE)
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 4d169c5a2673..e2855ceefd39 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -204,7 +204,6 @@ check_attr_tree_state_again:
buf = kzalloc(node_size, GFP_NOFS);
if (!buf) {
- pr_err("failed to allocate memory for header node\n");
err = -ENOMEM;
goto end_attr_file_creation;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b517a8794400..cd5eac2ba054 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2351,7 +2351,7 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode)
static void seq_quote_mem(struct seq_file *m, char *data, int len)
{
seq_printf(m, "\"");
- seq_escape_mem_ascii(m, data, len);
+ seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\");
seq_printf(m, "\"");
}
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index f42ab57201e7..ab9ec073330f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -738,7 +738,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
if (ptr2 != ptr + cnt || ++cnt == maxblocks)
goto end;
index++;
- continue;
}
if (level == maxlevel)
break;
diff --git a/fs/open.c b/fs/open.c
index 53bc0573c0ec..1a325b3194df 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -852,8 +852,17 @@ static int do_dentry_open(struct file *f,
* XXX: Huge page cache doesn't support writing yet. Drop all page
* cache for this file before processing writes.
*/
- if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
- truncate_pagecache(inode, 0);
+ if (f->f_mode & FMODE_WRITE) {
+ /*
+ * Paired with smp_mb() in collapse_file() to ensure nr_thps
+ * is up to date and the update to i_writecount by
+ * get_write_access() is visible. Ensures subsequent insertion
+ * of THPs into the page cache will fail.
+ */
+ smp_mb();
+ if (filemap_nr_thps(inode->i_mapping))
+ truncate_pagecache(inode, 0);
+ }
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9cbd915025ad..e5b5f7709d48 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -854,7 +854,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
while (count > 0) {
- int this_len = min_t(int, count, PAGE_SIZE);
+ size_t this_len = min_t(size_t, count, PAGE_SIZE);
if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
@@ -3172,7 +3172,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
@@ -3517,7 +3517,7 @@ static const struct inode_operations proc_tid_comm_inode_operations = {
*/
static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 07fc4fad2602..172c86270b31 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -6,6 +6,7 @@
#include <linux/fdtable.h>
#include <linux/namei.h>
#include <linux/pid.h>
+#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
@@ -53,9 +54,10 @@ static int seq_show(struct seq_file *m, void *v)
if (ret)
return ret;
- seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n",
(long long)file->f_pos, f_flags,
- real_mount(file->f_path.mnt)->mnt_id);
+ real_mount(file->f_path.mnt)->mnt_id,
+ file_inode(file)->i_ino);
/* show_fd_locks() never deferences files so a stale value is safe */
show_fd_locks(m, file, files);
@@ -72,6 +74,18 @@ out:
static int seq_fdinfo_open(struct inode *inode, struct file *file)
{
+ bool allowed = false;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (!task)
+ return -ESRCH;
+
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+ put_task_struct(task);
+
+ if (!allowed)
+ return -EACCES;
+
return single_open(file, seq_show, inode);
}
@@ -308,7 +322,7 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO);
if (!inode)
return ERR_PTR(-ENOENT);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 4d2e64e9016c..982e694aae77 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -313,6 +313,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
{
char *buf = file->private_data;
size_t phdrs_offset, notes_offset, data_offset;
+ size_t page_offline_frozen = 1;
size_t phdrs_len, notes_len;
struct kcore_list *m;
size_t tsz;
@@ -322,6 +323,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
int ret = 0;
down_read(&kclist_lock);
+ /*
+ * Don't race against drivers that set PageOffline() and expect no
+ * further page access.
+ */
+ page_offline_freeze();
get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
phdrs_offset = sizeof(struct elfhdr);
@@ -380,11 +386,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R | PF_W | PF_X;
phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
- if (m->type == KCORE_REMAP)
- phdr->p_vaddr = (size_t)m->vaddr;
- else
- phdr->p_vaddr = (size_t)m->addr;
- if (m->type == KCORE_RAM || m->type == KCORE_REMAP)
+ phdr->p_vaddr = (size_t)m->addr;
+ if (m->type == KCORE_RAM)
phdr->p_paddr = __pa(m->addr);
else if (m->type == KCORE_TEXT)
phdr->p_paddr = __pa_symbol(m->addr);
@@ -468,6 +471,9 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
m = NULL;
while (buflen) {
+ struct page *page;
+ unsigned long pfn;
+
/*
* If this is the first iteration or the address is not within
* the previous entry, search for a matching entry.
@@ -480,31 +486,57 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
}
+ if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) {
+ page_offline_thaw();
+ cond_resched();
+ page_offline_freeze();
+ }
+
if (&m->list == &kclist_head) {
if (clear_user(buffer, tsz)) {
ret = -EFAULT;
goto out;
}
m = NULL; /* skip the list anchor */
- } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else if (m->type == KCORE_VMALLOC) {
+ goto skip;
+ }
+
+ switch (m->type) {
+ case KCORE_VMALLOC:
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
if (copy_to_user(buffer, buf, tsz)) {
ret = -EFAULT;
goto out;
}
- } else if (m->type == KCORE_USER) {
+ break;
+ case KCORE_USER:
/* User page is handled prior to normal kernel page: */
if (copy_to_user(buffer, (char *)start, tsz)) {
ret = -EFAULT;
goto out;
}
- } else {
+ break;
+ case KCORE_RAM:
+ pfn = __pa(start) >> PAGE_SHIFT;
+ page = pfn_to_online_page(pfn);
+
+ /*
+ * Don't read offline sections, logically offline pages
+ * (e.g., inflated in a balloon), hwpoisoned pages,
+ * and explicitly excluded physical ranges.
+ */
+ if (!page || PageOffline(page) ||
+ is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ }
+ fallthrough;
+ case KCORE_VMEMMAP:
+ case KCORE_TEXT:
if (kern_addr_valid(start)) {
/*
* Using bounce buffer to bypass the
@@ -528,7 +560,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
goto out;
}
}
+ break;
+ default:
+ pr_warn_once("Unhandled KCORE type: %d\n", m->type);
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
+skip:
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
@@ -537,6 +577,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
out:
+ page_offline_thaw();
up_read(&kclist_lock);
if (ret)
return ret;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 66965ad88d8b..eb97468dfe4c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -514,10 +514,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_migration_entry(swpent))
- page = migration_entry_to_page(swpent);
- else if (is_device_private_entry(swpent))
- page = device_private_entry_to_page(swpent);
+ } else if (is_pfn_swap_entry(swpent))
+ page = pfn_swap_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
page = xa_load(&vma->vm_file->f_mapping->i_pages,
@@ -549,7 +547,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
swp_entry_t entry = pmd_to_swp_entry(*pmd);
if (is_migration_entry(entry))
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
}
if (IS_ERR_OR_NULL(page))
return;
@@ -694,10 +692,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (is_migration_entry(swpent))
- page = migration_entry_to_page(swpent);
- else if (is_device_private_entry(swpent))
- page = device_private_entry_to_page(swpent);
+ if (is_pfn_swap_entry(swpent))
+ page = pfn_swap_entry_to_page(swpent);
}
if (page) {
int mapcount = page_mapcount(page);
@@ -832,7 +828,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %d\n",
- transparent_hugepage_enabled(vma));
+ transparent_hugepage_active(vma));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -1302,6 +1298,7 @@ struct pagemapread {
#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
#define PM_SOFT_DIRTY BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_UFFD_WP BIT_ULL(57)
#define PM_FILE BIT_ULL(61)
#define PM_SWAP BIT_ULL(62)
#define PM_PRESENT BIT_ULL(63)
@@ -1375,20 +1372,21 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
+ if (pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
+ if (pte_swp_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
entry = pte_to_swp_entry(pte);
if (pm->show_pfn)
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
- if (is_migration_entry(entry))
- page = migration_entry_to_page(entry);
-
- if (is_device_private_entry(entry))
- page = device_private_entry_to_page(entry);
+ if (is_pfn_swap_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
}
if (page && !PageAnon(page))
@@ -1426,6 +1424,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
flags |= PM_PRESENT;
if (pmd_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
+ if (pmd_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
if (pm->show_pfn)
frame = pmd_pfn(pmd) +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1444,8 +1444,10 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
+ if (pmd_swp_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
}
#endif
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5059248f2d64..b117b212ef28 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -356,6 +356,31 @@ int seq_release(struct inode *inode, struct file *file)
EXPORT_SYMBOL(seq_release);
/**
+ * seq_escape_mem - print data into buffer, escaping some characters
+ * @m: target buffer
+ * @src: source buffer
+ * @len: size of source buffer
+ * @flags: flags to pass to string_escape_mem()
+ * @esc: set of characters that need escaping
+ *
+ * Puts data into buffer, replacing each occurrence of character from
+ * given class (defined by @flags and @esc) with printable escaped sequence.
+ *
+ * Use seq_has_overflowed() to check for errors.
+ */
+void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
+ unsigned int flags, const char *esc)
+{
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int ret;
+
+ ret = string_escape_mem(src, len, buf, size, flags, esc);
+ seq_commit(m, ret < size ? ret : -1);
+}
+EXPORT_SYMBOL(seq_escape_mem);
+
+/**
* seq_escape - print string into buffer, escaping some characters
* @m: target buffer
* @s: string
@@ -367,26 +392,10 @@ EXPORT_SYMBOL(seq_release);
*/
void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
- char *buf;
- size_t size = seq_get_buf(m, &buf);
- int ret;
-
- ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc);
- seq_commit(m, ret < size ? ret : -1);
+ seq_escape_str(m, s, ESCAPE_OCTAL, esc);
}
EXPORT_SYMBOL(seq_escape);
-void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz)
-{
- char *buf;
- size_t size = seq_get_buf(m, &buf);
- int ret;
-
- ret = string_escape_mem_ascii(src, isz, buf, size);
- seq_commit(m, ret < size ? ret : -1);
-}
-EXPORT_SYMBOL(seq_escape_mem_ascii);
-
void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
int len;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index dd7a6c62b56f..f6e0f0c0d0e5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
}
if (vm_flags & VM_UFFD_MINOR) {
- /* FIXME: Add minor fault interception for shmem. */
- if (!is_vm_hugetlb_page(vma))
+ if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
return false;
}
@@ -1304,8 +1303,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags = 0;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
vm_flags |= VM_UFFD_MISSING;
- if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+ goto out;
+#endif
vm_flags |= VM_UFFD_WP;
+ }
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
goto out;
@@ -1941,7 +1944,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
- uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
+ uffdio_api.features &=
+ ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
+#endif
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+ uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index bafc51f483c4..edb0e2a602a8 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -18,7 +18,8 @@
#endif
#ifndef __ASSEMBLY__
-#include <linux/kernel.h>
+#include <linux/panic.h>
+#include <linux/printk.h>
#ifdef CONFIG_BUG
diff --git a/include/linux/ascii85.h b/include/linux/ascii85.h
index 4cc40201273e..83ad775ad0aa 100644
--- a/include/linux/ascii85.h
+++ b/include/linux/ascii85.h
@@ -8,7 +8,8 @@
#ifndef _ASCII85_H_
#define _ASCII85_H_
-#include <linux/kernel.h>
+#include <linux/math.h>
+#include <linux/types.h>
#define ASCII85_BUFSZ 6
diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
new file mode 100644
index 000000000000..2bc8b1f69c93
--- /dev/null
+++ b/include/linux/bootmem_info.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_BOOTMEM_INFO_H
+#define __LINUX_BOOTMEM_INFO_H
+
+#include <linux/mm.h>
+
+/*
+ * Types for free bootmem stored in page->lru.next. These have to be in
+ * some random range in unsigned long space for debugging purposes.
+ */
+enum {
+ MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
+ SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
+ MIX_SECTION_INFO,
+ NODE_INFO,
+ MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
+};
+
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
+
+void get_page_bootmem(unsigned long info, struct page *page,
+ unsigned long type);
+void put_page_bootmem(struct page *page);
+
+/*
+ * Any memory allocated via the memblock allocator and not via the
+ * buddy will be marked reserved already in the memmap. For those
+ * pages, we can call this function to free it to buddy allocator.
+ */
+static inline void free_bootmem_page(struct page *page)
+{
+ unsigned long magic = (unsigned long)page->freelist;
+
+ /*
+ * The reserve_bootmem_region sets the reserved flag on bootmem
+ * pages.
+ */
+ VM_BUG_ON_PAGE(page_ref_count(page) != 2, page);
+
+ if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
+ put_page_bootmem(page);
+ else
+ VM_BUG_ON_PAGE(1, page);
+}
+#else
+static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+}
+
+static inline void put_page_bootmem(struct page *page)
+{
+}
+
+static inline void get_page_bootmem(unsigned long info, struct page *page,
+ unsigned long type)
+{
+}
+
+static inline void free_bootmem_page(struct page *page)
+{
+ free_reserved_page(page);
+}
+#endif
+
+#endif /* __LINUX_BOOTMEM_INFO_H */
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 8855b1b702b2..c270124e4402 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -532,8 +532,6 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long);
&__uss->ss_sp, label); \
unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
- if (t->sas_ss_flags & SS_AUTODISARM) \
- sas_ss_reset(t); \
} while (0);
/*
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index adbe76b203e2..49b0ac8b6fd3 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -13,6 +13,12 @@
/* all clang versions usable with the kernel support KASAN ABI version 5 */
#define KASAN_ABI_VERSION 5
+/*
+ * Note: Checking __has_feature(*_sanitizer) is only true if the feature is
+ * enabled. Therefore it is not required to additionally check defined(CONFIG_*)
+ * to avoid adding redundant attributes in other configurations.
+ */
+
#if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer)
/* Emulate GCC's __SANITIZE_ADDRESS__ flag */
#define __SANITIZE_ADDRESS__
@@ -46,6 +52,17 @@
#endif
/*
+ * Support for __has_feature(coverage_sanitizer) was added in Clang 13 together
+ * with no_sanitize("coverage"). Prior versions of Clang support coverage
+ * instrumentation, but cannot be queried for support by the preprocessor.
+ */
+#if __has_feature(coverage_sanitizer)
+#define __no_sanitize_coverage __attribute__((no_sanitize("coverage")))
+#else
+#define __no_sanitize_coverage
+#endif
+
+/*
* Not all versions of clang implement the type-generic versions
* of the builtin overflow checkers. Fortunately, clang implements
* __has_builtin allowing us to avoid awkward version
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 5d97ef738a57..cb9217fc60af 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -122,6 +122,12 @@
#define __no_sanitize_undefined
#endif
+#if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__)
+#define __no_sanitize_coverage __attribute__((no_sanitize_coverage))
+#else
+#define __no_sanitize_coverage
+#endif
+
#if GCC_VERSION >= 50100
#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
#endif
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index d509169860f1..e4ea86fc584d 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -210,7 +210,7 @@ struct ftrace_likely_data {
/* Section for code which can't be instrumented at all */
#define noinstr \
noinline notrace __attribute((__section__(".noinstr.text"))) \
- __no_kcsan __no_sanitize_address __no_profile
+ __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage
#endif /* __KERNEL__ */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2a8ebe6c222e..f123e15d966e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -10,8 +10,8 @@
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
- struct vm_area_struct *vma);
-void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
+void huge_pmd_set_accessed(struct vm_fault *vmf);
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
struct vm_area_struct *vma);
@@ -24,7 +24,7 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
}
#endif
-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd,
unsigned int flags);
@@ -115,9 +115,34 @@ extern struct kobj_attribute shmem_enabled_attr;
extern unsigned long transparent_hugepage_flags;
+static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
+ unsigned long haddr)
+{
+ /* Don't have to check pgoff for anonymous vma */
+ if (!vma_is_anonymous(vma)) {
+ if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ HPAGE_PMD_NR))
+ return false;
+ }
+
+ if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ return false;
+ return true;
+}
+
+static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ /* Explicitly disabled through madvise. */
+ if ((vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ return false;
+ return true;
+}
+
/*
* to be used on vmas which are known to support THP.
- * Use transparent_hugepage_enabled otherwise
+ * Use transparent_hugepage_active otherwise
*/
static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
{
@@ -128,15 +153,12 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
return false;
- if (vma->vm_flags & VM_NOHUGEPAGE)
+ if (!transhuge_vma_enabled(vma, vma->vm_flags))
return false;
if (vma_is_temporary_stack(vma))
return false;
- if (test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return false;
-
if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
return true;
@@ -150,24 +172,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
return false;
}
-bool transparent_hugepage_enabled(struct vm_area_struct *vma);
-
-#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
-
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
- unsigned long haddr)
-{
- /* Don't have to check pgoff for anonymous vma */
- if (!vma_is_anonymous(vma)) {
- if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
- (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
- return false;
- }
-
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
- return false;
- return true;
-}
+bool transparent_hugepage_active(struct vm_area_struct *vma);
#define transparent_hugepage_use_zero_page() \
(transparent_hugepage_flags & \
@@ -283,7 +288,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, int flags, struct dev_pagemap **pgmap);
-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
extern struct page *huge_zero_page;
extern unsigned long huge_zero_pfn;
@@ -354,7 +359,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
return false;
}
-static inline bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+static inline bool transparent_hugepage_active(struct vm_area_struct *vma)
{
return false;
}
@@ -365,6 +370,12 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
return false;
}
+static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ return false;
+}
+
static inline void prep_transhuge_page(struct page *page) {}
static inline bool is_transparent_hugepage(struct page *page)
@@ -430,8 +441,7 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
return NULL;
}
-static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf,
- pmd_t orig_pmd)
+static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
return 0;
}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8ba79dc64ab8..8e0f32f935bd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -29,6 +29,23 @@ typedef struct { unsigned long pd; } hugepd_t;
#include <linux/shm.h>
#include <asm/tlbflush.h>
+/*
+ * For HugeTLB page, there are more metadata to save in the struct page. But
+ * the head struct page cannot meet our needs, so we have to abuse other tail
+ * struct page to store the metadata. In order to avoid conflicts caused by
+ * subsequent use of more tail struct pages, we gather these discrete indexes
+ * of tail struct page here.
+ */
+enum {
+ SUBPAGE_INDEX_SUBPOOL = 1, /* reuse page->private */
+#ifdef CONFIG_CGROUP_HUGETLB
+ SUBPAGE_INDEX_CGROUP, /* reuse page->private */
+ SUBPAGE_INDEX_CGROUP_RSVD, /* reuse page->private */
+ __MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD,
+#endif
+ __NR_USED_SUBPAGE,
+};
+
struct hugepage_subpool {
spinlock_t lock;
long count;
@@ -515,12 +532,14 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
* modifications require hugetlb_lock.
* HPG_freed - Set when page is on the free lists.
* Synchronization: hugetlb_lock held for examination and modification.
+ * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
*/
enum hugetlb_page_flags {
HPG_restore_reserve = 0,
HPG_migratable,
HPG_temporary,
HPG_freed,
+ HPG_vmemmap_optimized,
__NR_HPAGEFLAGS,
};
@@ -566,6 +585,7 @@ HPAGEFLAG(RestoreReserve, restore_reserve)
HPAGEFLAG(Migratable, migratable)
HPAGEFLAG(Temporary, temporary)
HPAGEFLAG(Freed, freed)
+HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
#ifdef CONFIG_HUGETLB_PAGE
@@ -588,6 +608,9 @@ struct hstate {
unsigned int nr_huge_pages_node[MAX_NUMNODES];
unsigned int free_huge_pages_node[MAX_NUMNODES];
unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+ unsigned int nr_free_vmemmap_pages;
+#endif
#ifdef CONFIG_CGROUP_HUGETLB
/* cgroup control files */
struct cftype cgroup_files_dfl[7];
@@ -635,13 +658,13 @@ extern unsigned int default_hstate_idx;
*/
static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
{
- return (struct hugepage_subpool *)(hpage+1)->private;
+ return (void *)page_private(hpage + SUBPAGE_INDEX_SUBPOOL);
}
static inline void hugetlb_set_page_subpool(struct page *hpage,
struct hugepage_subpool *subpool)
{
- set_page_private(hpage+1, (unsigned long)subpool);
+ set_page_private(hpage + SUBPAGE_INDEX_SUBPOOL, (unsigned long)subpool);
}
static inline struct hstate *hstate_file(struct file *f)
@@ -718,8 +741,8 @@ static inline void arch_clear_hugepage_flags(struct page *page) { }
#endif
#ifndef arch_make_huge_pte
-static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
- struct page *page, int writable)
+static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
+ vm_flags_t flags)
{
return entry;
}
@@ -875,6 +898,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
+static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
+{
+ return NULL;
+}
+
static inline int isolate_or_dissolve_huge_page(struct page *page,
struct list_head *list)
{
@@ -1028,6 +1056,12 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
}
#endif /* CONFIG_HUGETLB_PAGE */
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+extern bool hugetlb_free_vmemmap_enabled;
+#else
+#define hugetlb_free_vmemmap_enabled false
+#endif
+
static inline spinlock_t *huge_pte_lock(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 0bff345c4bc6..0b8d1fdda3a1 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -21,15 +21,16 @@ struct hugetlb_cgroup;
struct resv_map;
struct file_region;
+#ifdef CONFIG_CGROUP_HUGETLB
/*
* Minimum page order trackable by hugetlb cgroup.
* At least 4 pages are necessary for all the tracking information.
- * The second tail page (hpage[2]) is the fault usage cgroup.
- * The third tail page (hpage[3]) is the reservation usage cgroup.
+ * The second tail page (hpage[SUBPAGE_INDEX_CGROUP]) is the fault
+ * usage cgroup. The third tail page (hpage[SUBPAGE_INDEX_CGROUP_RSVD])
+ * is the reservation usage cgroup.
*/
-#define HUGETLB_CGROUP_MIN_ORDER 2
+#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__MAX_CGROUP_SUBPAGE_INDEX + 1)
-#ifdef CONFIG_CGROUP_HUGETLB
enum hugetlb_memory_event {
HUGETLB_MAX,
HUGETLB_NR_MEMORY_EVENTS,
@@ -66,9 +67,9 @@ __hugetlb_cgroup_from_page(struct page *page, bool rsvd)
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
return NULL;
if (rsvd)
- return (struct hugetlb_cgroup *)page[3].private;
+ return (void *)page_private(page + SUBPAGE_INDEX_CGROUP_RSVD);
else
- return (struct hugetlb_cgroup *)page[2].private;
+ return (void *)page_private(page + SUBPAGE_INDEX_CGROUP);
}
static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
@@ -90,9 +91,11 @@ static inline int __set_hugetlb_cgroup(struct page *page,
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
return -1;
if (rsvd)
- page[3].private = (unsigned long)h_cg;
+ set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD,
+ (unsigned long)h_cg);
else
- page[2].private = (unsigned long)h_cg;
+ set_page_private(page + SUBPAGE_INDEX_CGROUP,
+ (unsigned long)h_cg);
return 0;
}
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index da676cdbd727..86c0f1d18998 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -11,14 +11,11 @@ enum kcore_type {
KCORE_RAM,
KCORE_VMEMMAP,
KCORE_USER,
- KCORE_OTHER,
- KCORE_REMAP,
};
struct kcore_list {
struct list_head list;
unsigned long addr;
- unsigned long vaddr;
size_t size;
int type;
};
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f2ad8a53f71f..1b2f0a7e00d6 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -10,10 +10,12 @@
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/bitops.h>
+#include <linux/kstrtox.h>
#include <linux/log2.h>
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/typecheck.h>
+#include <linux/panic.h>
#include <linux/printk.h>
#include <linux/build_bug.h>
#include <linux/static_call_types.h>
@@ -84,7 +86,6 @@
#define lower_16_bits(n) ((u16)((n) & 0xffff))
struct completion;
-struct pt_regs;
struct user;
#ifdef CONFIG_PREEMPT_VOLUNTARY
@@ -189,159 +190,9 @@ void __might_fault(const char *file, int line);
static inline void might_fault(void) { }
#endif
-extern struct atomic_notifier_head panic_notifier_list;
-extern long (*panic_blink)(int state);
-__printf(1, 2)
-void panic(const char *fmt, ...) __noreturn __cold;
-void nmi_panic(struct pt_regs *regs, const char *msg);
-extern void oops_enter(void);
-extern void oops_exit(void);
-extern bool oops_may_print(void);
void do_exit(long error_code) __noreturn;
void complete_and_exit(struct completion *, long) __noreturn;
-/* Internal, do not use. */
-int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
-int __must_check _kstrtol(const char *s, unsigned int base, long *res);
-
-int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res);
-int __must_check kstrtoll(const char *s, unsigned int base, long long *res);
-
-/**
- * kstrtoul - convert a string to an unsigned long
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign, but not a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Preferred over simple_strtoul(). Return code must be checked.
-*/
-static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
-{
- /*
- * We want to shortcut function call, but
- * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0.
- */
- if (sizeof(unsigned long) == sizeof(unsigned long long) &&
- __alignof__(unsigned long) == __alignof__(unsigned long long))
- return kstrtoull(s, base, (unsigned long long *)res);
- else
- return _kstrtoul(s, base, res);
-}
-
-/**
- * kstrtol - convert a string to a long
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign or a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Preferred over simple_strtol(). Return code must be checked.
- */
-static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
-{
- /*
- * We want to shortcut function call, but
- * __builtin_types_compatible_p(long, long long) = 0.
- */
- if (sizeof(long) == sizeof(long long) &&
- __alignof__(long) == __alignof__(long long))
- return kstrtoll(s, base, (long long *)res);
- else
- return _kstrtol(s, base, res);
-}
-
-int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res);
-int __must_check kstrtoint(const char *s, unsigned int base, int *res);
-
-static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res)
-{
- return kstrtoull(s, base, res);
-}
-
-static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res)
-{
- return kstrtoll(s, base, res);
-}
-
-static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res)
-{
- return kstrtouint(s, base, res);
-}
-
-static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res)
-{
- return kstrtoint(s, base, res);
-}
-
-int __must_check kstrtou16(const char *s, unsigned int base, u16 *res);
-int __must_check kstrtos16(const char *s, unsigned int base, s16 *res);
-int __must_check kstrtou8(const char *s, unsigned int base, u8 *res);
-int __must_check kstrtos8(const char *s, unsigned int base, s8 *res);
-int __must_check kstrtobool(const char *s, bool *res);
-
-int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
-int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
-int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res);
-int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res);
-int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res);
-int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res);
-int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res);
-int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
-int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
-int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
-int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res);
-
-static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
-{
- return kstrtoull_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res)
-{
- return kstrtoll_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res)
-{
- return kstrtouint_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res)
-{
- return kstrtoint_from_user(s, count, base, res);
-}
-
-/*
- * Use kstrto<foo> instead.
- *
- * NOTE: simple_strto<foo> does not check for the range overflow and,
- * depending on the input, may give interesting results.
- *
- * Use these functions if and only if you cannot use kstrto<foo>, because
- * the conversion ends on the first non-digit character, which may be far
- * beyond the supported range. It might be useful to parse the strings like
- * 10x50 or 12:21 without altering original string or temporary buffer in use.
- * Keep in mind above caveat.
- */
-
-extern unsigned long simple_strtoul(const char *,char **,unsigned int);
-extern long simple_strtol(const char *,char **,unsigned int);
-extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
-extern long long simple_strtoll(const char *,char **,unsigned int);
-
extern int num_to_str(char *buf, int size,
unsigned long long num, unsigned int width);
@@ -384,52 +235,8 @@ extern int __kernel_text_address(unsigned long addr);
extern int kernel_text_address(unsigned long addr);
extern int func_ptr_is_kernel_text(void *ptr);
-#ifdef CONFIG_SMP
-extern unsigned int sysctl_oops_all_cpu_backtrace;
-#else
-#define sysctl_oops_all_cpu_backtrace 0
-#endif /* CONFIG_SMP */
-
extern void bust_spinlocks(int yes);
-extern int panic_timeout;
-extern unsigned long panic_print;
-extern int panic_on_oops;
-extern int panic_on_unrecovered_nmi;
-extern int panic_on_io_nmi;
-extern int panic_on_warn;
-extern unsigned long panic_on_taint;
-extern bool panic_on_taint_nousertaint;
-extern int sysctl_panic_on_rcu_stall;
-extern int sysctl_max_rcu_stall_to_panic;
-extern int sysctl_panic_on_stackoverflow;
-
-extern bool crash_kexec_post_notifiers;
-/*
- * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
- * holds a CPU number which is executing panic() currently. A value of
- * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
- */
-extern atomic_t panic_cpu;
-#define PANIC_CPU_INVALID -1
-
-/*
- * Only to be used by arch init code. If the user over-wrote the default
- * CONFIG_PANIC_TIMEOUT, honor it.
- */
-static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
-{
- if (panic_timeout == arch_default_timeout)
- panic_timeout = timeout;
-}
-extern const char *print_tainted(void);
-enum lockdep_ok {
- LOCKDEP_STILL_OK,
- LOCKDEP_NOW_UNRELIABLE
-};
-extern void add_taint(unsigned flag, enum lockdep_ok);
-extern int test_taint(unsigned flag);
-extern unsigned long get_taint(void);
extern int root_mountflags;
extern bool early_boot_irqs_disabled;
@@ -448,36 +255,6 @@ extern enum system_states {
SYSTEM_SUSPEND,
} system_state;
-/* This cannot be an enum because some may be used in assembly source. */
-#define TAINT_PROPRIETARY_MODULE 0
-#define TAINT_FORCED_MODULE 1
-#define TAINT_CPU_OUT_OF_SPEC 2
-#define TAINT_FORCED_RMMOD 3
-#define TAINT_MACHINE_CHECK 4
-#define TAINT_BAD_PAGE 5
-#define TAINT_USER 6
-#define TAINT_DIE 7
-#define TAINT_OVERRIDDEN_ACPI_TABLE 8
-#define TAINT_WARN 9
-#define TAINT_CRAP 10
-#define TAINT_FIRMWARE_WORKAROUND 11
-#define TAINT_OOT_MODULE 12
-#define TAINT_UNSIGNED_MODULE 13
-#define TAINT_SOFTLOCKUP 14
-#define TAINT_LIVEPATCH 15
-#define TAINT_AUX 16
-#define TAINT_RANDSTRUCT 17
-#define TAINT_FLAGS_COUNT 18
-#define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1)
-
-struct taint_flag {
- char c_true; /* character printed when tainted */
- char c_false; /* character printed when not tainted */
- bool module; /* also show as a per-module taint flag */
-};
-
-extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT];
-
extern const char hex_asc[];
#define hex_asc_lo(x) hex_asc[((x) & 0x0f)]
#define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4]
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 523ffc7bc3a8..4d0c28c2ba12 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -399,7 +399,6 @@ int enable_kprobe(struct kprobe *kp);
void dump_kprobe(struct kprobe *kp);
void *alloc_insn_page(void);
-void free_insn_page(void *page);
int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
char *sym);
diff --git a/include/linux/kstrtox.h b/include/linux/kstrtox.h
new file mode 100644
index 000000000000..529974e22ea7
--- /dev/null
+++ b/include/linux/kstrtox.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KSTRTOX_H
+#define _LINUX_KSTRTOX_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/* Internal, do not use. */
+int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
+int __must_check _kstrtol(const char *s, unsigned int base, long *res);
+
+int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res);
+int __must_check kstrtoll(const char *s, unsigned int base, long long *res);
+
+/**
+ * kstrtoul - convert a string to an unsigned long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign, but not a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Preferred over simple_strtoul(). Return code must be checked.
+*/
+static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
+{
+ /*
+ * We want to shortcut function call, but
+ * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0.
+ */
+ if (sizeof(unsigned long) == sizeof(unsigned long long) &&
+ __alignof__(unsigned long) == __alignof__(unsigned long long))
+ return kstrtoull(s, base, (unsigned long long *)res);
+ else
+ return _kstrtoul(s, base, res);
+}
+
+/**
+ * kstrtol - convert a string to a long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign or a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Preferred over simple_strtol(). Return code must be checked.
+ */
+static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
+{
+ /*
+ * We want to shortcut function call, but
+ * __builtin_types_compatible_p(long, long long) = 0.
+ */
+ if (sizeof(long) == sizeof(long long) &&
+ __alignof__(long) == __alignof__(long long))
+ return kstrtoll(s, base, (long long *)res);
+ else
+ return _kstrtol(s, base, res);
+}
+
+int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res);
+int __must_check kstrtoint(const char *s, unsigned int base, int *res);
+
+static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res)
+{
+ return kstrtoull(s, base, res);
+}
+
+static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res)
+{
+ return kstrtoll(s, base, res);
+}
+
+static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res)
+{
+ return kstrtouint(s, base, res);
+}
+
+static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res)
+{
+ return kstrtoint(s, base, res);
+}
+
+int __must_check kstrtou16(const char *s, unsigned int base, u16 *res);
+int __must_check kstrtos16(const char *s, unsigned int base, s16 *res);
+int __must_check kstrtou8(const char *s, unsigned int base, u8 *res);
+int __must_check kstrtos8(const char *s, unsigned int base, s8 *res);
+int __must_check kstrtobool(const char *s, bool *res);
+
+int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
+int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
+int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res);
+int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res);
+int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res);
+int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res);
+int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res);
+int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
+int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
+int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
+int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res);
+
+static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
+{
+ return kstrtoull_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res)
+{
+ return kstrtoll_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res)
+{
+ return kstrtouint_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res)
+{
+ return kstrtoint_from_user(s, count, base, res);
+}
+
+/*
+ * Use kstrto<foo> instead.
+ *
+ * NOTE: simple_strto<foo> does not check for the range overflow and,
+ * depending on the input, may give interesting results.
+ *
+ * Use these functions if and only if you cannot use kstrto<foo>, because
+ * the conversion ends on the first non-digit character, which may be far
+ * beyond the supported range. It might be useful to parse the strings like
+ * 10x50 or 12:21 without altering original string or temporary buffer in use.
+ * Keep in mind above caveat.
+ */
+
+extern unsigned long simple_strtoul(const char *,char **,unsigned int);
+extern long simple_strtol(const char *,char **,unsigned int);
+extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
+extern long long simple_strtoll(const char *,char **,unsigned int);
+
+static inline int strtobool(const char *s, bool *res)
+{
+ return kstrtobool(s, res);
+}
+
+#endif /* _LINUX_KSTRTOX_H */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 552309342c38..cbf46f56d105 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -30,7 +30,9 @@ extern unsigned long long max_possible_pfn;
* @MEMBLOCK_NONE: no special request
* @MEMBLOCK_HOTPLUG: hotpluggable region
* @MEMBLOCK_MIRROR: mirrored region
- * @MEMBLOCK_NOMAP: don't add to kernel direct mapping
+ * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as
+ * reserved in the memory map; refer to memblock_mark_nomap() description
+ * for further details
*/
enum memblock_flags {
MEMBLOCK_NONE = 0x0, /* No special request */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 28f32fd00fe9..a7fd2c3ccb77 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -18,18 +18,6 @@ struct vmem_altmap;
#ifdef CONFIG_MEMORY_HOTPLUG
struct page *pfn_to_online_page(unsigned long pfn);
-/*
- * Types for free bootmem stored in page->lru.next. These have to be in
- * some random range in unsigned long space for debugging purposes.
- */
-enum {
- MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
- SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
- MIX_SECTION_INFO,
- NODE_INFO,
- MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
-};
-
/* Types for control the zone type of onlined and offlined memory */
enum {
/* Offline the memory. */
@@ -222,17 +210,6 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
#endif /* CONFIG_NUMA */
#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
-#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
-#else
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-#endif
-extern void put_page_bootmem(struct page *page);
-extern void get_page_bootmem(unsigned long ingo, struct page *page,
- unsigned long type);
-
void get_online_mems(void);
void put_online_mems(void);
@@ -260,10 +237,6 @@ static inline void zone_span_writelock(struct zone *zone) {}
static inline void zone_span_writeunlock(struct zone *zone) {}
static inline void zone_seqlock_init(struct zone *zone) {}
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-
static inline int try_online_node(int nid)
{
return 0;
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 5f1c74df264d..0aaf91b496e2 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -46,11 +46,8 @@ struct mempolicy {
atomic_t refcnt;
unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
- union {
- short preferred_node; /* preferred */
- nodemask_t nodes; /* interleave/bind */
- /* undefined for default */
- } v;
+ nodemask_t nodes; /* interleave/bind/perfer */
+
union {
nodemask_t cpuset_mems_allowed; /* relative to these nodes */
nodemask_t user_nodemask; /* nodemask passed by user */
@@ -150,7 +147,7 @@ extern int huge_node(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask);
extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
-extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask);
extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 45a79da89c5f..c0e9d35889e8 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -26,7 +26,7 @@ struct vmem_altmap {
};
/*
- * Specialize ZONE_DEVICE memory into multiple types each having differents
+ * Specialize ZONE_DEVICE memory into multiple types each has a different
* usage.
*
* MEMORY_DEVICE_PRIVATE:
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4bb4e519e3f5..9b7b7cd3bae9 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -51,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page);
extern int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page, int extra_count);
+extern void copy_huge_page(struct page *dst, struct page *src);
#else
static inline void putback_movable_pages(struct list_head *l) {}
@@ -77,6 +78,9 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
return -ENOSYS;
}
+static inline void copy_huge_page(struct page *dst, struct page *src)
+{
+}
#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_COMPACTION
@@ -95,14 +99,9 @@ static inline void __ClearPageMovable(struct page *page)
#endif
#ifdef CONFIG_NUMA_BALANCING
-extern bool pmd_trans_migrating(pmd_t pmd);
extern int migrate_misplaced_page(struct page *page,
struct vm_area_struct *vma, int node);
#else
-static inline bool pmd_trans_migrating(pmd_t pmd)
-{
- return false;
-}
static inline int migrate_misplaced_page(struct page *page,
struct vm_area_struct *vma, int node)
{
@@ -110,24 +109,6 @@ static inline int migrate_misplaced_page(struct page *page,
}
#endif /* CONFIG_NUMA_BALANCING */
-#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pmd_t *pmd, pmd_t entry,
- unsigned long address,
- struct page *page, int node);
-#else
-static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pmd_t *pmd, pmd_t entry,
- unsigned long address,
- struct page *page, int node)
-{
- return -EAGAIN;
-}
-#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
-
-
#ifdef CONFIG_MIGRATION
/*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b8bc39237dac..788a0b1323d0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -145,7 +145,7 @@ extern int mmap_rnd_compat_bits __read_mostly;
/* This function must be updated when the size of struct page grows above 80
* or reduces below 56. The idea that compiler optimizes out switch()
* statement, and only leaves move/store instructions. Also the compiler can
- * combine write statments if they are both assignments and can be reordered,
+ * combine write statements if they are both assignments and can be reordered,
* this can result in several of the writes here being dropped.
*/
#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
@@ -540,7 +540,12 @@ struct vm_fault {
pud_t *pud; /* Pointer to pud entry matching
* the 'address'
*/
- pte_t orig_pte; /* Value of PTE at the time of fault */
+ union {
+ pte_t orig_pte; /* Value of PTE at the time of fault */
+ pmd_t orig_pmd; /* Value of PMD at the time of fault,
+ * used by PMD fault only.
+ */
+ };
struct page *cow_page; /* Page handler may use for COW fault */
struct page *page; /* ->fault handlers should return a
@@ -3067,6 +3072,11 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
}
#endif
+int vmemmap_remap_free(unsigned long start, unsigned long end,
+ unsigned long reuse);
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+ unsigned long reuse, gfp_t gfp_mask);
+
void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d33d97c69da9..52bbd2b7cb46 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -404,7 +404,7 @@ struct mm_struct {
unsigned long mmap_base; /* base of mmap area */
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
- /* Base adresses for compatible mmap() */
+ /* Base addresses for compatible mmap() */
unsigned long mmap_compat_base;
unsigned long mmap_compat_legacy_base;
#endif
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 1a6a9eb6d3fa..6692da8d121d 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -41,7 +41,12 @@ struct mmu_interval_notifier;
*
* @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
* a device driver to possibly ignore the invalidation if the
- * migrate_pgmap_owner field matches the driver's device private pgmap owner.
+ * owner field matches the driver's device private pgmap owner.
+ *
+ * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
+ * longer have exclusive access to the page. When sent during creation of an
+ * exclusive range the owner will be initialised to the value provided by the
+ * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
*/
enum mmu_notifier_event {
MMU_NOTIFY_UNMAP = 0,
@@ -51,6 +56,7 @@ enum mmu_notifier_event {
MMU_NOTIFY_SOFT_DIRTY,
MMU_NOTIFY_RELEASE,
MMU_NOTIFY_MIGRATE,
+ MMU_NOTIFY_EXCLUSIVE,
};
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
@@ -269,7 +275,7 @@ struct mmu_notifier_range {
unsigned long end;
unsigned flags;
enum mmu_notifier_event event;
- void *migrate_pgmap_owner;
+ void *owner;
};
static inline int mm_has_notifiers(struct mm_struct *mm)
@@ -521,14 +527,14 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
range->flags = flags;
}
-static inline void mmu_notifier_range_init_migrate(
- struct mmu_notifier_range *range, unsigned int flags,
+static inline void mmu_notifier_range_init_owner(
+ struct mmu_notifier_range *range,
+ enum mmu_notifier_event event, unsigned int flags,
struct vm_area_struct *vma, struct mm_struct *mm,
- unsigned long start, unsigned long end, void *pgmap)
+ unsigned long start, unsigned long end, void *owner)
{
- mmu_notifier_range_init(range, MMU_NOTIFY_MIGRATE, flags, vma, mm,
- start, end);
- range->migrate_pgmap_owner = pgmap;
+ mmu_notifier_range_init(range, event, flags, vma, mm, start, end);
+ range->owner = owner;
}
#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
@@ -655,8 +661,8 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \
_mmu_notifier_range_init(range, start, end)
-#define mmu_notifier_range_init_migrate(range, flags, vma, mm, start, end, \
- pgmap) \
+#define mmu_notifier_range_init_owner(range, event, flags, vma, mm, start, \
+ end, owner) \
_mmu_notifier_range_init(range, start, end)
static inline bool
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 265a32e1ff74..fcb535560028 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -114,7 +114,7 @@ static inline bool free_area_empty(struct free_area *area, int migratetype)
struct pglist_data;
/*
- * Add a wild amount of padding here to ensure datas fall into separate
+ * Add a wild amount of padding here to ensure data fall into separate
* cachelines. There are very few zone structures in the machine, so space
* consumption is not a concern here.
*/
@@ -1064,7 +1064,10 @@ extern char numa_zonelist_order[];
#ifndef CONFIG_NUMA
extern struct pglist_data contig_page_data;
-#define NODE_DATA(nid) (&contig_page_data)
+static inline struct pglist_data *NODE_DATA(int nid)
+{
+ return &contig_page_data;
+}
#define NODE_MEM_MAP(nid) mem_map
#else /* CONFIG_NUMA */
@@ -1445,10 +1448,30 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
#endif
#ifndef CONFIG_HAVE_ARCH_PFN_VALID
+/**
+ * pfn_valid - check if there is a valid memory map entry for a PFN
+ * @pfn: the page frame number to check
+ *
+ * Check if there is a valid memory map entry aka struct page for the @pfn.
+ * Note, that availability of the memory map entry does not imply that
+ * there is actual usable memory at that @pfn. The struct page may
+ * represent a hole or an unusable page frame.
+ *
+ * Return: 1 for PFNs that have memory map entries and 0 otherwise
+ */
static inline int pfn_valid(unsigned long pfn)
{
struct mem_section *ms;
+ /*
+ * Ensure the upper PAGE_SHIFT bits are clear in the
+ * pfn. Else it might lead to false positives when
+ * some of the upper bits are set, but the lower bits
+ * match a valid pfn.
+ */
+ if (PHYS_PFN(PFN_PHYS(pfn)) != pfn)
+ return 0;
+
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
return 0;
ms = __nr_to_section(pfn_to_section_nr(pfn));
diff --git a/include/linux/mpi.h b/include/linux/mpi.h
index 3e5358f4de2f..eb0d1c1db208 100644
--- a/include/linux/mpi.h
+++ b/include/linux/mpi.h
@@ -200,7 +200,7 @@ struct mpi_ec_ctx {
unsigned int nbits; /* Number of bits. */
/* Domain parameters. Note that they may not all be set and if set
- * the MPIs may be flaged as constant.
+ * the MPIs may be flagged as constant.
*/
MPI p; /* Prime specifying the field GF(p). */
MPI a; /* First coefficient of the Weierstrass equation. */
@@ -267,7 +267,7 @@ int mpi_ec_curve_point(MPI_POINT point, struct mpi_ec_ctx *ctx);
/**
* mpi_get_size() - returns max size required to store the number
*
- * @a: A multi precision integer for which we want to allocate a bufer
+ * @a: A multi precision integer for which we want to allocate a buffer
*
* Return: size required to store the number
*/
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 458696550028..5922031ffab6 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -704,6 +704,18 @@ PAGEFLAG_FALSE(DoubleMap)
#endif
/*
+ * Check if a page is currently marked HWPoisoned. Note that this check is
+ * best effort only and inherently racy: there is no way to synchronize with
+ * failing hardware.
+ */
+static inline bool is_page_hwpoison(struct page *page)
+{
+ if (PageHWPoison(page))
+ return true;
+ return PageHuge(page) && PageHWPoison(compound_head(page));
+}
+
+/*
* For pages that are never mapped to userspace (and aren't PageSlab),
* page_type may be used. Because it is initialised to -1, we invert the
* sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
@@ -766,9 +778,19 @@ PAGE_TYPE_OPS(Buddy, buddy)
* relies on this feature is aware that re-onlining the memory block will
* require to re-set the pages PageOffline() and not giving them to the
* buddy via online_page_callback_t.
+ *
+ * There are drivers that mark a page PageOffline() and expect there won't be
+ * any further access to page content. PFN walkers that read content of random
+ * pages should check PageOffline() and synchronize with such drivers using
+ * page_offline_freeze()/page_offline_thaw().
*/
PAGE_TYPE_OPS(Offline, offline)
+extern void page_offline_freeze(void);
+extern void page_offline_thaw(void);
+extern void page_offline_begin(void);
+extern void page_offline_end(void);
+
/*
* Marks pages in use as page tables.
*/
diff --git a/include/linux/panic.h b/include/linux/panic.h
new file mode 100644
index 000000000000..f5844908a089
--- /dev/null
+++ b/include/linux/panic.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PANIC_H
+#define _LINUX_PANIC_H
+
+#include <linux/compiler_attributes.h>
+#include <linux/types.h>
+
+struct pt_regs;
+
+extern long (*panic_blink)(int state);
+__printf(1, 2)
+void panic(const char *fmt, ...) __noreturn __cold;
+void nmi_panic(struct pt_regs *regs, const char *msg);
+extern void oops_enter(void);
+extern void oops_exit(void);
+extern bool oops_may_print(void);
+
+#ifdef CONFIG_SMP
+extern unsigned int sysctl_oops_all_cpu_backtrace;
+#else
+#define sysctl_oops_all_cpu_backtrace 0
+#endif /* CONFIG_SMP */
+
+extern int panic_timeout;
+extern unsigned long panic_print;
+extern int panic_on_oops;
+extern int panic_on_unrecovered_nmi;
+extern int panic_on_io_nmi;
+extern int panic_on_warn;
+
+extern unsigned long panic_on_taint;
+extern bool panic_on_taint_nousertaint;
+
+extern int sysctl_panic_on_rcu_stall;
+extern int sysctl_max_rcu_stall_to_panic;
+extern int sysctl_panic_on_stackoverflow;
+
+extern bool crash_kexec_post_notifiers;
+
+/*
+ * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
+ * holds a CPU number which is executing panic() currently. A value of
+ * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
+ */
+extern atomic_t panic_cpu;
+#define PANIC_CPU_INVALID -1
+
+/*
+ * Only to be used by arch init code. If the user over-wrote the default
+ * CONFIG_PANIC_TIMEOUT, honor it.
+ */
+static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
+{
+ if (panic_timeout == arch_default_timeout)
+ panic_timeout = timeout;
+}
+
+/* This cannot be an enum because some may be used in assembly source. */
+#define TAINT_PROPRIETARY_MODULE 0
+#define TAINT_FORCED_MODULE 1
+#define TAINT_CPU_OUT_OF_SPEC 2
+#define TAINT_FORCED_RMMOD 3
+#define TAINT_MACHINE_CHECK 4
+#define TAINT_BAD_PAGE 5
+#define TAINT_USER 6
+#define TAINT_DIE 7
+#define TAINT_OVERRIDDEN_ACPI_TABLE 8
+#define TAINT_WARN 9
+#define TAINT_CRAP 10
+#define TAINT_FIRMWARE_WORKAROUND 11
+#define TAINT_OOT_MODULE 12
+#define TAINT_UNSIGNED_MODULE 13
+#define TAINT_SOFTLOCKUP 14
+#define TAINT_LIVEPATCH 15
+#define TAINT_AUX 16
+#define TAINT_RANDSTRUCT 17
+#define TAINT_FLAGS_COUNT 18
+#define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1)
+
+struct taint_flag {
+ char c_true; /* character printed when tainted */
+ char c_false; /* character printed when not tainted */
+ bool module; /* also show as a per-module taint flag */
+};
+
+extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT];
+
+enum lockdep_ok {
+ LOCKDEP_STILL_OK,
+ LOCKDEP_NOW_UNRELIABLE,
+};
+
+extern const char *print_tainted(void);
+extern void add_taint(unsigned flag, enum lockdep_ok);
+extern int test_taint(unsigned flag);
+extern unsigned long get_taint(void);
+
+#endif /* _LINUX_PANIC_H */
diff --git a/include/linux/panic_notifier.h b/include/linux/panic_notifier.h
new file mode 100644
index 000000000000..41e32483d7a7
--- /dev/null
+++ b/include/linux/panic_notifier.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PANIC_NOTIFIERS_H
+#define _LINUX_PANIC_NOTIFIERS_H
+
+#include <linux/notifier.h>
+#include <linux/types.h>
+
+extern struct atomic_notifier_head panic_notifier_list;
+
+extern bool crash_kexec_post_notifiers;
+
+#endif /* _LINUX_PANIC_NOTIFIERS_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c32600c9e1ad..e82660f7b9e4 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -29,6 +29,24 @@
#endif
/*
+ * This defines the first usable user address. Platforms
+ * can override its value with custom FIRST_USER_ADDRESS
+ * defined in their respective <asm/pgtable.h>.
+ */
+#ifndef FIRST_USER_ADDRESS
+#define FIRST_USER_ADDRESS 0UL
+#endif
+
+/*
+ * This defines the generic helper for accessing PMD page
+ * table page. Although platforms can still override this
+ * via their respective <asm/pgtable.h>.
+ */
+#ifndef pmd_pgtable
+#define pmd_pgtable(pmd) pmd_page(pmd)
+#endif
+
+/*
* A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
*
* The pXx_index() functions return the index of the entry in the page
@@ -1379,10 +1397,34 @@ static inline int p4d_clear_huge(p4d_t *p4d)
}
#endif /* !__PAGETABLE_P4D_FOLDED */
+#ifndef __PAGETABLE_PUD_FOLDED
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
-int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
+#else
+static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+ return 0;
+}
+static inline int pud_clear_huge(pud_t *pud)
+{
+ return 0;
+}
+#endif /* !__PAGETABLE_PUD_FOLDED */
+
+#ifndef __PAGETABLE_PMD_FOLDED
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pmd_clear_huge(pmd_t *pmd);
+#else
+static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+ return 0;
+}
+static inline int pmd_clear_huge(pmd_t *pmd)
+{
+ return 0;
+}
+#endif /* !__PAGETABLE_PMD_FOLDED */
+
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 8d04e7deedc6..83fb86133fe1 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -86,9 +86,6 @@ struct anon_vma_chain {
};
enum ttu_flags {
- TTU_MIGRATION = 0x1, /* migration mode */
- TTU_MUNLOCK = 0x2, /* munlock mode */
-
TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */
TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */
TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */
@@ -98,7 +95,6 @@ enum ttu_flags {
* do a final flush if necessary */
TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock:
* caller holds it */
- TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */
};
#ifdef CONFIG_MMU
@@ -195,7 +191,12 @@ static inline void page_dup_rmap(struct page *page, bool compound)
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags);
-bool try_to_unmap(struct page *, enum ttu_flags flags);
+void try_to_migrate(struct page *page, enum ttu_flags flags);
+void try_to_unmap(struct page *, enum ttu_flags flags);
+
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct page **pages,
+ void *arg);
/* Avoid racy checks */
#define PVMW_SYNC (1 << 0)
@@ -240,7 +241,7 @@ int page_mkclean(struct page *);
* called in munlock()/munmap() path to check for other vmas holding
* the page mlocked.
*/
-void try_to_munlock(struct page *);
+void page_mlock(struct page *page);
void remove_migration_ptes(struct page *old, struct page *new, bool locked);
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 723b1fa1177e..dd99569595fd 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -126,8 +126,16 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num
void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
unsigned long long v, unsigned int width);
+void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
+ unsigned int flags, const char *esc);
+
+static inline void seq_escape_str(struct seq_file *m, const char *src,
+ unsigned int flags, const char *esc)
+{
+ seq_escape_mem(m, src, strlen(src), flags, esc);
+}
+
void seq_escape(struct seq_file *m, const char *s, const char *esc);
-void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz);
void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
int rowsize, int groupsize, const void *buf, size_t len,
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index aa77dcd1646f..8e775ce517bb 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -122,21 +122,18 @@ static inline bool shmem_file(struct file *file)
extern bool shmem_charge(struct inode *inode, long pages);
extern void shmem_uncharge(struct inode *inode, long pages);
+#ifdef CONFIG_USERFAULTFD
#ifdef CONFIG_SHMEM
-extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
+ bool zeropage,
struct page **pagep);
-extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr);
-#else
-#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
- src_addr, pagep) ({ BUG(); 0; })
-#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
- dst_addr) ({ BUG(); 0; })
-#endif
+#else /* !CONFIG_SHMEM */
+#define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
+ src_addr, zeropage, pagep) ({ BUG(); 0; })
+#endif /* CONFIG_SHMEM */
+#endif /* CONFIG_USERFAULTFD */
#endif
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 5160fd45e5ca..3454c7ff0778 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -462,8 +462,6 @@ int __save_altstack(stack_t __user *, unsigned long);
unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \
unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
- if (t->sas_ss_flags & SS_AUTODISARM) \
- sas_ss_reset(t); \
} while (0);
#ifdef CONFIG_PROC_FS
diff --git a/include/linux/string.h b/include/linux/string.h
index 9521d8cab18e..b48d2d28e0b1 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -2,7 +2,6 @@
#ifndef _LINUX_STRING_H_
#define _LINUX_STRING_H_
-
#include <linux/compiler.h> /* for inline */
#include <linux/types.h> /* for size_t */
#include <linux/stddef.h> /* for NULL */
@@ -184,12 +183,6 @@ extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
extern void argv_free(char **argv);
extern bool sysfs_streq(const char *s1, const char *s2);
-extern int kstrtobool(const char *s, bool *res);
-static inline int strtobool(const char *s, bool *res)
-{
- return kstrtobool(s, res);
-}
-
int match_string(const char * const *array, size_t n, const char *string);
int __sysfs_match_string(const char * const *array, size_t n, const char *s);
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index fa06dcdc481e..68189c4a2eb1 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_STRING_HELPERS_H_
#define _LINUX_STRING_HELPERS_H_
+#include <linux/bits.h>
#include <linux/ctype.h>
#include <linux/types.h>
@@ -18,13 +19,15 @@ enum string_size_units {
void string_get_size(u64 size, u64 blk_size, enum string_size_units units,
char *buf, int len);
-#define UNESCAPE_SPACE 0x01
-#define UNESCAPE_OCTAL 0x02
-#define UNESCAPE_HEX 0x04
-#define UNESCAPE_SPECIAL 0x08
+#define UNESCAPE_SPACE BIT(0)
+#define UNESCAPE_OCTAL BIT(1)
+#define UNESCAPE_HEX BIT(2)
+#define UNESCAPE_SPECIAL BIT(3)
#define UNESCAPE_ANY \
(UNESCAPE_SPACE | UNESCAPE_OCTAL | UNESCAPE_HEX | UNESCAPE_SPECIAL)
+#define UNESCAPE_ALL_MASK GENMASK(3, 0)
+
int string_unescape(char *src, char *dst, size_t size, unsigned int flags);
static inline int string_unescape_inplace(char *buf, unsigned int flags)
@@ -42,22 +45,24 @@ static inline int string_unescape_any_inplace(char *buf)
return string_unescape_any(buf, buf, 0);
}
-#define ESCAPE_SPACE 0x01
-#define ESCAPE_SPECIAL 0x02
-#define ESCAPE_NULL 0x04
-#define ESCAPE_OCTAL 0x08
+#define ESCAPE_SPACE BIT(0)
+#define ESCAPE_SPECIAL BIT(1)
+#define ESCAPE_NULL BIT(2)
+#define ESCAPE_OCTAL BIT(3)
#define ESCAPE_ANY \
(ESCAPE_SPACE | ESCAPE_OCTAL | ESCAPE_SPECIAL | ESCAPE_NULL)
-#define ESCAPE_NP 0x10
+#define ESCAPE_NP BIT(4)
#define ESCAPE_ANY_NP (ESCAPE_ANY | ESCAPE_NP)
-#define ESCAPE_HEX 0x20
+#define ESCAPE_HEX BIT(5)
+#define ESCAPE_NA BIT(6)
+#define ESCAPE_NAP BIT(7)
+#define ESCAPE_APPEND BIT(8)
+
+#define ESCAPE_ALL_MASK GENMASK(8, 0)
int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
unsigned int flags, const char *only);
-int string_escape_mem_ascii(const char *src, size_t isz, char *dst,
- size_t osz);
-
static inline int string_escape_mem_any_np(const char *src, size_t isz,
char *dst, size_t osz, const char *only)
{
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index d0965e2997b0..b134b2b3371c 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -14,6 +14,7 @@
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/atomic.h>
+#include <linux/kstrtox.h>
#include <linux/proc_fs.h>
/*
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 49b1dd2c100b..6f5a43251593 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -62,12 +62,17 @@ static inline int current_is_kswapd(void)
* migrate part of a process memory to device memory.
*
* When a page is migrated from CPU to device, we set the CPU page table entry
- * to a special SWP_DEVICE_* entry.
+ * to a special SWP_DEVICE_{READ|WRITE} entry.
+ *
+ * When a page is mapped by the device for exclusive access we set the CPU page
+ * table entries to special SWP_DEVICE_EXCLUSIVE_* entries.
*/
#ifdef CONFIG_DEVICE_PRIVATE
-#define SWP_DEVICE_NUM 2
+#define SWP_DEVICE_NUM 4
#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
+#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
+#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3)
#else
#define SWP_DEVICE_NUM 0
#endif
@@ -537,7 +542,11 @@ static inline void put_swap_device(struct swap_info_struct *si)
{
}
-#define swap_address_space(entry) (NULL)
+static inline struct address_space *swap_address_space(swp_entry_t entry)
+{
+ return NULL;
+}
+
#define get_nr_swap_pages() 0L
#define total_swap_pages 0L
#define total_swapcache_pages() 0UL
@@ -560,8 +569,8 @@ static inline void show_swap_cache_info(void)
{
}
-#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
-#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
+/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
+#define free_swap_and_cache(e) is_pfn_swap_entry(e)
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 5907205c712c..d356ab4047f7 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -107,10 +107,14 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
}
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
+static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
- return swp_entry(write ? SWP_DEVICE_WRITE : SWP_DEVICE_READ,
- page_to_pfn(page));
+ return swp_entry(SWP_DEVICE_READ, offset);
+}
+
+static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
+{
+ return swp_entry(SWP_DEVICE_WRITE, offset);
}
static inline bool is_device_private_entry(swp_entry_t entry)
@@ -119,33 +123,40 @@ static inline bool is_device_private_entry(swp_entry_t entry)
return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
}
-static inline void make_device_private_entry_read(swp_entry_t *entry)
+static inline bool is_writable_device_private_entry(swp_entry_t entry)
{
- *entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry));
+ return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
}
-static inline bool is_write_device_private_entry(swp_entry_t entry)
+static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
{
- return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
+ return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset);
}
-static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
+static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
{
- return swp_offset(entry);
+ return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset);
+}
+
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
+{
+ return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
+ swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
}
-static inline struct page *device_private_entry_to_page(swp_entry_t entry)
+static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
{
- return pfn_to_page(swp_offset(entry));
+ return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE);
}
#else /* CONFIG_DEVICE_PRIVATE */
-static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
+static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
return swp_entry(0, 0);
}
-static inline void make_device_private_entry_read(swp_entry_t *entry)
+static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
{
+ return swp_entry(0, 0);
}
static inline bool is_device_private_entry(swp_entry_t entry)
@@ -153,61 +164,52 @@ static inline bool is_device_private_entry(swp_entry_t entry)
return false;
}
-static inline bool is_write_device_private_entry(swp_entry_t entry)
+static inline bool is_writable_device_private_entry(swp_entry_t entry)
{
return false;
}
-static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
+static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
{
- return 0;
+ return swp_entry(0, 0);
}
-static inline struct page *device_private_entry_to_page(swp_entry_t entry)
+static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
{
- return NULL;
+ return swp_entry(0, 0);
}
-#endif /* CONFIG_DEVICE_PRIVATE */
-#ifdef CONFIG_MIGRATION
-static inline swp_entry_t make_migration_entry(struct page *page, int write)
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
{
- BUG_ON(!PageLocked(compound_head(page)));
+ return false;
+}
- return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
- page_to_pfn(page));
+static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
+{
+ return false;
}
+#endif /* CONFIG_DEVICE_PRIVATE */
+#ifdef CONFIG_MIGRATION
static inline int is_migration_entry(swp_entry_t entry)
{
return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
swp_type(entry) == SWP_MIGRATION_WRITE);
}
-static inline int is_write_migration_entry(swp_entry_t entry)
+static inline int is_writable_migration_entry(swp_entry_t entry)
{
return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
}
-static inline unsigned long migration_entry_to_pfn(swp_entry_t entry)
+static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
- return swp_offset(entry);
+ return swp_entry(SWP_MIGRATION_READ, offset);
}
-static inline struct page *migration_entry_to_page(swp_entry_t entry)
+static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
{
- struct page *p = pfn_to_page(swp_offset(entry));
- /*
- * Any use of migration entries may only occur while the
- * corresponding page is locked
- */
- BUG_ON(!PageLocked(compound_head(p)));
- return p;
-}
-
-static inline void make_migration_entry_read(swp_entry_t *entry)
-{
- *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
+ return swp_entry(SWP_MIGRATION_WRITE, offset);
}
extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
@@ -217,37 +219,58 @@ extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
extern void migration_entry_wait_huge(struct vm_area_struct *vma,
struct mm_struct *mm, pte_t *pte);
#else
-
-#define make_migration_entry(page, write) swp_entry(0, 0)
-static inline int is_migration_entry(swp_entry_t swp)
+static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
- return 0;
+ return swp_entry(0, 0);
}
-static inline unsigned long migration_entry_to_pfn(swp_entry_t entry)
+static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
{
- return 0;
+ return swp_entry(0, 0);
}
-static inline struct page *migration_entry_to_page(swp_entry_t entry)
+static inline int is_migration_entry(swp_entry_t swp)
{
- return NULL;
+ return 0;
}
-static inline void make_migration_entry_read(swp_entry_t *entryp) { }
static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl) { }
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
struct mm_struct *mm, pte_t *pte) { }
-static inline int is_write_migration_entry(swp_entry_t entry)
+static inline int is_writable_migration_entry(swp_entry_t entry)
{
return 0;
}
#endif
+static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
+{
+ struct page *p = pfn_to_page(swp_offset(entry));
+
+ /*
+ * Any use of migration entries may only occur while the
+ * corresponding page is locked
+ */
+ BUG_ON(is_migration_entry(entry) && !PageLocked(p));
+
+ return p;
+}
+
+/*
+ * A pfn swap entry is a special type of swap entry that always has a pfn stored
+ * in the swap offset. They are used to represent unaddressable device memory
+ * and to restrict access to a page undergoing migration.
+ */
+static inline bool is_pfn_swap_entry(swp_entry_t entry)
+{
+ return is_migration_entry(entry) || is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry);
+}
+
struct page_vma_mapped_walk;
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -265,6 +288,8 @@ static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
if (pmd_swp_soft_dirty(pmd))
pmd = pmd_swp_clear_soft_dirty(pmd);
+ if (pmd_swp_uffd_wp(pmd))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
arch_entry = __pmd_to_swp_entry(pmd);
return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
}
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 157762db9d4b..0999f6317978 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -9,6 +9,7 @@
#define _LINUX_THREAD_INFO_H
#include <linux/types.h>
+#include <linux/limits.h>
#include <linux/bug.h>
#include <linux/restart_block.h>
#include <linux/errno.h>
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 794d1538b8ba..331d2ccf0bcc 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -53,6 +53,11 @@ enum mcopy_atomic_mode {
MCOPY_ATOMIC_CONTINUE,
};
+extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, struct page *page,
+ bool newly_allocated, bool wp_copy);
+
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
bool *mmap_changing, __u64 mode);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index bfaaf0b6fa76..1dabd6f22486 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -104,6 +104,21 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot)
}
#endif
+#ifndef arch_vmap_pte_range_map_size
+static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end,
+ u64 pfn, unsigned int max_page_shift)
+{
+ return PAGE_SIZE;
+}
+#endif
+
+#ifndef arch_vmap_pte_supported_shift
+static inline int arch_vmap_pte_supported_shift(unsigned long size)
+{
+ return PAGE_SHIFT;
+}
+#endif
+
/*
* Highlevel APIs for driver use
*/
diff --git a/include/linux/zbud.h b/include/linux/zbud.h
deleted file mode 100644
index b1eaf6e31735..000000000000
--- a/include/linux/zbud.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ZBUD_H_
-#define _ZBUD_H_
-
-#include <linux/types.h>
-
-struct zbud_pool;
-
-struct zbud_ops {
- int (*evict)(struct zbud_pool *pool, unsigned long handle);
-};
-
-struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops);
-void zbud_destroy_pool(struct zbud_pool *pool);
-int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
- unsigned long *handle);
-void zbud_free(struct zbud_pool *pool, unsigned long handle);
-int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
-void *zbud_map(struct zbud_pool *pool, unsigned long handle);
-void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
-u64 zbud_get_pool_size(struct zbud_pool *pool);
-
-#endif /* _ZBUD_H_ */
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 00d1180527d8..88faf2400ec2 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -423,47 +423,6 @@ TRACE_EVENT(mm_vmscan_lru_shrink_active,
show_reclaim_flags(__entry->reclaim_flags))
);
-TRACE_EVENT(mm_vmscan_inactive_list_is_low,
-
- TP_PROTO(int nid, int reclaim_idx,
- unsigned long total_inactive, unsigned long inactive,
- unsigned long total_active, unsigned long active,
- unsigned long ratio, int file),
-
- TP_ARGS(nid, reclaim_idx, total_inactive, inactive, total_active, active, ratio, file),
-
- TP_STRUCT__entry(
- __field(int, nid)
- __field(int, reclaim_idx)
- __field(unsigned long, total_inactive)
- __field(unsigned long, inactive)
- __field(unsigned long, total_active)
- __field(unsigned long, active)
- __field(unsigned long, ratio)
- __field(int, reclaim_flags)
- ),
-
- TP_fast_assign(
- __entry->nid = nid;
- __entry->reclaim_idx = reclaim_idx;
- __entry->total_inactive = total_inactive;
- __entry->inactive = inactive;
- __entry->total_active = total_active;
- __entry->active = active;
- __entry->ratio = ratio;
- __entry->reclaim_flags = trace_reclaim_flags(file) &
- RECLAIM_WB_LRU;
- ),
-
- TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s",
- __entry->nid,
- __entry->reclaim_idx,
- __entry->total_inactive, __entry->inactive,
- __entry->total_active, __entry->active,
- __entry->ratio,
- show_reclaim_flags(__entry->reclaim_flags))
-);
-
TRACE_EVENT(mm_vmscan_node_reclaim_begin,
TP_PROTO(int nid, int order, gfp_t gfp_flags),
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f94f65d429be..1567a3294c3d 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -72,6 +72,9 @@
#define MADV_COLD 20 /* deactivate these pages */
#define MADV_PAGEOUT 21 /* reclaim these pages */
+#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */
+#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 4832fd0b5642..19a00bc7fe86 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -60,7 +60,6 @@ enum {
* are never OR'ed into the mode in mempolicy API arguments.
*/
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
-#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 650480f41f1d..05b31d60acf6 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -31,7 +31,8 @@
UFFD_FEATURE_MISSING_SHMEM | \
UFFD_FEATURE_SIGBUS | \
UFFD_FEATURE_THREAD_ID | \
- UFFD_FEATURE_MINOR_HUGETLBFS)
+ UFFD_FEATURE_MINOR_HUGETLBFS | \
+ UFFD_FEATURE_MINOR_SHMEM)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -185,6 +186,9 @@ struct uffdio_api {
* UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
* can be intercepted (via REGISTER_MODE_MINOR) for
* hugetlbfs-backed pages.
+ *
+ * UFFD_FEATURE_MINOR_SHMEM indicates the same support as
+ * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -196,6 +200,7 @@ struct uffdio_api {
#define UFFD_FEATURE_SIGBUS (1<<7)
#define UFFD_FEATURE_THREAD_ID (1<<8)
#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9)
+#define UFFD_FEATURE_MINOR_SHMEM (1<<10)
__u64 features;
__u64 ioctls;
diff --git a/init/main.c b/init/main.c
index 359358500e54..f88d896c0123 100644
--- a/init/main.c
+++ b/init/main.c
@@ -873,6 +873,47 @@ void __init __weak arch_call_rest_init(void)
rest_init();
}
+static void __init print_unknown_bootoptions(void)
+{
+ char *unknown_options;
+ char *end;
+ const char *const *p;
+ size_t len;
+
+ if (panic_later || (!argv_init[1] && !envp_init[2]))
+ return;
+
+ /*
+ * Determine how many options we have to print out, plus a space
+ * before each
+ */
+ len = 1; /* null terminator */
+ for (p = &argv_init[1]; *p; p++) {
+ len++;
+ len += strlen(*p);
+ }
+ for (p = &envp_init[2]; *p; p++) {
+ len++;
+ len += strlen(*p);
+ }
+
+ unknown_options = memblock_alloc(len, SMP_CACHE_BYTES);
+ if (!unknown_options) {
+ pr_err("%s: Failed to allocate %zu bytes\n",
+ __func__, len);
+ return;
+ }
+ end = unknown_options;
+
+ for (p = &argv_init[1]; *p; p++)
+ end += sprintf(end, " %s", *p);
+ for (p = &envp_init[2]; *p; p++)
+ end += sprintf(end, " %s", *p);
+
+ pr_notice("Unknown command line parameters:%s\n", unknown_options);
+ memblock_free(__pa(unknown_options), len);
+}
+
asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{
char *command_line;
@@ -914,6 +955,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
static_command_line, __start___param,
__stop___param - __start___param,
-1, -1, NULL, &unknown_bootoption);
+ print_unknown_bootoptions();
if (!IS_ERR_OR_NULL(after_dashes))
parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
NULL, set_init_arg);
diff --git a/ipc/msg.c b/ipc/msg.c
index 6e6c8e0c9380..6810276d6bb9 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -130,7 +130,7 @@ static void msg_rcu_free(struct rcu_head *head)
struct msg_queue *msq = container_of(p, struct msg_queue, q_perm);
security_msg_queue_free(&msq->q_perm);
- kvfree(msq);
+ kfree(msq);
}
/**
@@ -147,7 +147,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
key_t key = params->key;
int msgflg = params->flg;
- msq = kvmalloc(sizeof(*msq), GFP_KERNEL);
+ msq = kmalloc(sizeof(*msq), GFP_KERNEL);
if (unlikely(!msq))
return -ENOMEM;
@@ -157,7 +157,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
msq->q_perm.security = NULL;
retval = security_msg_queue_alloc(&msq->q_perm);
if (retval) {
- kvfree(msq);
+ kfree(msq);
return retval;
}
diff --git a/ipc/sem.c b/ipc/sem.c
index bf534c74293e..971e75d28364 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -217,6 +217,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
* this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
* is inside a spin_lock() and after a write from 0 to non-zero a
* spin_lock()+spin_unlock() is done.
+ * To prevent the compiler/cpu temporarily writing 0 to use_global_lock,
+ * READ_ONCE()/WRITE_ONCE() is used.
*
* 2) queue.status: (SEM_BARRIER_2)
* Initialization is done while holding sem_lock(), so no further barrier is
@@ -342,10 +344,10 @@ static void complexmode_enter(struct sem_array *sma)
* Nothing to do, just reset the
* counter until we return to simple mode.
*/
- sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
+ WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS);
return;
}
- sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
+ WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS);
for (i = 0; i < sma->sem_nsems; i++) {
sem = &sma->sems[i];
@@ -371,7 +373,8 @@ static void complexmode_tryleave(struct sem_array *sma)
/* See SEM_BARRIER_1 for purpose/pairing */
smp_store_release(&sma->use_global_lock, 0);
} else {
- sma->use_global_lock--;
+ WRITE_ONCE(sma->use_global_lock,
+ sma->use_global_lock-1);
}
}
@@ -412,7 +415,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
* Initial check for use_global_lock. Just an optimization,
* no locking, no memory barrier.
*/
- if (!sma->use_global_lock) {
+ if (!READ_ONCE(sma->use_global_lock)) {
/*
* It appears that no complex operation is around.
* Acquire the per-semaphore lock.
@@ -1154,7 +1157,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
un->semid = -1;
list_del_rcu(&un->list_proc);
spin_unlock(&un->ulp->lock);
- kfree_rcu(un, rcu);
+ kvfree_rcu(un, rcu);
}
/* Wake up all pending processes and let them fail with EIDRM. */
@@ -1937,7 +1940,8 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
rcu_read_unlock();
/* step 2: allocate new undo structure */
- new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
+ new = kvzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems,
+ GFP_KERNEL);
if (!new) {
ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
return ERR_PTR(-ENOMEM);
@@ -1949,7 +1953,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
- kfree(new);
+ kvfree(new);
un = ERR_PTR(-EIDRM);
goto out;
}
@@ -1960,7 +1964,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
*/
un = lookup_undo(ulp, semid);
if (un) {
- kfree(new);
+ kvfree(new);
goto success;
}
/* step 5: initialize & link new undo structure */
@@ -2420,7 +2424,7 @@ void exit_sem(struct task_struct *tsk)
rcu_read_unlock();
wake_up_q(&wake_q);
- kfree_rcu(un, rcu);
+ kvfree_rcu(un, rcu);
}
kfree(ulp);
}
@@ -2435,7 +2439,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
/*
* The proc interface isn't aware of sem_lock(), it calls
- * ipc_lock_object() directly (in sysvipc_find_ipc).
+ * ipc_lock_object(), i.e. spin_lock(&sma->sem_perm.lock).
+ * (in sysvipc_find_ipc)
* In order to stay compatible with sem_lock(), we must
* enter / leave complex_mode.
*/
diff --git a/ipc/shm.c b/ipc/shm.c
index 003234fbbd17..748933e376ca 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -222,7 +222,7 @@ static void shm_rcu_free(struct rcu_head *head)
struct shmid_kernel *shp = container_of(ptr, struct shmid_kernel,
shm_perm);
security_shm_free(&shp->shm_perm);
- kvfree(shp);
+ kfree(shp);
}
static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
@@ -619,7 +619,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
ns->shm_tot + numpages > ns->shm_ctlall)
return -ENOSPC;
- shp = kvmalloc(sizeof(*shp), GFP_KERNEL);
+ shp = kmalloc(sizeof(*shp), GFP_KERNEL);
if (unlikely(!shp))
return -ENOMEM;
@@ -630,7 +630,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
shp->shm_perm.security = NULL;
error = security_shm_alloc(&shp->shm_perm);
if (error) {
- kvfree(shp);
+ kfree(shp);
return error;
}
diff --git a/ipc/util.c b/ipc/util.c
index cfa0045e748d..0027e47626b7 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -64,6 +64,7 @@
#include <linux/memory.h>
#include <linux/ipc_namespace.h>
#include <linux/rhashtable.h>
+#include <linux/log2.h>
#include <asm/unistd.h>
@@ -451,6 +452,41 @@ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
}
/**
+ * ipc_search_maxidx - search for the highest assigned index
+ * @ids: ipc identifier set
+ * @limit: known upper limit for highest assigned index
+ *
+ * The function determines the highest assigned index in @ids. It is intended
+ * to be called when ids->max_idx needs to be updated.
+ * Updating ids->max_idx is necessary when the current highest index ipc
+ * object is deleted.
+ * If no ipc object is allocated, then -1 is returned.
+ *
+ * ipc_ids.rwsem needs to be held by the caller.
+ */
+static int ipc_search_maxidx(struct ipc_ids *ids, int limit)
+{
+ int tmpidx;
+ int i;
+ int retval;
+
+ i = ilog2(limit+1);
+
+ retval = 0;
+ for (; i >= 0; i--) {
+ tmpidx = retval | (1<<i);
+ /*
+ * "0" is a possible index value, thus search using
+ * e.g. 15,7,3,1,0 instead of 16,8,4,2,1.
+ */
+ tmpidx = tmpidx-1;
+ if (idr_get_next(&ids->ipcs_idr, &tmpidx))
+ retval |= (1<<i);
+ }
+ return retval - 1;
+}
+
+/**
* ipc_rmid - remove an ipc identifier
* @ids: ipc identifier set
* @ipcp: ipc perm structure containing the identifier to remove
@@ -468,11 +504,9 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
ipcp->deleted = true;
if (unlikely(idx == ids->max_idx)) {
- do {
- idx--;
- if (idx == -1)
- break;
- } while (!idr_find(&ids->ipcs_idr, idx));
+ idx = ids->max_idx-1;
+ if (idx >= 0)
+ idx = ipc_search_maxidx(ids, idx);
ids->max_idx = idx;
}
}
diff --git a/ipc/util.h b/ipc/util.h
index 5766c61aed0e..2dd7ce0416d8 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -145,6 +145,9 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
* ipc_get_maxidx - get the highest assigned index
* @ids: ipc identifier set
*
+ * The function returns the highest assigned index for @ids. The function
+ * doesn't scan the idr tree, it uses a cached value.
+ *
* Called with ipc_ids.rwsem held for reading.
*/
static inline int ipc_get_maxidx(struct ipc_ids *ids)
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index b0ce8b3f3822..9888e2bc8c76 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,6 +15,7 @@
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/export.h>
+#include <linux/panic_notifier.h>
#include <linux/sysctl.h>
#include <linux/suspend.h>
#include <linux/utsname.h>
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index f099baee3578..4b34a9aa32bc 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -26,6 +26,7 @@
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
+#include <linux/panic_notifier.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e41385afe79d..297dc8bbe333 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -106,7 +106,7 @@ void __weak *alloc_insn_page(void)
return module_alloc(PAGE_SIZE);
}
-void __weak free_insn_page(void *page)
+static void free_insn_page(void *page)
{
module_memfree(page);
}
diff --git a/kernel/panic.c b/kernel/panic.c
index 332736a72a58..edad89660a2b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,6 +23,7 @@
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/kexec.h>
+#include <linux/panic_notifier.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
#include <linux/init.h>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e78b2430c16..f12056beb916 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -32,6 +32,8 @@
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
+#include <linux/panic.h>
+#include <linux/panic_notifier.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index de0920353d30..f6371dfa1f89 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2830,6 +2830,8 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
sigaddset(&blocked, ksig->sig);
set_current_blocked(&blocked);
+ if (current->sas_ss_flags & SS_AUTODISARM)
+ sas_ss_reset(current);
tracehook_signal_handler(stepping);
}
@@ -4148,11 +4150,7 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
__put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
- if (err)
- return err;
- if (t->sas_ss_flags & SS_AUTODISARM)
- sas_ss_reset(t);
- return 0;
+ return err;
}
#ifdef CONFIG_COMPAT
@@ -4207,11 +4205,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
&uss->ss_sp) |
__put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
- if (err)
- return err;
- if (t->sas_ss_flags & SS_AUTODISARM)
- sas_ss_reset(t);
- return 0;
+ return err;
}
#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e1aa24e1545c..272f4a272f8c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
#include <linux/sysctl.h>
#include <linux/bitmap.h>
#include <linux/signal.h>
+#include <linux/panic.h>
#include <linux/printk.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
@@ -1495,7 +1496,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int err = 0;
- bool first = 1;
size_t left = *lenp;
unsigned long bitmap_len = table->maxlen;
unsigned long *bitmap = *(unsigned long **) table->data;
@@ -1580,12 +1580,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
- first = 0;
proc_skip_char(&p, &left, '\n');
}
left += skipped;
} else {
unsigned long bit_a, bit_b = 0;
+ bool first = 1;
while (left) {
bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d23a09d3eb37..3c1384bc5c5a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/init.h>
+#include <linux/panic_notifier.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1c9857fdb1a0..ae62debfd8bf 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2446,6 +2446,18 @@ config SLUB_KUNIT_TEST
If unsure, say N.
+config RATIONAL_KUNIT_TEST
+ tristate "KUnit test for rational.c" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ select RATIONAL
+ default KUNIT_ALL_TESTS
+ help
+ This builds the rational math unit test.
+ For more information on KUnit and unit tests in general please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+ If unsure, say N.
+
config TEST_UDELAY
tristate "udelay test driver"
help
diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c
index c72c865032fa..3518e7394eca 100644
--- a/lib/decompress_bunzip2.c
+++ b/lib/decompress_bunzip2.c
@@ -80,7 +80,7 @@
/* This is what we know about each Huffman coding group */
struct group_data {
- /* We have an extra slot at the end of limit[] for a sentinal value. */
+ /* We have an extra slot at the end of limit[] for a sentinel value. */
int limit[MAX_HUFCODE_BITS+1];
int base[MAX_HUFCODE_BITS];
int permute[MAX_SYMBOLS];
@@ -337,7 +337,7 @@ static int INIT get_next_block(struct bunzip_data *bd)
pp <<= 1;
base[i+1] = pp-(t += temp[i]);
}
- limit[maxLen+1] = INT_MAX; /* Sentinal value for
+ limit[maxLen+1] = INT_MAX; /* Sentinel value for
* reading next sym. */
limit[maxLen] = pp+temp[maxLen]-1;
base[minLen] = 0;
@@ -385,7 +385,7 @@ static int INIT get_next_block(struct bunzip_data *bd)
bd->inbufBits =
(bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
bd->inbufBitCount += 8;
- };
+ }
bd->inbufBitCount -= hufGroup->maxLen;
j = (bd->inbufBits >> bd->inbufBitCount)&
((1 << hufGroup->maxLen)-1);
diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c
index c0cfcfd486be..e6327391b6b6 100644
--- a/lib/decompress_unlz4.c
+++ b/lib/decompress_unlz4.c
@@ -112,6 +112,9 @@ STATIC inline int INIT unlz4(u8 *input, long in_len,
error("data corrupted");
goto exit_2;
}
+ } else if (size < 4) {
+ /* empty or end-of-file */
+ goto exit_3;
}
chunksize = get_unaligned_le32(inp);
@@ -125,6 +128,10 @@ STATIC inline int INIT unlz4(u8 *input, long in_len,
continue;
}
+ if (!fill && chunksize == 0) {
+ /* empty or end-of-file */
+ goto exit_3;
+ }
if (posp)
*posp += 4;
@@ -184,6 +191,7 @@ STATIC inline int INIT unlz4(u8 *input, long in_len,
}
}
+exit_3:
ret = 0;
exit_2:
if (!input)
diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c
index 1f439a622076..64c1358500ce 100644
--- a/lib/decompress_unlzo.c
+++ b/lib/decompress_unlzo.c
@@ -43,7 +43,6 @@ STATIC inline long INIT parse_header(u8 *input, long *skip, long in_len)
int l;
u8 *parse = input;
u8 *end = input + in_len;
- u8 level = 0;
u16 version;
/*
@@ -65,7 +64,7 @@ STATIC inline long INIT parse_header(u8 *input, long *skip, long in_len)
version = get_unaligned_be16(parse);
parse += 7;
if (version >= 0x0940)
- level = *parse++;
+ parse++;
if (get_unaligned_be32(parse) & HEADER_HAS_FILTER)
parse += 8; /* flags + filter info */
else
diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c
index 25d59a95bd66..a2f38e23004a 100644
--- a/lib/decompress_unxz.c
+++ b/lib/decompress_unxz.c
@@ -23,7 +23,7 @@
* uncompressible. Thus, we must look for worst-case expansion when the
* compressor is encoding uncompressible data.
*
- * The structure of the .xz file in case of a compresed kernel is as follows.
+ * The structure of the .xz file in case of a compressed kernel is as follows.
* Sizes (as bytes) of the fields are in parenthesis.
*
* Stream Header (12)
diff --git a/lib/decompress_unzstd.c b/lib/decompress_unzstd.c
index 790abc472f5b..6b629ab31c1e 100644
--- a/lib/decompress_unzstd.c
+++ b/lib/decompress_unzstd.c
@@ -16,7 +16,7 @@
* uncompressible. Thus, we must look for worst-case expansion when the
* compressor is encoding uncompressible data.
*
- * The structure of the .zst file in case of a compresed kernel is as follows.
+ * The structure of the .zst file in case of a compressed kernel is as follows.
* Maximum sizes (as bytes) of the fields are in parenthesis.
*
* Frame Header: (18)
@@ -56,7 +56,7 @@
/*
* Preboot environments #include "path/to/decompress_unzstd.c".
* All of the source files we depend on must be #included.
- * zstd's only source dependeny is xxhash, which has no source
+ * zstd's only source dependency is xxhash, which has no source
* dependencies.
*
* When UNZSTD_PREBOOT is defined we declare __decompress(), which is
diff --git a/lib/kstrtox.c b/lib/kstrtox.c
index 0b5fe8b41173..059b8b00dc53 100644
--- a/lib/kstrtox.c
+++ b/lib/kstrtox.c
@@ -14,11 +14,12 @@
*/
#include <linux/ctype.h>
#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/math64.h>
#include <linux/export.h>
+#include <linux/kstrtox.h>
+#include <linux/math64.h>
#include <linux/types.h>
#include <linux/uaccess.h>
+
#include "kstrtox.h"
const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 8a7724a6ce2f..926f4823d5ea 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -481,7 +481,7 @@ int LZ4_decompress_fast(const char *source, char *dest, int originalSize)
/* ===== Instantiate a few more decoding cases, used more than once. ===== */
-int LZ4_decompress_safe_withPrefix64k(const char *source, char *dest,
+static int LZ4_decompress_safe_withPrefix64k(const char *source, char *dest,
int compressedSize, int maxOutputSize)
{
return LZ4_decompress_generic(source, dest,
diff --git a/lib/math/Makefile b/lib/math/Makefile
index 7456edb864fc..bfac26ddfc22 100644
--- a/lib/math/Makefile
+++ b/lib/math/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
obj-$(CONFIG_RATIONAL) += rational.o
obj-$(CONFIG_TEST_DIV64) += test_div64.o
+obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o
diff --git a/lib/math/rational-test.c b/lib/math/rational-test.c
new file mode 100644
index 000000000000..01611ddff420
--- /dev/null
+++ b/lib/math/rational-test.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <kunit/test.h>
+
+#include <linux/rational.h>
+
+struct rational_test_param {
+ unsigned long num, den;
+ unsigned long max_num, max_den;
+ unsigned long exp_num, exp_den;
+
+ const char *name;
+};
+
+static const struct rational_test_param test_parameters[] = {
+ { 1230, 10, 100, 20, 100, 1, "Exceeds bounds, semi-convergent term > 1/2 last term" },
+ { 34567,100, 120, 20, 120, 1, "Exceeds bounds, semi-convergent term < 1/2 last term" },
+ { 1, 30, 100, 10, 0, 1, "Closest to zero" },
+ { 1, 19, 100, 10, 1, 10, "Closest to smallest non-zero" },
+ { 27,32, 16, 16, 11, 13, "Use convergent" },
+ { 1155, 7735, 255, 255, 33, 221, "Exact answer" },
+ { 87, 32, 70, 32, 68, 25, "Semiconvergent, numerator limit" },
+ { 14533, 4626, 15000, 2400, 7433, 2366, "Semiconvergent, denominator limit" },
+};
+
+static void get_desc(const struct rational_test_param *param, char *desc)
+{
+ strscpy(desc, param->name, KUNIT_PARAM_DESC_SIZE);
+}
+
+/* Creates function rational_gen_params */
+KUNIT_ARRAY_PARAM(rational, test_parameters, get_desc);
+
+static void rational_test(struct kunit *test)
+{
+ const struct rational_test_param *param = (const struct rational_test_param *)test->param_value;
+ unsigned long n = 0, d = 0;
+
+ rational_best_approximation(param->num, param->den, param->max_num, param->max_den, &n, &d);
+ KUNIT_EXPECT_EQ(test, n, param->exp_num);
+ KUNIT_EXPECT_EQ(test, d, param->exp_den);
+}
+
+static struct kunit_case rational_test_cases[] = {
+ KUNIT_CASE_PARAM(rational_test, rational_gen_params),
+ {}
+};
+
+static struct kunit_suite rational_test_suite = {
+ .name = "rational",
+ .test_cases = rational_test_cases,
+};
+
+kunit_test_suites(&rational_test_suite);
+
+MODULE_LICENSE("GPL v2");
diff --git a/lib/math/rational.c b/lib/math/rational.c
index 9781d521963d..c0ab51d8fbb9 100644
--- a/lib/math/rational.c
+++ b/lib/math/rational.c
@@ -12,6 +12,7 @@
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/minmax.h>
+#include <linux/limits.h>
/*
* calculate best rational approximation for a given fraction
@@ -78,13 +79,18 @@ void rational_best_approximation(
* found below as 't'.
*/
if ((n2 > max_numerator) || (d2 > max_denominator)) {
- unsigned long t = min((max_numerator - n0) / n1,
- (max_denominator - d0) / d1);
+ unsigned long t = ULONG_MAX;
- /* This tests if the semi-convergent is closer
- * than the previous convergent.
+ if (d1)
+ t = (max_denominator - d0) / d1;
+ if (n1)
+ t = min(t, (max_numerator - n0) / n1);
+
+ /* This tests if the semi-convergent is closer than the previous
+ * convergent. If d1 is zero there is no previous convergent as this
+ * is the 1st iteration, so always choose the semi-convergent.
*/
- if (2u * t > a || (2u * t == a && d0 * dp > d1 * d)) {
+ if (!d1 || 2u * t > a || (2u * t == a && d0 * dp > d1 * d)) {
n1 = n0 + t * n1;
d1 = d0 + t * d1;
}
diff --git a/lib/mpi/longlong.h b/lib/mpi/longlong.h
index afbd99987cf8..b6fa1d08fb55 100644
--- a/lib/mpi/longlong.h
+++ b/lib/mpi/longlong.h
@@ -48,8 +48,8 @@
/* Define auxiliary asm macros.
*
- * 1) umul_ppmm(high_prod, low_prod, multipler, multiplicand) multiplies two
- * UWtype integers MULTIPLER and MULTIPLICAND, and generates a two UWtype
+ * 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
+ * UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
* word product in HIGH_PROD and LOW_PROD.
*
* 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c
index 7ea225b2204f..39c4c6731094 100644
--- a/lib/mpi/mpicoder.c
+++ b/lib/mpi/mpicoder.c
@@ -234,11 +234,11 @@ static int count_lzeros(MPI a)
}
/**
- * mpi_read_buffer() - read MPI to a bufer provided by user (msb first)
+ * mpi_read_buffer() - read MPI to a buffer provided by user (msb first)
*
* @a: a multi precision integer
- * @buf: bufer to which the output will be written to. Needs to be at
- * leaset mpi_get_size(a) long.
+ * @buf: buffer to which the output will be written to. Needs to be at
+ * least mpi_get_size(a) long.
* @buf_len: size of the buf.
* @nbytes: receives the actual length of the data written on success and
* the data to-be-written on -EOVERFLOW in case buf_len was too
diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c
index 3c63710c20c6..9a75ca3f7edf 100644
--- a/lib/mpi/mpiutil.c
+++ b/lib/mpi/mpiutil.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(mpi_const);
/****************
* Note: It was a bad idea to use the number of limbs to allocate
* because on a alpha the limbs are large but we normally need
- * integers of n bits - So we should chnage this to bits (or bytes).
+ * integers of n bits - So we should change this to bits (or bytes).
*
* But mpi_alloc is used in a lot of places :-)
*/
diff --git a/lib/parser.c b/lib/parser.c
index f1a6d90b8c34..bcb23484100e 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -6,6 +6,7 @@
#include <linux/ctype.h>
#include <linux/types.h>
#include <linux/export.h>
+#include <linux/kstrtox.h>
#include <linux/parser.h>
#include <linux/slab.h>
#include <linux/string.h>
diff --git a/lib/string.c b/lib/string.c
index 7548eb715ddb..77bd0b1d3296 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -977,7 +977,7 @@ void *memscan(void *addr, int c, size_t size)
unsigned char *p = addr;
while (size) {
- if (*p == c)
+ if (*p == (unsigned char)c)
return (void *)p;
p++;
size--;
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 7f2d5fbaf243..5a35c7e16e96 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -452,18 +452,20 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
* The process of escaping byte buffer includes several parts. They are applied
* in the following sequence.
*
- * 1. The character is matched to the printable class, if asked, and in
- * case of match it passes through to the output.
- * 2. The character is not matched to the one from @only string and thus
+ * 1. The character is not matched to the one from @only string and thus
* must go as-is to the output.
- * 3. The character is checked if it falls into the class given by @flags.
+ * 2. The character is matched to the printable and ASCII classes, if asked,
+ * and in case of match it passes through to the output.
+ * 3. The character is matched to the printable or ASCII class, if asked,
+ * and in case of match it passes through to the output.
+ * 4. The character is checked if it falls into the class given by @flags.
* %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any
* character. Note that they actually can't go together, otherwise
* %ESCAPE_HEX will be ignored.
*
* Caller must provide valid source and destination pointers. Be aware that
* destination buffer will not be NULL-terminated, thus caller have to append
- * it if needs. The supported flags are::
+ * it if needs. The supported flags are::
*
* %ESCAPE_SPACE: (special white space, not space itself)
* '\f' - form feed
@@ -482,11 +484,27 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
* %ESCAPE_ANY:
* all previous together
* %ESCAPE_NP:
- * escape only non-printable characters (checked by isprint)
+ * escape only non-printable characters, checked by isprint()
* %ESCAPE_ANY_NP:
* all previous together
* %ESCAPE_HEX:
* '\xHH' - byte with hexadecimal value HH (2 digits)
+ * %ESCAPE_NA:
+ * escape only non-ascii characters, checked by isascii()
+ * %ESCAPE_NAP:
+ * escape only non-printable or non-ascii characters
+ * %ESCAPE_APPEND:
+ * append characters from @only to be escaped by the given classes
+ *
+ * %ESCAPE_APPEND would help to pass additional characters to the escaped, when
+ * one of %ESCAPE_NP, %ESCAPE_NA, or %ESCAPE_NAP is provided.
+ *
+ * One notable caveat, the %ESCAPE_NAP, %ESCAPE_NP and %ESCAPE_NA have the
+ * higher priority than the rest of the flags (%ESCAPE_NAP is the highest).
+ * It doesn't make much sense to use either of them without %ESCAPE_OCTAL
+ * or %ESCAPE_HEX, because they cover most of the other character classes.
+ * %ESCAPE_NAP can utilize %ESCAPE_SPACE or %ESCAPE_SPECIAL in addition to
+ * the above.
*
* Return:
* The total size of the escaped output that would be generated for
@@ -500,67 +518,69 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
char *p = dst;
char *end = p + osz;
bool is_dict = only && *only;
+ bool is_append = flags & ESCAPE_APPEND;
while (isz--) {
unsigned char c = *src++;
+ bool in_dict = is_dict && strchr(only, c);
/*
* Apply rules in the following sequence:
- * - the character is printable, when @flags has
- * %ESCAPE_NP bit set
* - the @only string is supplied and does not contain a
* character under question
+ * - the character is printable and ASCII, when @flags has
+ * %ESCAPE_NAP bit set
+ * - the character is printable, when @flags has
+ * %ESCAPE_NP bit set
+ * - the character is ASCII, when @flags has
+ * %ESCAPE_NA bit set
* - the character doesn't fall into a class of symbols
* defined by given @flags
* In these cases we just pass through a character to the
* output buffer.
+ *
+ * When %ESCAPE_APPEND is passed, the characters from @only
+ * have been excluded from the %ESCAPE_NAP, %ESCAPE_NP, and
+ * %ESCAPE_NA cases.
*/
- if ((flags & ESCAPE_NP && isprint(c)) ||
- (is_dict && !strchr(only, c))) {
- /* do nothing */
- } else {
- if (flags & ESCAPE_SPACE && escape_space(c, &p, end))
- continue;
+ if (!(is_append || in_dict) && is_dict &&
+ escape_passthrough(c, &p, end))
+ continue;
- if (flags & ESCAPE_SPECIAL && escape_special(c, &p, end))
- continue;
+ if (!(is_append && in_dict) && isascii(c) && isprint(c) &&
+ flags & ESCAPE_NAP && escape_passthrough(c, &p, end))
+ continue;
- if (flags & ESCAPE_NULL && escape_null(c, &p, end))
- continue;
+ if (!(is_append && in_dict) && isprint(c) &&
+ flags & ESCAPE_NP && escape_passthrough(c, &p, end))
+ continue;
- /* ESCAPE_OCTAL and ESCAPE_HEX always go last */
- if (flags & ESCAPE_OCTAL && escape_octal(c, &p, end))
- continue;
+ if (!(is_append && in_dict) && isascii(c) &&
+ flags & ESCAPE_NA && escape_passthrough(c, &p, end))
+ continue;
- if (flags & ESCAPE_HEX && escape_hex(c, &p, end))
- continue;
- }
+ if (flags & ESCAPE_SPACE && escape_space(c, &p, end))
+ continue;
- escape_passthrough(c, &p, end);
- }
+ if (flags & ESCAPE_SPECIAL && escape_special(c, &p, end))
+ continue;
- return p - dst;
-}
-EXPORT_SYMBOL(string_escape_mem);
+ if (flags & ESCAPE_NULL && escape_null(c, &p, end))
+ continue;
-int string_escape_mem_ascii(const char *src, size_t isz, char *dst,
- size_t osz)
-{
- char *p = dst;
- char *end = p + osz;
+ /* ESCAPE_OCTAL and ESCAPE_HEX always go last */
+ if (flags & ESCAPE_OCTAL && escape_octal(c, &p, end))
+ continue;
- while (isz--) {
- unsigned char c = *src++;
+ if (flags & ESCAPE_HEX && escape_hex(c, &p, end))
+ continue;
- if (!isprint(c) || !isascii(c) || c == '"' || c == '\\')
- escape_hex(c, &p, end);
- else
- escape_passthrough(c, &p, end);
+ escape_passthrough(c, &p, end);
}
return p - dst;
}
-EXPORT_SYMBOL(string_escape_mem_ascii);
+EXPORT_SYMBOL(string_escape_mem);
/*
* Return an allocated string that has been escaped of special characters
diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c
index 10360d4ea273..2185d71704f0 100644
--- a/lib/test-string_helpers.c
+++ b/lib/test-string_helpers.c
@@ -19,7 +19,7 @@ static __init bool test_string_check_buf(const char *name, unsigned int flags,
if (q_real == q_test && !memcmp(out_test, out_real, q_test))
return true;
- pr_warn("Test '%s' failed: flags = %u\n", name, flags);
+ pr_warn("Test '%s' failed: flags = %#x\n", name, flags);
print_hex_dump(KERN_WARNING, "Input: ", DUMP_PREFIX_NONE, 16, 1,
in, p, true);
@@ -136,7 +136,7 @@ static const struct test_string_2 escape0[] __initconst = {{
.flags = ESCAPE_SPACE | ESCAPE_HEX,
},{
/* terminator */
- }},
+ }}
},{
.in = "\\h\\\"\a\e\\",
.s1 = {{
@@ -150,7 +150,7 @@ static const struct test_string_2 escape0[] __initconst = {{
.flags = ESCAPE_SPECIAL | ESCAPE_HEX,
},{
/* terminator */
- }},
+ }}
},{
.in = "\eb \\C\007\"\x90\r]",
.s1 = {{
@@ -201,12 +201,26 @@ static const struct test_string_2 escape0[] __initconst = {{
.flags = ESCAPE_NP | ESCAPE_HEX,
},{
/* terminator */
- }},
+ }}
+},{
+ .in = "\007 \eb\"\x90\xCF\r",
+ .s1 = {{
+ .out = "\007 \eb\"\\220\\317\r",
+ .flags = ESCAPE_OCTAL | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\\x90\\xcf\r",
+ .flags = ESCAPE_HEX | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\xCF\r",
+ .flags = ESCAPE_NA,
+ },{
+ /* terminator */
+ }}
},{
/* terminator */
}};
-#define TEST_STRING_2_DICT_1 "b\\ \t\r"
+#define TEST_STRING_2_DICT_1 "b\\ \t\r\xCF"
static const struct test_string_2 escape1[] __initconst = {{
.in = "\f\\ \n\r\t\v",
.s1 = {{
@@ -216,16 +230,40 @@ static const struct test_string_2 escape1[] __initconst = {{
.out = "\f\\x5c\\x20\n\\x0d\\x09\v",
.flags = ESCAPE_HEX,
},{
+ .out = "\f\\134\\040\n\\015\\011\v",
+ .flags = ESCAPE_ANY | ESCAPE_APPEND,
+ },{
+ .out = "\\014\\134\\040\\012\\015\\011\\013",
+ .flags = ESCAPE_OCTAL | ESCAPE_APPEND | ESCAPE_NAP,
+ },{
+ .out = "\\x0c\\x5c\\x20\\x0a\\x0d\\x09\\x0b",
+ .flags = ESCAPE_HEX | ESCAPE_APPEND | ESCAPE_NAP,
+ },{
+ .out = "\f\\134\\040\n\\015\\011\v",
+ .flags = ESCAPE_OCTAL | ESCAPE_APPEND | ESCAPE_NA,
+ },{
+ .out = "\f\\x5c\\x20\n\\x0d\\x09\v",
+ .flags = ESCAPE_HEX | ESCAPE_APPEND | ESCAPE_NA,
+ },{
/* terminator */
- }},
+ }}
},{
- .in = "\\h\\\"\a\e\\",
+ .in = "\\h\\\"\a\xCF\e\\",
.s1 = {{
- .out = "\\134h\\134\"\a\e\\134",
+ .out = "\\134h\\134\"\a\\317\e\\134",
.flags = ESCAPE_OCTAL,
},{
+ .out = "\\134h\\134\"\a\\317\e\\134",
+ .flags = ESCAPE_ANY | ESCAPE_APPEND,
+ },{
+ .out = "\\134h\\134\"\\007\\317\\033\\134",
+ .flags = ESCAPE_OCTAL | ESCAPE_APPEND | ESCAPE_NAP,
+ },{
+ .out = "\\134h\\134\"\a\\317\e\\134",
+ .flags = ESCAPE_OCTAL | ESCAPE_APPEND | ESCAPE_NA,
+ },{
/* terminator */
- }},
+ }}
},{
.in = "\eb \\C\007\"\x90\r]",
.s1 = {{
@@ -233,7 +271,89 @@ static const struct test_string_2 escape1[] __initconst = {{
.flags = ESCAPE_OCTAL,
},{
/* terminator */
- }},
+ }}
+},{
+ .in = "\007 \eb\"\x90\xCF\r",
+ .s1 = {{
+ .out = "\007 \eb\"\x90\xCF\r",
+ .flags = ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\xCF\r",
+ .flags = ESCAPE_SPACE | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\xCF\r",
+ .flags = ESCAPE_SPECIAL | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\xCF\r",
+ .flags = ESCAPE_SPACE | ESCAPE_SPECIAL | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\\317\r",
+ .flags = ESCAPE_OCTAL | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\\317\r",
+ .flags = ESCAPE_SPACE | ESCAPE_OCTAL | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\\317\r",
+ .flags = ESCAPE_SPECIAL | ESCAPE_OCTAL | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\\317\r",
+ .flags = ESCAPE_ANY | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\\xcf\r",
+ .flags = ESCAPE_HEX | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\\xcf\r",
+ .flags = ESCAPE_SPACE | ESCAPE_HEX | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\\xcf\r",
+ .flags = ESCAPE_SPECIAL | ESCAPE_HEX | ESCAPE_NA,
+ },{
+ .out = "\007 \eb\"\x90\\xcf\r",
+ .flags = ESCAPE_SPACE | ESCAPE_SPECIAL | ESCAPE_HEX | ESCAPE_NA,
+ },{
+ /* terminator */
+ }}
+},{
+ .in = "\007 \eb\"\x90\xCF\r",
+ .s1 = {{
+ .out = "\007 \eb\"\x90\xCF\r",
+ .flags = ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\xCF\\r",
+ .flags = ESCAPE_SPACE | ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\xCF\r",
+ .flags = ESCAPE_SPECIAL | ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\xCF\\r",
+ .flags = ESCAPE_SPACE | ESCAPE_SPECIAL | ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\\317\\015",
+ .flags = ESCAPE_OCTAL | ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\\317\\r",
+ .flags = ESCAPE_SPACE | ESCAPE_OCTAL | ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\\317\\015",
+ .flags = ESCAPE_SPECIAL | ESCAPE_OCTAL | ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\\317\r",
+ .flags = ESCAPE_ANY | ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\\xcf\\x0d",
+ .flags = ESCAPE_HEX | ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\\xcf\\r",
+ .flags = ESCAPE_SPACE | ESCAPE_HEX | ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\\xcf\\x0d",
+ .flags = ESCAPE_SPECIAL | ESCAPE_HEX | ESCAPE_NAP,
+ },{
+ .out = "\007 \eb\"\x90\\xcf\\r",
+ .flags = ESCAPE_SPACE | ESCAPE_SPECIAL | ESCAPE_HEX | ESCAPE_NAP,
+ },{
+ /* terminator */
+ }}
},{
/* terminator */
}};
@@ -290,7 +410,7 @@ test_string_escape_overflow(const char *in, int p, unsigned int flags, const cha
q_real = string_escape_mem(in, p, NULL, 0, flags, esc);
if (q_real != q_test)
- pr_warn("Test '%s' failed: flags = %u, osz = 0, expected %d, got %d\n",
+ pr_warn("Test '%s' failed: flags = %#x, osz = 0, expected %d, got %d\n",
name, flags, q_test, q_real);
}
@@ -315,8 +435,13 @@ static __init void test_string_escape(const char *name,
/* NULL injection */
if (flags & ESCAPE_NULL) {
in[p++] = '\0';
- out_test[q_test++] = '\\';
- out_test[q_test++] = '0';
+ /* '\0' passes isascii() test */
+ if (flags & ESCAPE_NA && !(flags & ESCAPE_APPEND && esc)) {
+ out_test[q_test++] = '\0';
+ } else {
+ out_test[q_test++] = '\\';
+ out_test[q_test++] = '0';
+ }
}
/* Don't try strings that have no output */
@@ -459,17 +584,17 @@ static int __init test_string_helpers_init(void)
unsigned int i;
pr_info("Running tests...\n");
- for (i = 0; i < UNESCAPE_ANY + 1; i++)
+ for (i = 0; i < UNESCAPE_ALL_MASK + 1; i++)
test_string_unescape("unescape", i, false);
test_string_unescape("unescape inplace",
get_random_int() % (UNESCAPE_ANY + 1), true);
/* Without dictionary */
- for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++)
+ for (i = 0; i < ESCAPE_ALL_MASK + 1; i++)
test_string_escape("escape 0", escape0, i, TEST_STRING_2_DICT_0);
/* With dictionary */
- for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++)
+ for (i = 0; i < ESCAPE_ALL_MASK + 1; i++)
test_string_escape("escape 1", escape1, i, TEST_STRING_2_DICT_1);
/* Test string_get_size() */
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 15f2e2db77bc..8c55c4723692 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -25,6 +25,7 @@
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>
+#include <linux/rmap.h>
#include "test_hmm_uapi.h"
@@ -46,6 +47,7 @@ struct dmirror_bounce {
unsigned long cpages;
};
+#define DPT_XA_TAG_ATOMIC 1UL
#define DPT_XA_TAG_WRITE 3UL
/*
@@ -218,7 +220,7 @@ static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
* the invalidation is handled as part of the migration process.
*/
if (range->event == MMU_NOTIFY_MIGRATE &&
- range->migrate_pgmap_owner == dmirror->mdevice)
+ range->owner == dmirror->mdevice)
return true;
if (mmu_notifier_range_blockable(range))
@@ -619,6 +621,54 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
}
}
+static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
+ unsigned long end)
+{
+ unsigned long pfn;
+
+ for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
+ void *entry;
+ struct page *page;
+
+ entry = xa_load(&dmirror->pt, pfn);
+ page = xa_untag_pointer(entry);
+ if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+static int dmirror_atomic_map(unsigned long start, unsigned long end,
+ struct page **pages, struct dmirror *dmirror)
+{
+ unsigned long pfn, mapped = 0;
+ int i;
+
+ /* Map the migrated pages into the device's page tables. */
+ mutex_lock(&dmirror->mutex);
+
+ for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) {
+ void *entry;
+
+ if (!pages[i])
+ continue;
+
+ entry = pages[i];
+ entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC);
+ entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
+ if (xa_is_err(entry)) {
+ mutex_unlock(&dmirror->mutex);
+ return xa_err(entry);
+ }
+
+ mapped++;
+ }
+
+ mutex_unlock(&dmirror->mutex);
+ return mapped;
+}
+
static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
struct dmirror *dmirror)
{
@@ -661,6 +711,72 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
return 0;
}
+static int dmirror_exclusive(struct dmirror *dmirror,
+ struct hmm_dmirror_cmd *cmd)
+{
+ unsigned long start, end, addr;
+ unsigned long size = cmd->npages << PAGE_SHIFT;
+ struct mm_struct *mm = dmirror->notifier.mm;
+ struct page *pages[64];
+ struct dmirror_bounce bounce;
+ unsigned long next;
+ int ret;
+
+ start = cmd->addr;
+ end = start + size;
+ if (end < start)
+ return -EINVAL;
+
+ /* Since the mm is for the mirrored process, get a reference first. */
+ if (!mmget_not_zero(mm))
+ return -EINVAL;
+
+ mmap_read_lock(mm);
+ for (addr = start; addr < end; addr = next) {
+ unsigned long mapped;
+ int i;
+
+ if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT))
+ next = end;
+ else
+ next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT);
+
+ ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
+ mapped = dmirror_atomic_map(addr, next, pages, dmirror);
+ for (i = 0; i < ret; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ }
+
+ if (addr + (mapped << PAGE_SHIFT) < next) {
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return -EBUSY;
+ }
+ }
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ /* Return the migrated data for verification. */
+ ret = dmirror_bounce_init(&bounce, start, size);
+ if (ret)
+ return ret;
+ mutex_lock(&dmirror->mutex);
+ ret = dmirror_do_read(dmirror, start, end, &bounce);
+ mutex_unlock(&dmirror->mutex);
+ if (ret == 0) {
+ if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
+ bounce.size))
+ ret = -EFAULT;
+ }
+
+ cmd->cpages = bounce.cpages;
+ dmirror_bounce_fini(&bounce);
+ return ret;
+}
+
static int dmirror_migrate(struct dmirror *dmirror,
struct hmm_dmirror_cmd *cmd)
{
@@ -948,6 +1064,15 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
ret = dmirror_migrate(dmirror, &cmd);
break;
+ case HMM_DMIRROR_EXCLUSIVE:
+ ret = dmirror_exclusive(dmirror, &cmd);
+ break;
+
+ case HMM_DMIRROR_CHECK_EXCLUSIVE:
+ ret = dmirror_check_atomic(dmirror, cmd.addr,
+ cmd.addr + (cmd.npages << PAGE_SHIFT));
+ break;
+
case HMM_DMIRROR_SNAPSHOT:
ret = dmirror_snapshot(dmirror, &cmd);
break;
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index 670b4ef2a5b6..f14dea5dcd06 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -33,6 +33,8 @@ struct hmm_dmirror_cmd {
#define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
/*
* Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
diff --git a/lib/test_string.c b/lib/test_string.c
index 7b31f4a505bf..9dfd6f52de92 100644
--- a/lib/test_string.c
+++ b/lib/test_string.c
@@ -179,6 +179,10 @@ static __init int strnchr_selftest(void)
return 0;
}
+static __exit void string_selftest_remove(void)
+{
+}
+
static __init int string_selftest_init(void)
{
int test, subtest;
@@ -216,4 +220,5 @@ fail:
}
module_init(string_selftest_init);
+module_exit(string_selftest_remove);
MODULE_LICENSE("GPL v2");
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index e5c7afbf7405..2926cc27623f 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -86,6 +86,7 @@ static unsigned long long simple_strntoull(const char *startp, size_t max_chars,
*
* This function has caveats. Please use kstrtoull instead.
*/
+noinline
unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
{
return simple_strntoull(cp, INT_MAX, endp, base);
diff --git a/lib/xz/xz_dec_bcj.c b/lib/xz/xz_dec_bcj.c
index 72ddac6ef2ec..ef449e97d1a1 100644
--- a/lib/xz/xz_dec_bcj.c
+++ b/lib/xz/xz_dec_bcj.c
@@ -422,7 +422,7 @@ XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s,
/*
* Flush pending already filtered data to the output buffer. Return
- * immediatelly if we couldn't flush everything, or if the next
+ * immediately if we couldn't flush everything, or if the next
* filter in the chain had already returned XZ_STREAM_END.
*/
if (s->temp.filtered > 0) {
diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c
index ca2603abee08..7a6781e3f47b 100644
--- a/lib/xz/xz_dec_lzma2.c
+++ b/lib/xz/xz_dec_lzma2.c
@@ -147,8 +147,8 @@ struct lzma_dec {
/*
* LZMA properties or related bit masks (number of literal
- * context bits, a mask dervied from the number of literal
- * position bits, and a mask dervied from the number
+ * context bits, a mask derived from the number of literal
+ * position bits, and a mask derived from the number
* position bits)
*/
uint32_t lc;
@@ -484,7 +484,7 @@ static __always_inline void rc_normalize(struct rc_dec *rc)
}
/*
- * Decode one bit. In some versions, this function has been splitted in three
+ * Decode one bit. In some versions, this function has been split in three
* functions so that the compiler is supposed to be able to more easily avoid
* an extra branch. In this particular version of the LZMA decoder, this
* doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3
@@ -761,7 +761,7 @@ static bool lzma_main(struct xz_dec_lzma2 *s)
}
/*
- * Reset the LZMA decoder and range decoder state. Dictionary is nore reset
+ * Reset the LZMA decoder and range decoder state. Dictionary is not reset
* here, because LZMA state may be reset without resetting the dictionary.
*/
static void lzma_reset(struct xz_dec_lzma2 *s)
diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index ed1f3df27260..f19c4fbe1be7 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -15,7 +15,7 @@ union uu {
unsigned char b[2];
};
-/* Endian independed version */
+/* Endian independent version */
static inline unsigned short
get_unaligned16(const unsigned short *p)
{
diff --git a/lib/zstd/huf.h b/lib/zstd/huf.h
index 2143da28d952..923218d12e28 100644
--- a/lib/zstd/huf.h
+++ b/lib/zstd/huf.h
@@ -134,7 +134,7 @@ typedef enum {
HUF_repeat_none, /**< Cannot use the previous table */
HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1,
4}X_repeat */
- HUF_repeat_valid /**< Can use the previous table and it is asumed to be valid */
+ HUF_repeat_valid /**< Can use the previous table and it is assumed to be valid */
} HUF_repeat;
/** HUF_compress4X_repeat() :
* Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
diff --git a/mm/Kconfig b/mm/Kconfig
index ded98fb859ab..a02498c0e13d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -96,6 +96,9 @@ config HAVE_FAST_GUP
depends on MMU
bool
+config HOLES_IN_ZONE
+ bool
+
# Don't discard allocated memory used to track "memory" and "reserved" memblocks
# after early boot, so it can still be used to test for validity of memory.
# Also, memblocks are updated with memory hot(un)plug.
@@ -671,6 +674,7 @@ config ZPOOL
config ZBUD
tristate "Low (Up to 2x) density storage for compressed pages"
+ depends on ZPOOL
help
A special purpose allocator for storing compressed pages.
It is designed to store up to two compressed pages per physical
@@ -757,6 +761,18 @@ config ARCH_HAS_CACHE_LINE_SIZE
config ARCH_HAS_PTE_DEVMAP
bool
+config ARCH_HAS_ZONE_DMA_SET
+ bool
+
+config ZONE_DMA
+ bool "Support DMA zone" if ARCH_HAS_ZONE_DMA_SET
+ default y if ARM64 || X86
+
+config ZONE_DMA32
+ bool "Support DMA32 zone" if ARCH_HAS_ZONE_DMA_SET
+ depends on !X86_32
+ default y if ARM64
+
config ZONE_DEVICE
bool "Device memory (pmem, HMM, etc...) hotplug support"
depends on MEMORY_HOTPLUG
diff --git a/mm/Makefile b/mm/Makefile
index bf71e295e9f6..74b47c354682 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
@@ -125,3 +126,4 @@ obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
obj-$(CONFIG_IO_MAPPING) += io-mapping.o
+obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
new file mode 100644
index 000000000000..5b152dba7344
--- /dev/null
+++ b/mm/bootmem_info.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Bootmem core functions.
+ *
+ * Copyright (c) 2020, Bytedance.
+ *
+ * Author: Muchun Song <songmuchun@bytedance.com>
+ *
+ */
+#include <linux/mm.h>
+#include <linux/compiler.h>
+#include <linux/memblock.h>
+#include <linux/bootmem_info.h>
+#include <linux/memory_hotplug.h>
+
+void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
+{
+ page->freelist = (void *)type;
+ SetPagePrivate(page);
+ set_page_private(page, info);
+ page_ref_inc(page);
+}
+
+void put_page_bootmem(struct page *page)
+{
+ unsigned long type;
+
+ type = (unsigned long) page->freelist;
+ BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+ type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
+
+ if (page_ref_dec_return(page) == 1) {
+ page->freelist = NULL;
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ INIT_LIST_HEAD(&page->lru);
+ free_reserved_page(page);
+ }
+}
+
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+ struct mem_section_usage *usage;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ /* Get section's memmap address */
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ /*
+ * Get page for the memmap's phys address
+ * XXX: need more consideration for sparse_vmemmap...
+ */
+ page = virt_to_page(memmap);
+ mapsize = sizeof(struct page) * PAGES_PER_SECTION;
+ mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
+
+ /* remember memmap's page */
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, SECTION_INFO);
+
+ usage = ms->usage;
+ page = virt_to_page(usage);
+
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+
+}
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+ struct mem_section_usage *usage;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+ usage = ms->usage;
+ page = virt_to_page(usage);
+
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+ unsigned long i, pfn, end_pfn, nr_pages;
+ int node = pgdat->node_id;
+ struct page *page;
+
+ nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
+ page = virt_to_page(pgdat);
+
+ for (i = 0; i < nr_pages; i++, page++)
+ get_page_bootmem(node, page, NODE_INFO);
+
+ pfn = pgdat->node_start_pfn;
+ end_pfn = pgdat_end_pfn(pgdat);
+
+ /* register section info */
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ /*
+ * Some platforms can assign the same pfn to multiple nodes - on
+ * node0 as well as nodeN. To avoid registering a pfn against
+ * multiple nodes we check that this pfn does not already
+ * reside in some other nodes.
+ */
+ if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
+ register_page_bootmem_info_section(pfn);
+ }
+}
diff --git a/mm/compaction.c b/mm/compaction.c
index 3a509fbf2bea..621508e0ecd5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1297,8 +1297,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
if (!list_is_last(freelist, &freepage->lru)) {
list_cut_before(&sublist, freelist, &freepage->lru);
- if (!list_empty(&sublist))
- list_splice_tail(&sublist, freelist);
+ list_splice_tail(&sublist, freelist);
}
}
@@ -1315,8 +1314,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
if (!list_is_first(freelist, &freepage->lru)) {
list_cut_position(&sublist, freelist, &freepage->lru);
- if (!list_empty(&sublist))
- list_splice_tail(&sublist, freelist);
+ list_splice_tail(&sublist, freelist);
}
}
@@ -1380,7 +1378,7 @@ static int next_search_order(struct compact_control *cc, int order)
static unsigned long
fast_isolate_freepages(struct compact_control *cc)
{
- unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
+ unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
unsigned int nr_scanned = 0;
unsigned long low_pfn, min_pfn, highest = 0;
unsigned long nr_isolated = 0;
@@ -1492,11 +1490,11 @@ fast_isolate_freepages(struct compact_control *cc)
spin_unlock_irqrestore(&cc->zone->lock, flags);
/*
- * Smaller scan on next order so the total scan ig related
+ * Smaller scan on next order so the total scan is related
* to freelist_scan_limit.
*/
if (order_scanned >= limit)
- limit = min(1U, limit >> 1);
+ limit = max(1U, limit >> 1);
}
if (!page) {
@@ -2722,9 +2720,9 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
-static ssize_t sysfs_compact_node(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t compact_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
int nid = dev->id;
@@ -2737,7 +2735,7 @@ static ssize_t sysfs_compact_node(struct device *dev,
return count;
}
-static DEVICE_ATTR(compact, 0200, NULL, sysfs_compact_node);
+static DEVICE_ATTR_WO(compact);
int compaction_register_node(struct node *node)
{
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 92bfc37300df..1c922691aa61 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -91,7 +91,7 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
unsigned long pfn, unsigned long vaddr,
pgprot_t prot)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte;
/*
* Architectures optimize set_pte_at by avoiding TLB flush.
@@ -248,29 +248,6 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
}
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
-{
- pmd_t pmd;
-
- if (!arch_vmap_pmd_supported(prot))
- return;
-
- pr_debug("Validating PMD huge\n");
- /*
- * X86 defined pmd_set_huge() verifies that the given
- * PMD is not a populated non-leaf entry.
- */
- WRITE_ONCE(*pmdp, __pmd(0));
- WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pmd_clear_huge(pmdp));
- pmd = READ_ONCE(*pmdp);
- WARN_ON(!pmd_none(pmd));
-}
-#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
-#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-
static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
pmd_t pmd;
@@ -395,30 +372,6 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
pud = pud_mkhuge(pud);
WARN_ON(!pud_leaf(pud));
}
-
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
-{
- pud_t pud;
-
- if (!arch_vmap_pud_supported(prot))
- return;
-
- pr_debug("Validating PUD huge\n");
- /*
- * X86 defined pud_set_huge() verifies that the given
- * PUD is not a populated non-leaf entry.
- */
- WRITE_ONCE(*pudp, __pud(0));
- WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pud_clear_huge(pudp));
- pud = READ_ONCE(*pudp);
- WARN_ON(!pud_none(pud));
-}
-#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
-#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -428,9 +381,6 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
{
}
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
-{
-}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pmd_basic_tests(unsigned long pfn, int idx) { }
@@ -449,14 +399,51 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
}
static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
{
+ pmd_t pmd;
+
+ if (!arch_vmap_pmd_supported(prot))
+ return;
+
+ pr_debug("Validating PMD huge\n");
+ /*
+ * X86 defined pmd_set_huge() verifies that the given
+ * PMD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pmdp, __pmd(0));
+ WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pmd_clear_huge(pmdp));
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
}
+
static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
{
+ pud_t pud;
+
+ if (!arch_vmap_pud_supported(prot))
+ return;
+
+ pr_debug("Validating PUD huge\n");
+ /*
+ * X86 defined pud_set_huge() verifies that the given
+ * PUD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pudp, __pud(0));
+ WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pud_clear_huge(pudp));
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
}
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -791,12 +778,12 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
-#else /* !CONFIG_ARCH_HAS_PTE_DEVMAP */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
{
}
-#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
{
@@ -856,17 +843,17 @@ static void __init swap_migration_tests(void)
* locked, otherwise it stumbles upon a BUG_ON().
*/
__SetPageLocked(page);
- swp = make_migration_entry(page, 1);
+ swp = make_writable_migration_entry(page_to_pfn(page));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(!is_write_migration_entry(swp));
+ WARN_ON(!is_writable_migration_entry(swp));
- make_migration_entry_read(&swp);
+ swp = make_readable_migration_entry(swp_offset(swp));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(is_write_migration_entry(swp));
+ WARN_ON(is_writable_migration_entry(swp));
- swp = make_migration_entry(page, 0);
+ swp = make_readable_migration_entry(page_to_pfn(page));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(is_write_migration_entry(swp));
+ WARN_ON(is_writable_migration_entry(swp));
__ClearPageLocked(page);
__free_page(page);
}
diff --git a/mm/gup.c b/mm/gup.c
index 8651309f8ec3..728d996767cb 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1501,6 +1501,64 @@ long populate_vma_page_range(struct vm_area_struct *vma,
}
/*
+ * faultin_vma_page_range() - populate (prefault) page tables inside the
+ * given VMA range readable/writable
+ *
+ * This takes care of mlocking the pages, too, if VM_LOCKED is set.
+ *
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @write: whether to prefault readable or writable
+ * @locked: whether the mmap_lock is still held
+ *
+ * Returns either number of processed pages in the vma, or a negative error
+ * code on error (see __get_user_pages()).
+ *
+ * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
+ * covered by the VMA.
+ *
+ * If @locked is NULL, it may be held for read or write and will be unperturbed.
+ *
+ * If @locked is non-NULL, it must held for read only and may be released. If
+ * it's released, *@locked will be set to 0.
+ */
+long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, bool write, int *locked)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ mmap_assert_locked(mm);
+
+ /*
+ * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
+ * the page dirty with FOLL_WRITE -- which doesn't make a
+ * difference with !FOLL_FORCE, because the page is writable
+ * in the page table.
+ * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
+ * a poisoned page.
+ * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT.
+ * !FOLL_FORCE: Require proper access permissions.
+ */
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON;
+ if (write)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * See check_vma_flags(): Will return -EFAULT on incompatible mappings
+ * or with insufficient permissions.
+ */
+ return __get_user_pages(mm, start, nr_pages, gup_flags,
+ NULL, NULL, locked);
+}
+
+/*
* __mm_populate - populate and/or mlock pages within a range of address space.
*
* This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
diff --git a/mm/hmm.c b/mm/hmm.c
index 943cb2ba4442..fad6be2bf072 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -26,6 +26,8 @@
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
+#include "internal.h"
+
struct hmm_vma_walk {
struct hmm_range *range;
unsigned long last;
@@ -214,7 +216,7 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range,
swp_entry_t entry)
{
return is_device_private_entry(entry) &&
- device_private_entry_to_page(entry)->pgmap->owner ==
+ pfn_swap_entry_to_page(entry)->pgmap->owner ==
range->dev_private_owner;
}
@@ -255,10 +257,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
*/
if (hmm_is_device_private_entry(range, entry)) {
cpu_flags = HMM_PFN_VALID;
- if (is_write_device_private_entry(entry))
+ if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = device_private_entry_to_pfn(entry) |
- cpu_flags;
+ *hmm_pfn = swp_offset(entry) | cpu_flags;
return 0;
}
@@ -272,6 +273,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
if (!non_swap_entry(entry))
goto fault;
+ if (is_device_exclusive_entry(entry))
+ goto fault;
+
if (is_migration_entry(entry)) {
pte_unmap(ptep);
hmm_vma_walk->last = addr;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6d2a0119fc58..8b731d53e9f4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -64,7 +64,14 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+ return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
+ !inode_is_open_for_write(vma->vm_file->f_inode) &&
+ (vma->vm_flags & VM_EXEC);
+}
+
+bool transparent_hugepage_active(struct vm_area_struct *vma)
{
/* The addr is used to check if the vma size fits */
unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
@@ -75,6 +82,8 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
return __transparent_hugepage_enabled(vma);
if (vma_is_shmem(vma))
return shmem_huge_enabled(vma);
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+ return file_thp_enabled(vma);
return false;
}
@@ -1017,7 +1026,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
- struct vm_area_struct *vma)
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
@@ -1026,7 +1035,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
int ret = -ENOMEM;
/* Skip if can be re-fill on fault */
- if (!vma_is_anonymous(vma))
+ if (!vma_is_anonymous(dst_vma))
return 0;
pgtable = pte_alloc_one(dst_mm);
@@ -1040,29 +1049,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- /*
- * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
- * does not have the VM_UFFD_WP, which means that the uffd
- * fork event is not enabled.
- */
- if (!(vma->vm_flags & VM_UFFD_WP))
- pmd = pmd_clear_uffd_wp(pmd);
-
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (is_write_migration_entry(entry)) {
- make_migration_entry_read(&entry);
+ if (is_writable_migration_entry(entry)) {
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
pmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*src_pmd))
pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0;
goto out_unlock;
@@ -1079,17 +1085,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* a page table.
*/
if (is_huge_zero_pmd(pmd)) {
- struct page *zero_page;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
- zero_page = mm_get_huge_zero_page(dst_mm);
- set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
- zero_page);
- ret = 0;
- goto out_unlock;
+ mm_get_huge_zero_page(dst_mm);
+ goto out_zero_page;
}
src_page = pmd_page(pmd);
@@ -1102,21 +1104,23 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* best effort that the pinned pages won't be replaced by another
* random page during the coming copy-on-write.
*/
- if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
+ if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
- __split_huge_pmd(vma, src_pmd, addr, false, NULL);
+ __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
return -EAGAIN;
}
get_page(src_page);
page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+out_zero_page:
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-
pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_clear_uffd_wp(pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
@@ -1254,11 +1258,12 @@ unlock:
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf)
{
pmd_t entry;
unsigned long haddr;
bool write = vmf->flags & FAULT_FLAG_WRITE;
+ pmd_t orig_pmd = vmf->orig_pmd;
vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1275,11 +1280,12 @@ unlock:
spin_unlock(vmf->ptl);
}
-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ pmd_t orig_pmd = vmf->orig_pmd;
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1415,96 +1421,25 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct anon_vma *anon_vma = NULL;
+ pmd_t oldpmd = vmf->orig_pmd;
+ pmd_t pmd;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
+ int page_nid = NUMA_NO_NODE;
int target_nid, last_cpupid = -1;
- bool page_locked;
bool migrated = false;
- bool was_writable;
+ bool was_writable = pmd_savedwrite(oldpmd);
int flags = 0;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(pmd, *vmf->pmd)))
- goto out_unlock;
-
- /*
- * If there are potential migrations, wait for completion and retry
- * without disrupting NUMA hinting information. Do not relock and
- * check_same as the page may no longer be mapped.
- */
- if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
- page = pmd_page(*vmf->pmd);
- if (!get_page_unless_zero(page))
- goto out_unlock;
+ if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
spin_unlock(vmf->ptl);
- put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
goto out;
}
- page = pmd_page(pmd);
- BUG_ON(is_huge_zero_page(page));
- page_nid = page_to_nid(page);
- last_cpupid = page_cpupid_last(page);
- count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == this_nid) {
- count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
- flags |= TNF_FAULT_LOCAL;
- }
-
- /* See similar comment in do_numa_page for explanation */
- if (!pmd_savedwrite(pmd))
- flags |= TNF_NO_GROUP;
-
- /*
- * Acquire the page lock to serialise THP migrations but avoid dropping
- * page_table_lock if at all possible
- */
- page_locked = trylock_page(page);
- target_nid = mpol_misplaced(page, vma, haddr);
- /* Migration could have started since the pmd_trans_migrating check */
- if (!page_locked) {
- page_nid = NUMA_NO_NODE;
- if (!get_page_unless_zero(page))
- goto out_unlock;
- spin_unlock(vmf->ptl);
- put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
- goto out;
- } else if (target_nid == NUMA_NO_NODE) {
- /* There are no parallel migrations and page is in the right
- * node. Clear the numa hinting info in this pmd.
- */
- goto clear_pmdnuma;
- }
-
- /*
- * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
- * to serialises splits
- */
- get_page(page);
- spin_unlock(vmf->ptl);
- anon_vma = page_lock_anon_vma_read(page);
-
- /* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(vmf->ptl);
- if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
- unlock_page(page);
- put_page(page);
- page_nid = NUMA_NO_NODE;
- goto out_unlock;
- }
-
- /* Bail if we fail to protect against THP splits for any reason */
- if (unlikely(!anon_vma)) {
- put_page(page);
- page_nid = NUMA_NO_NODE;
- goto clear_pmdnuma;
- }
-
/*
* Since we took the NUMA fault, we must have observed the !accessible
* bit. Make sure all other CPUs agree with that, to avoid them
@@ -1531,43 +1466,58 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
haddr + HPAGE_PMD_SIZE);
}
- /*
- * Migrate the THP to the requested node, returns with page unlocked
- * and access rights restored.
- */
+ pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+ page = vm_normal_page_pmd(vma, haddr, pmd);
+ if (!page)
+ goto out_map;
+
+ /* See similar comment in do_numa_page for explanation */
+ if (!was_writable)
+ flags |= TNF_NO_GROUP;
+
+ page_nid = page_to_nid(page);
+ last_cpupid = page_cpupid_last(page);
+ target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
+ &flags);
+
+ if (target_nid == NUMA_NO_NODE) {
+ put_page(page);
+ goto out_map;
+ }
+
spin_unlock(vmf->ptl);
- migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
- vmf->pmd, pmd, vmf->address, page, target_nid);
+ migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
- } else
+ } else {
flags |= TNF_MIGRATE_FAIL;
-
- goto out;
-clear_pmdnuma:
- BUG_ON(!PageLocked(page));
- was_writable = pmd_savedwrite(pmd);
- pmd = pmd_modify(pmd, vma->vm_page_prot);
- pmd = pmd_mkyoung(pmd);
- if (was_writable)
- pmd = pmd_mkwrite(pmd);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- unlock_page(page);
-out_unlock:
- spin_unlock(vmf->ptl);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
+ spin_unlock(vmf->ptl);
+ goto out;
+ }
+ goto out_map;
+ }
out:
- if (anon_vma)
- page_unlock_anon_vma_read(anon_vma);
-
if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
flags);
return 0;
+
+out_map:
+ /* Restore the PMD */
+ pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+ pmd = pmd_mkyoung(pmd);
+ if (was_writable)
+ pmd = pmd_mkwrite(pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+ spin_unlock(vmf->ptl);
+ goto out;
}
/*
@@ -1604,7 +1554,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If other processes are mapping this page, we couldn't discard
* the page unless they all do MADV_FREE so let's skip the page.
*/
- if (page_mapcount(page) != 1)
+ if (total_mapcount(page) != 1)
goto out;
if (!trylock_page(page))
@@ -1677,12 +1627,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
- if (is_huge_zero_pmd(orig_pmd))
- tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else if (is_huge_zero_pmd(orig_pmd)) {
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
- tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else {
struct page *page = NULL;
int flush_needed = 1;
@@ -1697,7 +1644,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1796,6 +1743,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
* Returns
* - 0 if PMD could not be locked
* - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ * or if prot_numa but THP migration is not supported
* - HPAGE_PMD_NR if protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1810,6 +1758,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ if (prot_numa && !thp_migration_supported())
+ return 1;
+
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
@@ -1822,16 +1773,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry = pmd_to_swp_entry(*pmd);
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
- if (is_write_migration_entry(entry)) {
+ if (is_writable_migration_entry(entry)) {
pmd_t newpmd;
/*
* A protection check is difficult so
* just be safe and disable write
*/
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
+ if (pmd_swp_uffd_wp(*pmd))
+ newpmd = pmd_swp_mkuffd_wp(newpmd);
set_pmd_at(mm, addr, pmd, newpmd);
}
goto unlock;
@@ -2060,7 +2014,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
} else {
page = pmd_page(old_pmd);
if (!PageDirty(page) && pmd_dirty(old_pmd))
@@ -2114,8 +2068,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = migration_entry_to_page(entry);
- write = is_write_migration_entry(entry);
+ page = pfn_swap_entry_to_page(entry);
+ write = is_writable_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
@@ -2147,7 +2101,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (freeze || pmd_migration) {
swp_entry_t swp_entry;
- swp_entry = make_migration_entry(page + i, write);
+ if (write)
+ swp_entry = make_writable_migration_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_migration_entry(
+ page_to_pfn(page + i));
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
@@ -2350,15 +2309,20 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void unmap_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
- TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
+ enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
+ TTU_SYNC;
VM_BUG_ON_PAGE(!PageHead(page), page);
+ /*
+ * Anon pages need migration entries to preserve them, but file
+ * pages can simply be left unmapped, then faulted back on demand.
+ * If that is ever changed (perhaps for mlock), update remap_page().
+ */
if (PageAnon(page))
- ttu_flags |= TTU_SPLIT_FREEZE;
-
- try_to_unmap(page, ttu_flags);
+ try_to_migrate(page, ttu_flags);
+ else
+ try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK);
VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
@@ -2366,6 +2330,10 @@ static void unmap_page(struct page *page)
static void remap_page(struct page *page, unsigned int nr)
{
int i;
+
+ /* If TTU_SPLIT_FREEZE is ever extended to file, remove this check */
+ if (!PageAnon(page))
+ return;
if (PageTransHuge(page)) {
remove_migration_ptes(page, page, true);
} else {
@@ -2870,7 +2838,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
list_for_each_safe(pos, next, &ds_queue->split_queue) {
- page = list_entry((void *)pos, struct page, mapping);
+ page = list_entry((void *)pos, struct page, deferred_list);
page = compound_head(page);
if (get_page_unless_zero(page)) {
list_move(page_deferred_list(page), &list);
@@ -2885,7 +2853,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
- page = list_entry((void *)pos, struct page, mapping);
+ page = list_entry((void *)pos, struct page, deferred_list);
if (!trylock_page(page))
goto next;
/* split_huge_page() removes page from list on success */
@@ -3144,7 +3112,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
tok = strsep(&buf, ",");
if (tok) {
- strncpy(file_path, tok, MAX_INPUT_BUF_SZ);
+ strcpy(file_path, tok);
} else {
ret = -EINVAL;
goto out;
@@ -3214,7 +3182,10 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
if (pmd_dirty(pmdval))
set_page_dirty(page);
- entry = make_migration_entry(page, pmd_write(pmdval));
+ if (pmd_write(pmdval))
+ entry = make_writable_migration_entry(page_to_pfn(page));
+ else
+ entry = make_readable_migration_entry(page_to_pfn(page));
pmdswp = swp_entry_to_pmd(entry);
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
@@ -3240,8 +3211,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
- if (is_write_migration_entry(entry))
+ if (is_writable_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
+ if (pmd_swp_uffd_wp(*pvmw->pmd))
+ pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
if (PageAnon(new))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 103f1187043f..924553aa8f78 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,6 +30,7 @@
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
+#include <linux/migrate.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -41,6 +42,7 @@
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
+#include "hugetlb_vmemmap.h"
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
@@ -1318,8 +1320,6 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned int order);
#else /* !CONFIG_CONTIG_ALLOC */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
@@ -1375,7 +1375,40 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
h->nr_huge_pages_node[nid]--;
}
-static void update_and_free_page(struct hstate *h, struct page *page)
+static void add_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ int zeroed;
+ int nid = page_to_nid(page);
+
+ VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
+
+ lockdep_assert_held(&hugetlb_lock);
+
+ INIT_LIST_HEAD(&page->lru);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+
+ if (adjust_surplus) {
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[nid]++;
+ }
+
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ set_page_private(page, 0);
+ SetHPageVmemmapOptimized(page);
+
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the last reference.
+ */
+ zeroed = put_page_testzero(page);
+ VM_BUG_ON_PAGE(!zeroed, page);
+ arch_clear_hugepage_flags(page);
+ enqueue_huge_page(h, page);
+}
+
+static void __update_and_free_page(struct hstate *h, struct page *page)
{
int i;
struct page *subpage = page;
@@ -1383,6 +1416,18 @@ static void update_and_free_page(struct hstate *h, struct page *page)
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
+ if (alloc_huge_page_vmemmap(h, page)) {
+ spin_lock_irq(&hugetlb_lock);
+ /*
+ * If we cannot allocate vmemmap pages, just refuse to free the
+ * page and put the page back on the hugetlb free list and treat
+ * as a surplus page.
+ */
+ add_hugetlb_page(h, page, true);
+ spin_unlock_irq(&hugetlb_lock);
+ return;
+ }
+
for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1398,12 +1443,79 @@ static void update_and_free_page(struct hstate *h, struct page *page)
}
}
+/*
+ * As update_and_free_page() can be called under any context, so we cannot
+ * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
+ * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
+ * the vmemmap pages.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to be
+ * freed and frees them one-by-one. As the page->mapping pointer is going
+ * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
+ * structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+ struct llist_node *node;
+
+ node = llist_del_all(&hpage_freelist);
+
+ while (node) {
+ struct page *page;
+ struct hstate *h;
+
+ page = container_of((struct address_space **)node,
+ struct page, mapping);
+ node = node->next;
+ page->mapping = NULL;
+ /*
+ * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
+ * is going to trigger because a previous call to
+ * remove_hugetlb_page() will set_compound_page_dtor(page,
+ * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
+ */
+ h = size_to_hstate(page_size(page));
+
+ __update_and_free_page(h, page);
+
+ cond_resched();
+ }
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+static inline void flush_free_hpage_work(struct hstate *h)
+{
+ if (free_vmemmap_pages_per_hpage(h))
+ flush_work(&free_hpage_work);
+}
+
+static void update_and_free_page(struct hstate *h, struct page *page,
+ bool atomic)
+{
+ if (!HPageVmemmapOptimized(page) || !atomic) {
+ __update_and_free_page(h, page);
+ return;
+ }
+
+ /*
+ * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
+ *
+ * Only call schedule_work() if hpage_freelist is previously
+ * empty. Otherwise, schedule_work() had been called but the workfn
+ * hasn't retrieved the list yet.
+ */
+ if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
+ schedule_work(&free_hpage_work);
+}
+
static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
{
struct page *page, *t_page;
list_for_each_entry_safe(page, t_page, list, lru) {
- update_and_free_page(h, page);
+ update_and_free_page(h, page, false);
cond_resched();
}
}
@@ -1470,12 +1582,12 @@ void free_huge_page(struct page *page)
if (HPageTemporary(page)) {
remove_hugetlb_page(h, page, false);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page);
+ update_and_free_page(h, page, true);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
remove_hugetlb_page(h, page, true);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page);
+ update_and_free_page(h, page, true);
} else {
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
@@ -1493,8 +1605,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
h->nr_huge_pages_node[nid]++;
}
-static void __prep_new_huge_page(struct page *page)
+static void __prep_new_huge_page(struct hstate *h, struct page *page)
{
+ free_huge_page_vmemmap(h, page);
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
@@ -1504,15 +1617,15 @@ static void __prep_new_huge_page(struct page *page)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
- __prep_new_huge_page(page);
+ __prep_new_huge_page(h, page);
spin_lock_irq(&hugetlb_lock);
__prep_account_new_huge_page(h, nid);
spin_unlock_irq(&hugetlb_lock);
}
-static void prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
{
- int i;
+ int i, j;
int nr_pages = 1 << order;
struct page *p = page + 1;
@@ -1534,11 +1647,48 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
* after get_user_pages().
*/
__ClearPageReserved(p);
+ /*
+ * Subtle and very unlikely
+ *
+ * Gigantic 'page allocators' such as memblock or cma will
+ * return a set of pages with each page ref counted. We need
+ * to turn this set of pages into a compound page with tail
+ * page ref counts set to zero. Code such as speculative page
+ * cache adding could take a ref on a 'to be' tail page.
+ * We need to respect any increased ref count, and only set
+ * the ref count to zero if count is currently 1. If count
+ * is not 1, we call synchronize_rcu in the hope that a rcu
+ * grace period will cause ref count to drop and then retry.
+ * If count is still inflated on retry we return an error and
+ * must discard the pages.
+ */
+ if (!page_ref_freeze(p, 1)) {
+ pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
+ synchronize_rcu();
+ if (!page_ref_freeze(p, 1))
+ goto out_error;
+ }
set_page_count(p, 0);
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
atomic_set(compound_pincount_ptr(page), 0);
+ return true;
+
+out_error:
+ /* undo tail page modifications made above */
+ p = page + 1;
+ for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
+ clear_compound_head(p);
+ set_page_refcounted(p);
+ }
+ /* need to clear PG_reserved on remaining tail pages */
+ for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
+ __ClearPageReserved(p);
+ set_compound_order(page, 0);
+ page[1].compound_nr = 0;
+ __ClearPageHead(page);
+ return false;
}
/*
@@ -1658,7 +1808,9 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
nodemask_t *node_alloc_noretry)
{
struct page *page;
+ bool retry = false;
+retry:
if (hstate_is_gigantic(h))
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
@@ -1667,8 +1819,21 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
if (!page)
return NULL;
- if (hstate_is_gigantic(h))
- prep_compound_gigantic_page(page, huge_page_order(h));
+ if (hstate_is_gigantic(h)) {
+ if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
+ /*
+ * Rare failure to convert pages to compound page.
+ * Free pages and try again - ONCE!
+ */
+ free_gigantic_page(page, huge_page_order(h));
+ if (!retry) {
+ retry = true;
+ goto retry;
+ }
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ return NULL;
+ }
+ }
prep_new_huge_page(h, page, page_to_nid(page));
return page;
@@ -1737,10 +1902,14 @@ static struct page *remove_pool_huge_page(struct hstate *h,
* nothing for in-use hugepages and non-hugepages.
* This function returns values like below:
*
- * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
- * (allocated or reserved.)
- * 0: successfully dissolved free hugepages or the page is not a
- * hugepage (considered as already dissolved)
+ * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
+ * when the system is under memory pressure and the feature of
+ * freeing unused vmemmap pages associated with each hugetlb page
+ * is enabled.
+ * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
+ * (allocated or reserved.)
+ * 0: successfully dissolved free hugepages or the page is not a
+ * hugepage (considered as already dissolved)
*/
int dissolve_free_huge_page(struct page *page)
{
@@ -1782,19 +1951,38 @@ retry:
goto retry;
}
- /*
- * Move PageHWPoison flag from head page to the raw error page,
- * which makes any subpages rather than the error page reusable.
- */
- if (PageHWPoison(head) && page != head) {
- SetPageHWPoison(page);
- ClearPageHWPoison(head);
- }
remove_hugetlb_page(h, head, false);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, head);
- return 0;
+
+ /*
+ * Normally update_and_free_page will allocate required vmemmmap
+ * before freeing the page. update_and_free_page will fail to
+ * free the page if it can not allocate required vmemmap. We
+ * need to adjust max_huge_pages if the page is not freed.
+ * Attempt to allocate vmemmmap here so that we can take
+ * appropriate action on failure.
+ */
+ rc = alloc_huge_page_vmemmap(h, head);
+ if (!rc) {
+ /*
+ * Move PageHWPoison flag from head page to the raw
+ * error page, which makes any subpages rather than
+ * the error page reusable.
+ */
+ if (PageHWPoison(head) && page != head) {
+ SetPageHWPoison(page);
+ ClearPageHWPoison(head);
+ }
+ update_and_free_page(h, head, false);
+ } else {
+ spin_lock_irq(&hugetlb_lock);
+ add_hugetlb_page(h, head, false);
+ h->max_huge_pages++;
+ spin_unlock_irq(&hugetlb_lock);
+ }
+
+ return rc;
}
out:
spin_unlock_irq(&hugetlb_lock);
@@ -2351,14 +2539,15 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
/*
* Before dissolving the page, we need to allocate a new one for the
- * pool to remain stable. Using alloc_buddy_huge_page() allows us to
- * not having to deal with prep_new_huge_page() and avoids dealing of any
- * counters. This simplifies and let us do the whole thing under the
- * lock.
+ * pool to remain stable. Here, we allocate the page and 'prep' it
+ * by doing everything but actually updating counters and adding to
+ * the pool. This simplifies and let us do most of the processing
+ * under the lock.
*/
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
if (!new_page)
return -ENOMEM;
+ __prep_new_huge_page(h, new_page);
retry:
spin_lock_irq(&hugetlb_lock);
@@ -2397,14 +2586,9 @@ retry:
remove_hugetlb_page(h, old_page, false);
/*
- * new_page needs to be initialized with the standard hugetlb
- * state. This is normally done by prep_new_huge_page() but
- * that takes hugetlb_lock which is already held so we need to
- * open code it here.
* Reference count trick is needed because allocator gives us
* referenced page but the pool requires pages with 0 refcount.
*/
- __prep_new_huge_page(new_page);
__prep_account_new_huge_page(h, nid);
page_ref_dec(new_page);
enqueue_huge_page(h, new_page);
@@ -2413,14 +2597,14 @@ retry:
* Pages have been replaced, we can safely free the old one.
*/
spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, old_page);
+ update_and_free_page(h, old_page, false);
}
return ret;
free_new:
spin_unlock_irq(&hugetlb_lock);
- __free_pages(new_page, huge_page_order(h));
+ update_and_free_page(h, new_page, false);
return ret;
}
@@ -2625,16 +2809,10 @@ found:
return 1;
}
-static void __init prep_compound_huge_page(struct page *page,
- unsigned int order)
-{
- if (unlikely(order > (MAX_ORDER - 1)))
- prep_compound_gigantic_page(page, order);
- else
- prep_compound_page(page, order);
-}
-
-/* Put bootmem huge pages into the standard lists after mem_map is up */
+/*
+ * Put bootmem huge pages into the standard lists after mem_map is up.
+ * Note: This only applies to gigantic (order > MAX_ORDER) pages.
+ */
static void __init gather_bootmem_prealloc(void)
{
struct huge_bootmem_page *m;
@@ -2643,20 +2821,23 @@ static void __init gather_bootmem_prealloc(void)
struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
+ VM_BUG_ON(!hstate_is_gigantic(h));
WARN_ON(page_count(page) != 1);
- prep_compound_huge_page(page, huge_page_order(h));
- WARN_ON(PageReserved(page));
- prep_new_huge_page(h, page, page_to_nid(page));
- put_page(page); /* free it into the hugepage allocator */
+ if (prep_compound_gigantic_page(page, huge_page_order(h))) {
+ WARN_ON(PageReserved(page));
+ prep_new_huge_page(h, page, page_to_nid(page));
+ put_page(page); /* add to the hugepage allocator */
+ } else {
+ free_gigantic_page(page, huge_page_order(h));
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ }
/*
- * If we had gigantic hugepages allocated at boot time, we need
- * to restore the 'stolen' pages to totalram_pages in order to
- * fix confusing memory reports from free(1) and another
- * side-effects, like CommitLimit going negative.
+ * We need to restore the 'stolen' pages to totalram_pages
+ * in order to fix confusing memory reports from free(1) and
+ * other side-effects, like CommitLimit going negative.
*/
- if (hstate_is_gigantic(h))
- adjust_managed_page_count(page, pages_per_huge_page(h));
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
}
@@ -2834,6 +3015,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* pages in hstate via the proc/sysfs interfaces.
*/
mutex_lock(&h->resize_lock);
+ flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
/*
@@ -2943,6 +3125,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
/* free the pages after dropping lock */
spin_unlock_irq(&hugetlb_lock);
update_and_free_pages_bulk(h, &page_list);
+ flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
while (count < persistent_huge_pages(h)) {
@@ -3450,6 +3633,7 @@ void __init hugetlb_add_hstate(unsigned int order)
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
+ hugetlb_vmemmap_init(h);
parsed_hstate = h;
}
@@ -3924,6 +4108,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
int writable)
{
pte_t entry;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
if (writable) {
entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
@@ -3934,7 +4119,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
- entry = arch_make_huge_pte(entry, vma, page, writable);
+ entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
return entry;
}
@@ -4057,12 +4242,13 @@ again:
is_hugetlb_entry_hwpoisoned(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
- if (is_write_migration_entry(swp_entry) && cow) {
+ if (is_writable_migration_entry(swp_entry) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
- make_migration_entry_read(&swp_entry);
+ swp_entry = make_readable_migration_entry(
+ swp_offset(swp_entry));
entry = swp_entry_to_pte(swp_entry);
set_huge_swap_pte_at(src, addr, src_pte,
entry, sz);
@@ -4939,20 +5125,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
- struct address_space *mapping;
- pgoff_t idx;
+ struct hstate *h = hstate_vma(dst_vma);
+ struct address_space *mapping = dst_vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
- struct hstate *h = hstate_vma(dst_vma);
pte_t _dst_pte;
spinlock_t *ptl;
- int ret;
+ int ret = -ENOMEM;
struct page *page;
int writable;
- mapping = dst_vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, idx);
@@ -4981,12 +5164,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
+ /* Free the allocated page which may have
+ * consumed a reservation.
+ */
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ put_page(page);
+
+ /* Allocate a temporary page to hold the copied
+ * contents.
+ */
+ page = alloc_huge_page_vma(h, dst_vma, dst_addr);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
*pagep = page;
- /* don't free the page */
+ /* Set the outparam pagep and return to the caller to
+ * copy the contents outside the lock. Don't free the
+ * page.
+ */
goto out;
}
} else {
- page = *pagep;
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ put_page(*pagep);
+ ret = -EEXIST;
+ *pagep = NULL;
+ goto out;
+ }
+
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
+ *pagep = NULL;
+ goto out;
+ }
+ copy_huge_page(page, *pagep);
+ put_page(*pagep);
*pagep = NULL;
}
@@ -5318,10 +5533,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
- if (is_write_migration_entry(entry)) {
+ if (is_writable_migration_entry(entry)) {
pte_t newpte;
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
set_huge_swap_pte_at(mm, address, ptep,
newpte, huge_page_size(h));
@@ -5332,10 +5548,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
if (!huge_pte_none(pte)) {
pte_t old_pte;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
- pte = arch_make_huge_pte(pte, vma, NULL, 0);
+ pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
new file mode 100644
index 000000000000..c540c21e26f5
--- /dev/null
+++ b/mm/hugetlb_vmemmap.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Free some vmemmap pages of HugeTLB
+ *
+ * Copyright (c) 2020, Bytedance. All rights reserved.
+ *
+ * Author: Muchun Song <songmuchun@bytedance.com>
+ *
+ * The struct page structures (page structs) are used to describe a physical
+ * page frame. By default, there is a one-to-one mapping from a page frame to
+ * it's corresponding page struct.
+ *
+ * HugeTLB pages consist of multiple base page size pages and is supported by
+ * many architectures. See hugetlbpage.rst in the Documentation directory for
+ * more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB
+ * are currently supported. Since the base page size on x86 is 4KB, a 2MB
+ * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
+ * 4096 base pages. For each base page, there is a corresponding page struct.
+ *
+ * Within the HugeTLB subsystem, only the first 4 page structs are used to
+ * contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
+ * this upper limit. The only 'useful' information in the remaining page structs
+ * is the compound_head field, and this field is the same for all tail pages.
+ *
+ * By removing redundant page structs for HugeTLB pages, memory can be returned
+ * to the buddy allocator for other uses.
+ *
+ * Different architectures support different HugeTLB pages. For example, the
+ * following table is the HugeTLB page size supported by x86 and arm64
+ * architectures. Because arm64 supports 4k, 16k, and 64k base pages and
+ * supports contiguous entries, so it supports many kinds of sizes of HugeTLB
+ * page.
+ *
+ * +--------------+-----------+-----------------------------------------------+
+ * | Architecture | Page Size | HugeTLB Page Size |
+ * +--------------+-----------+-----------+-----------+-----------+-----------+
+ * | x86-64 | 4KB | 2MB | 1GB | | |
+ * +--------------+-----------+-----------+-----------+-----------+-----------+
+ * | | 4KB | 64KB | 2MB | 32MB | 1GB |
+ * | +-----------+-----------+-----------+-----------+-----------+
+ * | arm64 | 16KB | 2MB | 32MB | 1GB | |
+ * | +-----------+-----------+-----------+-----------+-----------+
+ * | | 64KB | 2MB | 512MB | 16GB | |
+ * +--------------+-----------+-----------+-----------+-----------+-----------+
+ *
+ * When the system boot up, every HugeTLB page has more than one struct page
+ * structs which size is (unit: pages):
+ *
+ * struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+ *
+ * Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
+ * of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
+ * relationship.
+ *
+ * HugeTLB_Size = n * PAGE_SIZE
+ *
+ * Then,
+ *
+ * struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+ * = n * sizeof(struct page) / PAGE_SIZE
+ *
+ * We can use huge mapping at the pud/pmd level for the HugeTLB page.
+ *
+ * For the HugeTLB page of the pmd level mapping, then
+ *
+ * struct_size = n * sizeof(struct page) / PAGE_SIZE
+ * = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
+ * = sizeof(struct page) / sizeof(pte_t)
+ * = 64 / 8
+ * = 8 (pages)
+ *
+ * Where n is how many pte entries which one page can contains. So the value of
+ * n is (PAGE_SIZE / sizeof(pte_t)).
+ *
+ * This optimization only supports 64-bit system, so the value of sizeof(pte_t)
+ * is 8. And this optimization also applicable only when the size of struct page
+ * is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
+ * x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
+ * size of struct page structs of it is 8 page frames which size depends on the
+ * size of the base page.
+ *
+ * For the HugeTLB page of the pud level mapping, then
+ *
+ * struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
+ * = PAGE_SIZE / 8 * 8 (pages)
+ * = PAGE_SIZE (pages)
+ *
+ * Where the struct_size(pmd) is the size of the struct page structs of a
+ * HugeTLB page of the pmd level mapping.
+ *
+ * E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB
+ * HugeTLB page consists in 4096.
+ *
+ * Next, we take the pmd level mapping of the HugeTLB page as an example to
+ * show the internal implementation of this optimization. There are 8 pages
+ * struct page structs associated with a HugeTLB page which is pmd mapped.
+ *
+ * Here is how things look before optimization.
+ *
+ * HugeTLB struct pages(8 pages) page frame(8 pages)
+ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
+ * | | | 0 | -------------> | 0 |
+ * | | +-----------+ +-----------+
+ * | | | 1 | -------------> | 1 |
+ * | | +-----------+ +-----------+
+ * | | | 2 | -------------> | 2 |
+ * | | +-----------+ +-----------+
+ * | | | 3 | -------------> | 3 |
+ * | | +-----------+ +-----------+
+ * | | | 4 | -------------> | 4 |
+ * | PMD | +-----------+ +-----------+
+ * | level | | 5 | -------------> | 5 |
+ * | mapping | +-----------+ +-----------+
+ * | | | 6 | -------------> | 6 |
+ * | | +-----------+ +-----------+
+ * | | | 7 | -------------> | 7 |
+ * | | +-----------+ +-----------+
+ * | |
+ * | |
+ * | |
+ * +-----------+
+ *
+ * The value of page->compound_head is the same for all tail pages. The first
+ * page of page structs (page 0) associated with the HugeTLB page contains the 4
+ * page structs necessary to describe the HugeTLB. The only use of the remaining
+ * pages of page structs (page 1 to page 7) is to point to page->compound_head.
+ * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs
+ * will be used for each HugeTLB page. This will allow us to free the remaining
+ * 6 pages to the buddy allocator.
+ *
+ * Here is how things look after remapping.
+ *
+ * HugeTLB struct pages(8 pages) page frame(8 pages)
+ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
+ * | | | 0 | -------------> | 0 |
+ * | | +-----------+ +-----------+
+ * | | | 1 | -------------> | 1 |
+ * | | +-----------+ +-----------+
+ * | | | 2 | ----------------^ ^ ^ ^ ^ ^
+ * | | +-----------+ | | | | |
+ * | | | 3 | ------------------+ | | | |
+ * | | +-----------+ | | | |
+ * | | | 4 | --------------------+ | | |
+ * | PMD | +-----------+ | | |
+ * | level | | 5 | ----------------------+ | |
+ * | mapping | +-----------+ | |
+ * | | | 6 | ------------------------+ |
+ * | | +-----------+ |
+ * | | | 7 | --------------------------+
+ * | | +-----------+
+ * | |
+ * | |
+ * | |
+ * +-----------+
+ *
+ * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for
+ * vmemmap pages and restore the previous mapping relationship.
+ *
+ * For the HugeTLB page of the pud level mapping. It is similar to the former.
+ * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages.
+ *
+ * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
+ * (e.g. aarch64) provides a contiguous bit in the translation table entries
+ * that hints to the MMU to indicate that it is one of a contiguous set of
+ * entries that can be cached in a single TLB entry.
+ *
+ * The contiguous bit is used to increase the mapping size at the pmd and pte
+ * (last) level. So this type of HugeTLB page can be optimized only when its
+ * size of the struct page structs is greater than 2 pages.
+ */
+#define pr_fmt(fmt) "HugeTLB: " fmt
+
+#include "hugetlb_vmemmap.h"
+
+/*
+ * There are a lot of struct page structures associated with each HugeTLB page.
+ * For tail pages, the value of compound_head is the same. So we can reuse first
+ * page of tail page structures. We map the virtual addresses of the remaining
+ * pages of tail page structures to the first tail page struct, and then free
+ * these page frames. Therefore, we need to reserve two pages as vmemmap areas.
+ */
+#define RESERVE_VMEMMAP_NR 2U
+#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
+
+bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
+
+static int __init early_hugetlb_free_vmemmap_param(char *buf)
+{
+ /* We cannot optimize if a "struct page" crosses page boundaries. */
+ if ((!is_power_of_2(sizeof(struct page)))) {
+ pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
+ return 0;
+ }
+
+ if (!buf)
+ return -EINVAL;
+
+ if (!strcmp(buf, "on"))
+ hugetlb_free_vmemmap_enabled = true;
+ else if (!strcmp(buf, "off"))
+ hugetlb_free_vmemmap_enabled = false;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param);
+
+static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
+{
+ return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
+}
+
+/*
+ * Previously discarded vmemmap pages will be allocated and remapping
+ * after this function returns zero.
+ */
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+ int ret;
+ unsigned long vmemmap_addr = (unsigned long)head;
+ unsigned long vmemmap_end, vmemmap_reuse;
+
+ if (!HPageVmemmapOptimized(head))
+ return 0;
+
+ vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+ vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+ vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+ /*
+ * The pages which the vmemmap virtual address range [@vmemmap_addr,
+ * @vmemmap_end) are mapped to are freed to the buddy allocator, and
+ * the range is mapped to the page which @vmemmap_reuse is mapped to.
+ * When a HugeTLB page is freed to the buddy allocator, previously
+ * discarded vmemmap pages must be allocated and remapping.
+ */
+ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
+ GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
+
+ if (!ret)
+ ClearHPageVmemmapOptimized(head);
+
+ return ret;
+}
+
+void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+ unsigned long vmemmap_addr = (unsigned long)head;
+ unsigned long vmemmap_end, vmemmap_reuse;
+
+ if (!free_vmemmap_pages_per_hpage(h))
+ return;
+
+ vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+ vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+ vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+
+ /*
+ * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
+ * to the page which @vmemmap_reuse is mapped to, then free the pages
+ * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
+ */
+ if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+ SetHPageVmemmapOptimized(head);
+}
+
+void __init hugetlb_vmemmap_init(struct hstate *h)
+{
+ unsigned int nr_pages = pages_per_huge_page(h);
+ unsigned int vmemmap_pages;
+
+ /*
+ * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
+ * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP,
+ * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page.
+ */
+ BUILD_BUG_ON(__NR_USED_SUBPAGE >=
+ RESERVE_VMEMMAP_SIZE / sizeof(struct page));
+
+ if (!hugetlb_free_vmemmap_enabled)
+ return;
+
+ vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
+ /*
+ * The head page and the first tail page are not to be freed to buddy
+ * allocator, the other pages will map to the first tail page, so they
+ * can be freed.
+ *
+ * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
+ * on some architectures (e.g. aarch64). See Documentation/arm64/
+ * hugetlbpage.rst for more details.
+ */
+ if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
+ h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
+
+ pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages,
+ h->name);
+}
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
new file mode 100644
index 000000000000..cb2bef8f9e73
--- /dev/null
+++ b/mm/hugetlb_vmemmap.h
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Free some vmemmap pages of HugeTLB
+ *
+ * Copyright (c) 2020, Bytedance. All rights reserved.
+ *
+ * Author: Muchun Song <songmuchun@bytedance.com>
+ */
+#ifndef _LINUX_HUGETLB_VMEMMAP_H
+#define _LINUX_HUGETLB_VMEMMAP_H
+#include <linux/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
+void free_huge_page_vmemmap(struct hstate *h, struct page *head);
+void hugetlb_vmemmap_init(struct hstate *h);
+
+/*
+ * How many vmemmap pages associated with a HugeTLB page that can be freed
+ * to the buddy allocator.
+ */
+static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+{
+ return h->nr_free_vmemmap_pages;
+}
+#else
+static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+ return 0;
+}
+
+static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
+
+static inline void hugetlb_vmemmap_init(struct hstate *h)
+{
+}
+
+static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+{
+ return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+#endif /* _LINUX_HUGETLB_VMEMMAP_H */
diff --git a/mm/internal.h b/mm/internal.h
index 6ec2cea9926b..2d7c9a2e0118 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -274,11 +274,10 @@ isolate_freepages_range(struct compact_control *cc,
int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
+#endif
int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal);
-#endif
-
/*
* This function returns the order of a free page in the buddy system. In
* general, page_zone(page)->lock must be held by the caller to prevent the
@@ -344,7 +343,10 @@ void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
#ifdef CONFIG_MMU
extern long populate_vma_page_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking);
+ unsigned long start, unsigned long end, int *locked);
+extern long faultin_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ bool write, int *locked);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
@@ -369,23 +371,6 @@ extern unsigned int munlock_vma_page(struct page *page);
*/
extern void clear_page_mlock(struct page *page);
-/*
- * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
- * (because that does not go through the full procedure of migration ptes):
- * to migrate the Mlocked page flag; update statistics.
- */
-static inline void mlock_migrate_page(struct page *newpage, struct page *page)
-{
- if (TestClearPageMlocked(page)) {
- int nr_pages = thp_nr_pages(page);
-
- /* Holding pmd lock, no change in irq context: __mod is safe */
- __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- SetPageMlocked(newpage);
- __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
- }
-}
-
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
@@ -461,7 +446,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
-static inline void mlock_migrate_page(struct page *new, struct page *old) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
@@ -672,4 +656,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
void vunmap_range_noflush(unsigned long start, unsigned long end);
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int page_nid, int *flags);
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 4d21ac44d5d3..d7666ace9d2e 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -636,7 +636,7 @@ static void toggle_allocation_gate(struct work_struct *work)
/* Disable static key and reset timer. */
static_branch_disable(&kfence_allocation_key);
#endif
- queue_delayed_work(system_power_efficient_wq, &kfence_timer,
+ queue_delayed_work(system_unbound_wq, &kfence_timer,
msecs_to_jiffies(kfence_sample_interval));
}
static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
@@ -666,7 +666,7 @@ void __init kfence_init(void)
}
WRITE_ONCE(kfence_enabled, true);
- queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0);
+ queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
(void *)(__kfence_pool + KFENCE_POOL_SIZE));
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6c0185fdd815..b0412be08fa2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -442,9 +442,7 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
static bool hugepage_vma_check(struct vm_area_struct *vma,
unsigned long vm_flags)
{
- /* Explicitly disabled through madvise. */
- if ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ if (!transhuge_vma_enabled(vma, vm_flags))
return false;
/* Enabled via shmem mount options or sysfs settings. */
@@ -459,7 +457,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
/* Read-only file mappings need to be aligned for THP to work. */
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
- (vm_flags & VM_DENYWRITE)) {
+ !inode_is_open_for_write(vma->vm_file->f_inode) &&
+ (vm_flags & VM_EXEC)) {
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
HPAGE_PMD_NR);
}
@@ -1864,6 +1863,19 @@ out_unlock:
else {
__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
filemap_nr_thps_inc(mapping);
+ /*
+ * Paired with smp_mb() in do_dentry_open() to ensure
+ * i_writecount is up to date and the update to nr_thps is
+ * visible. Ensures the page cache will be truncated if the
+ * file is opened writable.
+ */
+ smp_mb();
+ if (inode_is_open_for_write(mapping->host)) {
+ result = SCAN_FAIL;
+ __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
+ filemap_nr_thps_dec(mapping);
+ goto xa_locked;
+ }
}
if (nr_none) {
diff --git a/mm/madvise.c b/mm/madvise.c
index 63e489e5bfdb..6d3d348b17f4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -53,6 +53,8 @@ static int madvise_need_mmap_write(int behavior)
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_FREE:
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -822,6 +824,61 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
return -EINVAL;
}
+static long madvise_populate(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ int behavior)
+{
+ const bool write = behavior == MADV_POPULATE_WRITE;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long tmp_end;
+ int locked = 1;
+ long pages;
+
+ *prev = vma;
+
+ while (start < end) {
+ /*
+ * We might have temporarily dropped the lock. For example,
+ * our VMA might have been split.
+ */
+ if (!vma || start >= vma->vm_end) {
+ vma = find_vma(mm, start);
+ if (!vma || start < vma->vm_start)
+ return -ENOMEM;
+ }
+
+ tmp_end = min_t(unsigned long, end, vma->vm_end);
+ /* Populate (prefault) page tables readable/writable. */
+ pages = faultin_vma_page_range(vma, start, tmp_end, write,
+ &locked);
+ if (!locked) {
+ mmap_read_lock(mm);
+ locked = 1;
+ *prev = NULL;
+ vma = NULL;
+ }
+ if (pages < 0) {
+ switch (pages) {
+ case -EINTR:
+ return -EINTR;
+ case -EFAULT: /* Incompatible mappings / permissions. */
+ return -EINVAL;
+ case -EHWPOISON:
+ return -EHWPOISON;
+ default:
+ pr_warn_once("%s: unhandled return value: %ld\n",
+ __func__, pages);
+ fallthrough;
+ case -ENOMEM:
+ return -ENOMEM;
+ }
+ }
+ start += pages * PAGE_SIZE;
+ }
+ return 0;
+}
+
/*
* Application wants to free up the pages and associated backing store.
* This is effectively punching a hole into the middle of a file.
@@ -935,6 +992,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return madvise_populate(vma, prev, start, end, behavior);
default:
return madvise_behavior(vma, prev, start, end, behavior);
}
@@ -955,6 +1015,8 @@ madvise_behavior_valid(int behavior)
case MADV_FREE:
case MADV_COLD:
case MADV_PAGEOUT:
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
@@ -1042,6 +1104,10 @@ process_madvise_behavior_valid(int behavior)
* easily if memory pressure happens.
* MADV_PAGEOUT - the application is not expected to use this memory soon,
* page out the pages in this range immediately.
+ * MADV_POPULATE_READ - populate (prefault) page tables readable by
+ * triggering read faults if required
+ * MADV_POPULATE_WRITE - populate (prefault) page tables writable by
+ * triggering write faults if required
*
* return values:
* zero - success
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index b890854ec761..ea734f248fce 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -317,7 +317,7 @@ EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
* pfn_mkwrite(). And then after a TLB flush following the write-protection
* pick up all dirty bits.
*
- * Note: This function currently skips transhuge page-table entries, since
+ * This function currently skips transhuge page-table entries, since
* it's intended for dirty-tracking on the PTE level. It will warn on
* encountering transhuge dirty entries, though, and can easily be extended
* to handle them as well.
diff --git a/mm/memblock.c b/mm/memblock.c
index 123feef5259d..3e4acbf03ab7 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -906,6 +906,11 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
+ * The memory regions marked with %MEMBLOCK_NOMAP will not be added to the
+ * direct mapping of the physical memory. These regions will still be
+ * covered by the memory map. The struct page representing NOMAP memory
+ * frames in the memory map will be PageReserved()
+ *
* Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
@@ -2002,6 +2007,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
return end_pfn - start_pfn;
}
+static void __init memmap_init_reserved_pages(void)
+{
+ struct memblock_region *region;
+ phys_addr_t start, end;
+ u64 i;
+
+ /* initialize struct pages for the reserved regions */
+ for_each_reserved_mem_range(i, &start, &end)
+ reserve_bootmem_region(start, end);
+
+ /* and also treat struct pages for the NOMAP regions as PageReserved */
+ for_each_mem_region(region) {
+ if (memblock_is_nomap(region)) {
+ start = region->base;
+ end = start + region->size;
+ reserve_bootmem_region(start, end);
+ }
+ }
+}
+
static unsigned long __init free_low_memory_core_early(void)
{
unsigned long count = 0;
@@ -2010,8 +2035,7 @@ static unsigned long __init free_low_memory_core_early(void)
memblock_clear_hotplug(0, -1);
- for_each_reserved_mem_range(i, &start, &end)
- reserve_bootmem_region(start, end);
+ memmap_init_reserved_pages();
/*
* We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b80aae448a49..ae1f5d0cb581 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5537,7 +5537,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* as special swap entry in the CPU page table.
*/
if (is_device_private_entry(ent)) {
- page = device_private_entry_to_page(ent);
+ page = pfn_swap_entry_to_page(ent);
/*
* MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
* a refcount of 1 when free (unlike normal page)
@@ -6644,7 +6644,7 @@ static unsigned long effective_protection(unsigned long usage,
}
/**
- * mem_cgroup_protected - check if memory consumption is in the normal range
+ * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e5a1531f7f4e..eefd823deb67 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -66,6 +66,19 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+static bool __page_handle_poison(struct page *page)
+{
+ bool ret;
+
+ zone_pcp_disable(page_zone(page));
+ ret = dissolve_free_huge_page(page);
+ if (!ret)
+ ret = take_page_off_buddy(page);
+ zone_pcp_enable(page_zone(page));
+
+ return ret;
+}
+
static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
{
if (hugepage_or_freepage) {
@@ -73,7 +86,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
* Doing this check for free pages is also fine since dissolve_free_huge_page
* returns 0 for non-hugetlb pages as well.
*/
- if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
+ if (!__page_handle_poison(page))
/*
* We could fail to take off the target page from buddy
* for example due to racy page allocation, but that's
@@ -985,7 +998,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
*/
if (PageAnon(hpage))
put_page(hpage);
- if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) {
+ if (__page_handle_poison(p)) {
page_ref_inc(p);
res = MF_RECOVERED;
}
@@ -1253,10 +1266,10 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
int flags, struct page **hpagep)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
struct address_space *mapping;
LIST_HEAD(tokill);
- bool unmap_success = true;
+ bool unmap_success;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
bool mlocked = PageMlocked(hpage);
@@ -1319,7 +1332,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
if (!PageHuge(hpage)) {
- unmap_success = try_to_unmap(hpage, ttu);
+ try_to_unmap(hpage, ttu);
} else {
if (!PageAnon(hpage)) {
/*
@@ -1327,21 +1340,20 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* could potentially call huge_pmd_unshare. Because of
* this, take semaphore in write mode here and set
* TTU_RMAP_LOCKED to indicate we have taken the lock
- * at this higer level.
+ * at this higher level.
*/
mapping = hugetlb_page_mapping_lock_write(hpage);
if (mapping) {
- unmap_success = try_to_unmap(hpage,
- ttu|TTU_RMAP_LOCKED);
+ try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
i_mmap_unlock_write(mapping);
- } else {
+ } else
pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
- unmap_success = false;
- }
} else {
- unmap_success = try_to_unmap(hpage, ttu);
+ try_to_unmap(hpage, ttu);
}
}
+
+ unmap_success = !page_mapped(hpage);
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
@@ -1446,7 +1458,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
}
unlock_page(head);
res = MF_FAILED;
- if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) {
+ if (__page_handle_poison(p)) {
page_ref_inc(p);
res = MF_RECOVERED;
}
diff --git a/mm/memory.c b/mm/memory.c
index 48c4576df898..747a01d495f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -699,6 +699,68 @@ out:
}
#endif
+static void restore_exclusive_pte(struct vm_area_struct *vma,
+ struct page *page, unsigned long address,
+ pte_t *ptep)
+{
+ pte_t pte;
+ swp_entry_t entry;
+
+ pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+ if (pte_swp_soft_dirty(*ptep))
+ pte = pte_mksoft_dirty(pte);
+
+ entry = pte_to_swp_entry(*ptep);
+ if (pte_swp_uffd_wp(*ptep))
+ pte = pte_mkuffd_wp(pte);
+ else if (is_writable_device_exclusive_entry(entry))
+ pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+ set_pte_at(vma->vm_mm, address, ptep, pte);
+
+ /*
+ * No need to take a page reference as one was already
+ * created when the swap entry was made.
+ */
+ if (PageAnon(page))
+ page_add_anon_rmap(page, vma, address, false);
+ else
+ /*
+ * Currently device exclusive access only supports anonymous
+ * memory so the entry shouldn't point to a filebacked page.
+ */
+ WARN_ON_ONCE(!PageAnon(page));
+
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_vma_page(page);
+
+ /*
+ * No need to invalidate - it was non-present before. However
+ * secondary CPUs may have mappings that need invalidating.
+ */
+ update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Tries to restore an exclusive pte if the page lock can be acquired without
+ * sleeping.
+ */
+static int
+try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ swp_entry_t entry = pte_to_swp_entry(*src_pte);
+ struct page *page = pfn_swap_entry_to_page(entry);
+
+ if (trylock_page(page)) {
+ restore_exclusive_pte(vma, page, addr, src_pte);
+ unlock_page(page);
+ return 0;
+ }
+
+ return -EBUSY;
+}
+
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
@@ -707,17 +769,17 @@ out:
static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
- unsigned long addr, int *rss)
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
- unsigned long vm_flags = vma->vm_flags;
+ unsigned long vm_flags = dst_vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
swp_entry_t entry = pte_to_swp_entry(pte);
if (likely(!non_swap_entry(entry))) {
if (swap_duplicate(entry) < 0)
- return entry.val;
+ return -EIO;
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
@@ -729,17 +791,18 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
rss[MM_SWAPENTS]++;
} else if (is_migration_entry(entry)) {
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
rss[mm_counter(page)]++;
- if (is_write_migration_entry(entry) &&
+ if (is_writable_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(*src_pte))
pte = pte_swp_mksoft_dirty(pte);
@@ -748,7 +811,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
set_pte_at(src_mm, addr, src_pte, pte);
}
} else if (is_device_private_entry(entry)) {
- page = device_private_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
/*
* Update rss count even for unaddressable pages, as
@@ -770,15 +833,29 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* when a device driver is involved (you cannot easily
* save and restore device driver state).
*/
- if (is_write_device_private_entry(entry) &&
+ if (is_writable_device_private_entry(entry) &&
is_cow_mapping(vm_flags)) {
- make_device_private_entry_read(&entry);
+ entry = make_readable_device_private_entry(
+ swp_offset(entry));
pte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(*src_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
+ } else if (is_device_exclusive_entry(entry)) {
+ /*
+ * Make device exclusive entries present by restoring the
+ * original entry then copying as for a present pte. Device
+ * exclusive entries currently only support private writable
+ * (ie. COW) mappings.
+ */
+ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
+ if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+ return -EBUSY;
+ return -ENOENT;
}
+ if (!userfaultfd_wp(dst_vma))
+ pte = pte_swp_clear_uffd_wp(pte);
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
@@ -844,6 +921,9 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
/* All done, just insert the new page copy in the child */
pte = mk_pte(new_page, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
+ if (userfaultfd_pte_wp(dst_vma, *src_pte))
+ /* Uffd-wp needs to be delivered to dest pte as well */
+ pte = pte_wrprotect(pte_mkuffd_wp(pte));
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
@@ -893,12 +973,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
- /*
- * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
- * does not have the VM_UFFD_WP, which means that the uffd
- * fork event is not enabled.
- */
- if (!(vm_flags & VM_UFFD_WP))
+ if (!userfaultfd_wp(dst_vma))
pte = pte_clear_uffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
@@ -971,13 +1046,25 @@ again:
continue;
}
if (unlikely(!pte_present(*src_pte))) {
- entry.val = copy_nonpresent_pte(dst_mm, src_mm,
- dst_pte, src_pte,
- src_vma, addr, rss);
- if (entry.val)
+ ret = copy_nonpresent_pte(dst_mm, src_mm,
+ dst_pte, src_pte,
+ dst_vma, src_vma,
+ addr, rss);
+ if (ret == -EIO) {
+ entry = pte_to_swp_entry(*src_pte);
break;
- progress += 8;
- continue;
+ } else if (ret == -EBUSY) {
+ break;
+ } else if (!ret) {
+ progress += 8;
+ continue;
+ }
+
+ /*
+ * Device exclusive entry restored, continue by copying
+ * the now present pte.
+ */
+ WARN_ON_ONCE(ret != -ENOENT);
}
/* copy_present_pte() will clear `*prealloc' if consumed */
ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
@@ -1008,20 +1095,26 @@ again:
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
- if (entry.val) {
+ if (ret == -EIO) {
+ VM_WARN_ON_ONCE(!entry.val);
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
ret = -ENOMEM;
goto out;
}
entry.val = 0;
- } else if (ret) {
- WARN_ON_ONCE(ret != -EAGAIN);
+ } else if (ret == -EBUSY) {
+ goto out;
+ } else if (ret == -EAGAIN) {
prealloc = page_copy_prealloc(src_mm, src_vma, addr);
if (!prealloc)
return -ENOMEM;
- /* We've captured and resolved the error. Reset, try again. */
- ret = 0;
+ } else if (ret) {
+ VM_WARN_ON_ONCE(1);
}
+
+ /* We've captured and resolved the error. Reset, try again. */
+ ret = 0;
+
if (addr != end)
goto again;
out:
@@ -1050,8 +1143,8 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
|| pmd_devmap(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
- err = copy_huge_pmd(dst_mm, src_mm,
- dst_pmd, src_pmd, addr, src_vma);
+ err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+ addr, dst_vma, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
@@ -1278,8 +1371,9 @@ again:
}
entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry)) {
- struct page *page = device_private_entry_to_page(entry);
+ if (is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry)) {
+ struct page *page = pfn_swap_entry_to_page(entry);
if (unlikely(details && details->check_mapping)) {
/*
@@ -1294,7 +1388,10 @@ again:
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
- page_remove_rmap(page, false);
+
+ if (is_device_private_entry(entry))
+ page_remove_rmap(page, false);
+
put_page(page);
continue;
}
@@ -1308,7 +1405,7 @@ again:
else if (is_migration_entry(entry)) {
struct page *page;
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
rss[mm_counter(page)]--;
}
if (unlikely(!free_swap_and_cache(entry)))
@@ -3343,6 +3440,34 @@ void unmap_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL(unmap_mapping_range);
/*
+ * Restore a potential device exclusive pte to a working pte entry
+ */
+static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+ return VM_FAULT_RETRY;
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+ vma->vm_mm, vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+ mmu_notifier_invalidate_range_start(&range);
+
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ unlock_page(page);
+
+ mmu_notifier_invalidate_range_end(&range);
+ return 0;
+}
+
+/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
@@ -3370,8 +3495,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (is_migration_entry(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
+ } else if (is_device_exclusive_entry(entry)) {
+ vmf->page = pfn_swap_entry_to_page(entry);
+ ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
- vmf->page = device_private_entry_to_page(entry);
+ vmf->page = pfn_swap_entry_to_page(entry);
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
@@ -4025,9 +4153,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- ret = do_fault_around(vmf);
- if (ret)
- return ret;
+ if (likely(!userfaultfd_minor(vmf->vma))) {
+ ret = do_fault_around(vmf);
+ if (ret)
+ return ret;
+ }
}
ret = __do_fault(vmf);
@@ -4172,9 +4302,8 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
return ret;
}
-static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int page_nid,
- int *flags)
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int page_nid, int *flags)
{
get_page(page);
@@ -4295,12 +4424,12 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
}
/* `inline' is required to avoid gcc 4.1.2 build error */
-static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma)) {
- if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+ if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
- return do_huge_pmd_wp_page(vmf, orig_pmd);
+ return do_huge_pmd_wp_page(vmf);
}
if (vmf->vma->vm_ops->huge_fault) {
vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
@@ -4527,26 +4656,26 @@ retry_pud:
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- pmd_t orig_pmd = *vmf.pmd;
+ vmf.orig_pmd = *vmf.pmd;
barrier();
- if (unlikely(is_swap_pmd(orig_pmd))) {
+ if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
- if (is_pmd_migration_entry(orig_pmd))
+ !is_pmd_migration_entry(vmf.orig_pmd));
+ if (is_pmd_migration_entry(vmf.orig_pmd))
pmd_migration_entry_wait(mm, vmf.pmd);
return 0;
}
- if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
- if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&vmf, orig_pmd);
+ if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+ if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+ return do_huge_pmd_numa_page(&vmf);
- if (dirty && !pmd_write(orig_pmd)) {
- ret = wp_huge_pmd(&vmf, orig_pmd);
+ if (dirty && !pmd_write(vmf.orig_pmd)) {
+ ret = wp_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&vmf, orig_pmd);
+ huge_pmd_set_accessed(&vmf);
return 0;
}
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 974a565797d8..8cb75b26ea4f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -154,122 +154,6 @@ static void release_memory_resource(struct resource *res)
}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-void get_page_bootmem(unsigned long info, struct page *page,
- unsigned long type)
-{
- page->freelist = (void *)type;
- SetPagePrivate(page);
- set_page_private(page, info);
- page_ref_inc(page);
-}
-
-void put_page_bootmem(struct page *page)
-{
- unsigned long type;
-
- type = (unsigned long) page->freelist;
- BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
- type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
-
- if (page_ref_dec_return(page) == 1) {
- page->freelist = NULL;
- ClearPagePrivate(page);
- set_page_private(page, 0);
- INIT_LIST_HEAD(&page->lru);
- free_reserved_page(page);
- }
-}
-
-#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void register_page_bootmem_info_section(unsigned long start_pfn)
-{
- unsigned long mapsize, section_nr, i;
- struct mem_section *ms;
- struct page *page, *memmap;
- struct mem_section_usage *usage;
-
- section_nr = pfn_to_section_nr(start_pfn);
- ms = __nr_to_section(section_nr);
-
- /* Get section's memmap address */
- memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-
- /*
- * Get page for the memmap's phys address
- * XXX: need more consideration for sparse_vmemmap...
- */
- page = virt_to_page(memmap);
- mapsize = sizeof(struct page) * PAGES_PER_SECTION;
- mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
-
- /* remember memmap's page */
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, SECTION_INFO);
-
- usage = ms->usage;
- page = virt_to_page(usage);
-
- mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
-
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
-
-}
-#else /* CONFIG_SPARSEMEM_VMEMMAP */
-static void register_page_bootmem_info_section(unsigned long start_pfn)
-{
- unsigned long mapsize, section_nr, i;
- struct mem_section *ms;
- struct page *page, *memmap;
- struct mem_section_usage *usage;
-
- section_nr = pfn_to_section_nr(start_pfn);
- ms = __nr_to_section(section_nr);
-
- memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-
- register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
-
- usage = ms->usage;
- page = virt_to_page(usage);
-
- mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
-
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
-}
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-
-void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
- unsigned long i, pfn, end_pfn, nr_pages;
- int node = pgdat->node_id;
- struct page *page;
-
- nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
- page = virt_to_page(pgdat);
-
- for (i = 0; i < nr_pages; i++, page++)
- get_page_bootmem(node, page, NODE_INFO);
-
- pfn = pgdat->node_start_pfn;
- end_pfn = pgdat_end_pfn(pgdat);
-
- /* register section info */
- for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- /*
- * Some platforms can assign the same pfn to multiple nodes - on
- * node0 as well as nodeN. To avoid registering a pfn against
- * multiple nodes we check that this pfn does not already
- * reside in some other nodes.
- */
- if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
- register_page_bootmem_info_section(pfn);
- }
-}
-#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
-
static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
const char *reason)
{
@@ -445,7 +329,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long pfn;
int nid = zone_to_nid(zone);
- zone_span_writelock(zone);
if (zone->zone_start_pfn == start_pfn) {
/*
* If the section is smallest section in the zone, it need
@@ -478,7 +361,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
zone->spanned_pages = 0;
}
}
- zone_span_writeunlock(zone);
}
static void update_pgdat_span(struct pglist_data *pgdat)
@@ -515,7 +397,7 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
{
const unsigned long end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long pfn, cur_nr_pages, flags;
+ unsigned long pfn, cur_nr_pages;
/* Poison struct pages because they are now uninitialized again. */
for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
@@ -540,10 +422,8 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
clear_zone_contiguous(zone);
- pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
update_pgdat_span(pgdat);
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
set_zone_contiguous(zone);
}
@@ -750,19 +630,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
- unsigned long flags;
clear_zone_contiguous(zone);
- /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
- pgdat_resize_lock(pgdat, &flags);
- zone_span_writelock(zone);
if (zone_is_empty(zone))
init_currently_empty_zone(zone, start_pfn, nr_pages);
resize_zone_range(zone, start_pfn, nr_pages);
- zone_span_writeunlock(zone);
resize_pgdat_range(pgdat, start_pfn, nr_pages);
- pgdat_resize_unlock(pgdat, &flags);
/*
* Subsection population requires care in pfn_to_online_page().
@@ -852,12 +726,8 @@ struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
*/
void adjust_present_page_count(struct zone *zone, long nr_pages)
{
- unsigned long flags;
-
zone->present_pages += nr_pages;
- pgdat_resize_lock(zone->zone_pgdat, &flags);
zone->zone_pgdat->node_present_pages += nr_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
@@ -913,7 +783,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z
/*
* {on,off}lining is constrained to full memory sections (or more
- * precisly to memory blocks from the user space POV).
+ * precisely to memory blocks from the user space POV).
* memmap_on_memory is an exception because it reserves initial part
* of the physical memory space for vmemmaps. That space is pageblock
* aligned.
@@ -1072,8 +942,8 @@ static void rollback_node_hotadd(int nid)
}
-/**
- * try_online_node - online a node if offlined
+/*
+ * __try_online_node - online a node if offlined
* @nid: the node ID
* @set_node_online: Whether we want to online the node
* called by cpu_up() to online a node without onlined memory.
@@ -1172,6 +1042,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
* populate a single PMD.
*/
return memmap_on_memory &&
+ !hugetlb_free_vmemmap_enabled &&
IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
size == memory_block_size_bytes() &&
IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
@@ -1521,6 +1392,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
struct page *page, *head;
int ret = 0;
LIST_HEAD(source);
+ static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
if (!pfn_valid(pfn))
@@ -1567,8 +1440,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
page_is_file_lru(page));
} else {
- pr_warn("failed to isolate pfn %lx\n", pfn);
- dump_page(page, "isolation failed");
+ if (__ratelimit(&migrate_rs)) {
+ pr_warn("failed to isolate pfn %lx\n", pfn);
+ dump_page(page, "isolation failed");
+ }
}
put_page(page);
}
@@ -1597,9 +1472,11 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret) {
list_for_each_entry(page, &source, lru) {
- pr_warn("migrating pfn %lx failed ret:%d ",
- page_to_pfn(page), ret);
- dump_page(page, "migration failure");
+ if (__ratelimit(&migrate_rs)) {
+ pr_warn("migrating pfn %lx failed ret:%d\n",
+ page_to_pfn(page), ret);
+ dump_page(page, "migration failure");
+ }
}
putback_movable_pages(&source);
}
@@ -1703,7 +1580,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
/*
* {on,off}lining is constrained to full memory sections (or more
- * precisly to memory blocks from the user space POV).
+ * precisely to memory blocks from the user space POV).
* memmap_on_memory is an exception because it reserves initial part
* of the physical memory space for vmemmaps. That space is pageblock
* aligned.
@@ -2031,7 +1908,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
}
/**
- * remove_memory
+ * __remove_memory - Remove memory if every memory block is offline
* @nid: the node ID
* @start: physical address of the region to remove
* @size: size of the region to remove
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b5d95bf1025d..e32360e90274 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -121,8 +121,7 @@ enum zone_type policy_zone = 0;
*/
static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
- .mode = MPOL_PREFERRED,
- .flags = MPOL_F_LOCAL,
+ .mode = MPOL_LOCAL,
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
@@ -194,18 +193,17 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
- pol->v.nodes = *nodes;
+ pol->nodes = *nodes;
return 0;
}
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
- if (!nodes)
- pol->flags |= MPOL_F_LOCAL; /* local allocation */
- else if (nodes_empty(*nodes))
- return -EINVAL; /* no allowed nodes */
- else
- pol->v.preferred_node = first_node(*nodes);
+ if (nodes_empty(*nodes))
+ return -EINVAL;
+
+ nodes_clear(pol->nodes);
+ node_set(first_node(*nodes), pol->nodes);
return 0;
}
@@ -213,15 +211,14 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
- pol->v.nodes = *nodes;
+ pol->nodes = *nodes;
return 0;
}
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
- * parameter with respect to the policy mode and flags. But, we need to
- * handle an empty nodemask with MPOL_PREFERRED here.
+ * parameter with respect to the policy mode and flags.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
* and mempolicy. May also be called holding the mmap_lock for write.
@@ -231,33 +228,31 @@ static int mpol_set_nodemask(struct mempolicy *pol,
{
int ret;
- /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
- if (pol == NULL)
+ /*
+ * Default (pol==NULL) resp. local memory policies are not a
+ * subject of any remapping. They also do not need any special
+ * constructor.
+ */
+ if (!pol || pol->mode == MPOL_LOCAL)
return 0;
+
/* Check N_MEMORY */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes);
- if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
- nodes = NULL; /* explicit local allocation */
- else {
- if (pol->flags & MPOL_F_RELATIVE_NODES)
- mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
- else
- nodes_and(nsc->mask2, *nodes, nsc->mask1);
- if (mpol_store_user_nodemask(pol))
- pol->w.user_nodemask = *nodes;
- else
- pol->w.cpuset_mems_allowed =
- cpuset_current_mems_allowed;
- }
+ if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
+ else
+ nodes_and(nsc->mask2, *nodes, nsc->mask1);
- if (nodes)
- ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+ if (mpol_store_user_nodemask(pol))
+ pol->w.user_nodemask = *nodes;
else
- ret = mpol_ops[pol->mode].create(pol, NULL);
+ pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
+
+ ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
return ret;
}
@@ -290,13 +285,14 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
+
+ mode = MPOL_LOCAL;
}
} else if (mode == MPOL_LOCAL) {
if (!nodes_empty(*nodes) ||
(flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
- mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -330,7 +326,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
+ nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
*nodes);
pol->w.cpuset_mems_allowed = *nodes;
}
@@ -338,31 +334,13 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
if (nodes_empty(tmp))
tmp = *nodes;
- pol->v.nodes = tmp;
+ pol->nodes = tmp;
}
static void mpol_rebind_preferred(struct mempolicy *pol,
const nodemask_t *nodes)
{
- nodemask_t tmp;
-
- if (pol->flags & MPOL_F_STATIC_NODES) {
- int node = first_node(pol->w.user_nodemask);
-
- if (node_isset(node, *nodes)) {
- pol->v.preferred_node = node;
- pol->flags &= ~MPOL_F_LOCAL;
- } else
- pol->flags |= MPOL_F_LOCAL;
- } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
- mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
- pol->v.preferred_node = first_node(tmp);
- } else if (!(pol->flags & MPOL_F_LOCAL)) {
- pol->v.preferred_node = node_remap(pol->v.preferred_node,
- pol->w.cpuset_mems_allowed,
- *nodes);
- pol->w.cpuset_mems_allowed = *nodes;
- }
+ pol->w.cpuset_mems_allowed = *nodes;
}
/*
@@ -376,7 +354,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
+ if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -427,6 +405,9 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.create = mpol_new_bind,
.rebind = mpol_rebind_nodemask,
},
+ [MPOL_LOCAL] = {
+ .rebind = mpol_rebind_default,
+ },
};
static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -458,7 +439,8 @@ static inline bool queue_pages_required(struct page *page,
/*
* queue_pages_pmd() has four possible return values:
- * 0 - pages are placed on the right node or queued successfully.
+ * 0 - pages are placed on the right node or queued successfully, or
+ * special page is met, i.e. huge zero page.
* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* 2 - THP was split.
@@ -482,8 +464,7 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
- __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
- ret = 2;
+ walk->action = ACTION_CONTINUE;
goto out;
}
if (!queue_pages_required(page, qp))
@@ -510,7 +491,8 @@ out:
* and move them to the pagelist if they do.
*
* queue_pages_pte_range() has three possible return values:
- * 0 - pages are placed on the right node or queued successfully.
+ * 0 - pages are placed on the right node or queued successfully, or
+ * special page is met, i.e. zero page.
* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* -EIO - only MPOL_MF_STRICT was specified and an existing page was already
@@ -917,12 +899,11 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
switch (p->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
- *nodes = p->v.nodes;
- break;
case MPOL_PREFERRED:
- if (!(p->flags & MPOL_F_LOCAL))
- node_set(p->v.preferred_node, *nodes);
- /* else return empty node mask for local allocation */
+ *nodes = p->nodes;
+ break;
+ case MPOL_LOCAL:
+ /* return empty node mask for local allocation */
break;
default:
BUG();
@@ -1007,7 +988,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
- *policy = next_node_in(current->il_prev, pol->v.nodes);
+ *policy = next_node_in(current->il_prev, pol->nodes);
} else {
err = -EINVAL;
goto out;
@@ -1460,26 +1441,38 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
+/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
+static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
+{
+ *flags = *mode & MPOL_MODE_FLAGS;
+ *mode &= ~MPOL_MODE_FLAGS;
+ if ((unsigned int)(*mode) >= MPOL_MAX)
+ return -EINVAL;
+ if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
+
+ return 0;
+}
+
static long kernel_mbind(unsigned long start, unsigned long len,
unsigned long mode, const unsigned long __user *nmask,
unsigned long maxnode, unsigned int flags)
{
+ unsigned short mode_flags;
nodemask_t nodes;
+ int lmode = mode;
int err;
- unsigned short mode_flags;
start = untagged_addr(start);
- mode_flags = mode & MPOL_MODE_FLAGS;
- mode &= ~MPOL_MODE_FLAGS;
- if (mode >= MPOL_MAX)
- return -EINVAL;
- if ((mode_flags & MPOL_F_STATIC_NODES) &&
- (mode_flags & MPOL_F_RELATIVE_NODES))
- return -EINVAL;
+ err = sanitize_mpol_flags(&lmode, &mode_flags);
+ if (err)
+ return err;
+
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_mbind(start, len, mode, mode_flags, &nodes, flags);
+
+ return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
@@ -1493,20 +1486,20 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
unsigned long maxnode)
{
- int err;
+ unsigned short mode_flags;
nodemask_t nodes;
- unsigned short flags;
+ int lmode = mode;
+ int err;
+
+ err = sanitize_mpol_flags(&lmode, &mode_flags);
+ if (err)
+ return err;
- flags = mode & MPOL_MODE_FLAGS;
- mode &= ~MPOL_MODE_FLAGS;
- if ((unsigned int)mode >= MPOL_MAX)
- return -EINVAL;
- if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
- return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_set_mempolicy(mode, flags, &nodes);
+
+ return do_set_mempolicy(lmode, mode_flags, &nodes);
}
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
@@ -1863,14 +1856,14 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
/*
- * if policy->v.nodes has movable memory only,
+ * if policy->nodes has movable memory only,
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
*
- * policy->v.nodes is intersect with node_states[N_MEMORY].
+ * policy->nodes is intersect with node_states[N_MEMORY].
* so if the following test fails, it implies
- * policy->v.nodes has movable memory only.
+ * policy->nodes has movable memory only.
*/
- if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
dynamic_policy_zone = ZONE_MOVABLE;
return zone >= dynamic_policy_zone;
@@ -1885,8 +1878,8 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
apply_policy_zone(policy, gfp_zone(gfp)) &&
- cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
- return &policy->v.nodes;
+ cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+ return &policy->nodes;
return NULL;
}
@@ -1894,9 +1887,9 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
/* Return the node id preferred by the given mempolicy, or the given id */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
- if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
- nd = policy->v.preferred_node;
- else {
+ if (policy->mode == MPOL_PREFERRED) {
+ nd = first_node(policy->nodes);
+ } else {
/*
* __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the
@@ -1914,7 +1907,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
unsigned next;
struct task_struct *me = current;
- next = next_node_in(me->il_prev, policy->v.nodes);
+ next = next_node_in(me->il_prev, policy->nodes);
if (next < MAX_NUMNODES)
me->il_prev = next;
return next;
@@ -1933,15 +1926,12 @@ unsigned int mempolicy_slab_node(void)
return node;
policy = current->mempolicy;
- if (!policy || policy->flags & MPOL_F_LOCAL)
+ if (!policy)
return node;
switch (policy->mode) {
case MPOL_PREFERRED:
- /*
- * handled MPOL_F_LOCAL above
- */
- return policy->v.preferred_node;
+ return first_node(policy->nodes);
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
@@ -1957,9 +1947,11 @@ unsigned int mempolicy_slab_node(void)
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
- &policy->v.nodes);
+ &policy->nodes);
return z->zone ? zone_to_nid(z->zone) : node;
}
+ case MPOL_LOCAL:
+ return node;
default:
BUG();
@@ -1968,12 +1960,12 @@ unsigned int mempolicy_slab_node(void)
/*
* Do static interleaving for a VMA with known offset @n. Returns the n'th
- * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
+ * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
* number of present nodes.
*/
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
- unsigned nnodes = nodes_weight(pol->v.nodes);
+ unsigned nnodes = nodes_weight(pol->nodes);
unsigned target;
int i;
int nid;
@@ -1981,9 +1973,9 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
if (!nnodes)
return numa_node_id();
target = (unsigned int)n % nnodes;
- nid = first_node(pol->v.nodes);
+ nid = first_node(pol->nodes);
for (i = 0; i < target; i++)
- nid = next_node(nid, pol->v.nodes);
+ nid = next_node(nid, pol->nodes);
return nid;
}
@@ -2039,7 +2031,7 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
} else {
nid = policy_node(gfp_flags, *mpol, numa_node_id());
if ((*mpol)->mode == MPOL_BIND)
- *nodemask = &(*mpol)->v.nodes;
+ *nodemask = &(*mpol)->nodes;
}
return nid;
}
@@ -2063,7 +2055,6 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
struct mempolicy *mempolicy;
- int nid;
if (!(mask && current->mempolicy))
return false;
@@ -2072,16 +2063,13 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
- if (mempolicy->flags & MPOL_F_LOCAL)
- nid = numa_node_id();
- else
- nid = mempolicy->v.preferred_node;
- init_nodemask_of_node(mask, nid);
- break;
-
case MPOL_BIND:
case MPOL_INTERLEAVE:
- *mask = mempolicy->v.nodes;
+ *mask = mempolicy->nodes;
+ break;
+
+ case MPOL_LOCAL:
+ init_nodemask_of_node(mask, numa_node_id());
break;
default:
@@ -2094,16 +2082,16 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
#endif
/*
- * mempolicy_nodemask_intersects
+ * mempolicy_in_oom_domain
*
- * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
- * policy. Otherwise, check for intersection between mask and the policy
- * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
- * policy, always return true since it may allocate elsewhere on fallback.
+ * If tsk's mempolicy is "bind", check for intersection between mask and
+ * the policy nodemask. Otherwise, return true for all other policies
+ * including "interleave", as a tsk with "interleave" policy may have
+ * memory allocated from all nodes in system.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
*/
-bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask)
{
struct mempolicy *mempolicy;
@@ -2111,29 +2099,13 @@ bool mempolicy_nodemask_intersects(struct task_struct *tsk,
if (!mask)
return ret;
+
task_lock(tsk);
mempolicy = tsk->mempolicy;
- if (!mempolicy)
- goto out;
-
- switch (mempolicy->mode) {
- case MPOL_PREFERRED:
- /*
- * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
- * allocate from, they may fallback to other nodes when oom.
- * Thus, it's possible for tsk to have allocated memory from
- * nodes in mask.
- */
- break;
- case MPOL_BIND:
- case MPOL_INTERLEAVE:
- ret = nodes_intersects(mempolicy->v.nodes, *mask);
- break;
- default:
- BUG();
- }
-out:
+ if (mempolicy && mempolicy->mode == MPOL_BIND)
+ ret = nodes_intersects(mempolicy->nodes, *mask);
task_unlock(tsk);
+
return ret;
}
@@ -2204,8 +2176,8 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* If the policy is interleave, or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
- if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
- hpage_node = pol->v.preferred_node;
+ if (pol->mode == MPOL_PREFERRED)
+ hpage_node = first_node(pol->nodes);
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
@@ -2338,12 +2310,10 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
switch (a->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
- return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
- /* a's ->flags is the same as b's */
- if (a->flags & MPOL_F_LOCAL)
- return true;
- return a->v.preferred_node == b->v.preferred_node;
+ return !!nodes_equal(a->nodes, b->nodes);
+ case MPOL_LOCAL:
+ return true;
default:
BUG();
return false;
@@ -2481,16 +2451,17 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_PREFERRED:
- if (pol->flags & MPOL_F_LOCAL)
- polnid = numa_node_id();
- else
- polnid = pol->v.preferred_node;
+ polnid = first_node(pol->nodes);
+ break;
+
+ case MPOL_LOCAL:
+ polnid = numa_node_id();
break;
case MPOL_BIND:
/* Optimize placement among multiple nodes via NUMA balancing */
if (pol->flags & MPOL_F_MORON) {
- if (node_isset(thisnid, pol->v.nodes))
+ if (node_isset(thisnid, pol->nodes))
break;
goto out;
}
@@ -2501,12 +2472,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*/
- if (node_isset(curnid, pol->v.nodes))
+ if (node_isset(curnid, pol->nodes))
goto out;
z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
- &pol->v.nodes);
+ &pol->nodes);
polnid = zone_to_nid(z->zone);
break;
@@ -2709,7 +2680,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
vma->vm_pgoff,
sz, npol ? npol->mode : -1,
npol ? npol->flags : -1,
- npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
+ npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -2807,7 +2778,7 @@ void __init numa_policy_init(void)
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
- .v = { .preferred_node = nid, },
+ .nodes = nodemask_of_node(nid),
};
}
@@ -2851,9 +2822,6 @@ void numa_default_policy(void)
* Parse and format mempolicy from/to strings
*/
-/*
- * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
- */
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
@@ -2931,7 +2899,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
*/
if (nodelist)
goto out;
- mode = MPOL_PREFERRED;
break;
case MPOL_DEFAULT:
/*
@@ -2970,12 +2937,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
* Save nodes for mpol_to_str() to show the tmpfs mount options
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
*/
- if (mode != MPOL_PREFERRED)
- new->v.nodes = nodes;
- else if (nodelist)
- new->v.preferred_node = first_node(nodes);
- else
- new->flags |= MPOL_F_LOCAL;
+ if (mode != MPOL_PREFERRED) {
+ new->nodes = nodes;
+ } else if (nodelist) {
+ nodes_clear(new->nodes);
+ node_set(first_node(nodes), new->nodes);
+ } else {
+ new->mode = MPOL_LOCAL;
+ }
/*
* Save nodes for contextualization: this will be used to "clone"
@@ -3021,16 +2990,12 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
switch (mode) {
case MPOL_DEFAULT:
+ case MPOL_LOCAL:
break;
case MPOL_PREFERRED:
- if (flags & MPOL_F_LOCAL)
- mode = MPOL_LOCAL;
- else
- node_set(pol->v.preferred_node, nodes);
- break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
- nodes = pol->v.nodes;
+ nodes = pol->nodes;
break;
default:
WARN_ON_ONCE(1);
diff --git a/mm/migrate.c b/mm/migrate.c
index 380ca57b9031..23cbd9de030b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -210,13 +210,18 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
* Recheck VMA as permissions can change since migration started
*/
entry = pte_to_swp_entry(*pvmw.pte);
- if (is_write_migration_entry(entry))
+ if (is_writable_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(*pvmw.pte))
pte = pte_mkuffd_wp(pte);
if (unlikely(is_device_private_page(new))) {
- entry = make_device_private_entry(new, pte_write(pte));
+ if (pte_write(pte))
+ entry = make_writable_device_private_entry(
+ page_to_pfn(new));
+ else
+ entry = make_readable_device_private_entry(
+ page_to_pfn(new));
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(*pvmw.pte))
pte = pte_swp_mksoft_dirty(pte);
@@ -226,8 +231,10 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
+
pte = pte_mkhuge(pte);
- pte = arch_make_huge_pte(pte, vma, new, 0);
+ pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, pvmw.address);
@@ -294,7 +301,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
if (!is_migration_entry(entry))
goto out;
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
page = compound_head(page);
/*
@@ -335,7 +342,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
ptl = pmd_lock(mm, pmd);
if (!is_pmd_migration_entry(*pmd))
goto unlock;
- page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
+ page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
if (!get_page_unless_zero(page))
goto unlock;
spin_unlock(ptl);
@@ -551,7 +558,7 @@ static void __copy_gigantic_page(struct page *dst, struct page *src,
}
}
-static void copy_huge_page(struct page *dst, struct page *src)
+void copy_huge_page(struct page *dst, struct page *src)
{
int i;
int nr_pages;
@@ -626,7 +633,10 @@ void migrate_page_states(struct page *newpage, struct page *page)
if (PageSwapCache(page))
ClearPageSwapCache(page);
ClearPagePrivate(page);
- set_page_private(page, 0);
+
+ /* page->private contains hugetlb specific flags */
+ if (!PageHuge(page))
+ set_page_private(page, 0);
/*
* If any waiters have accumulated on the new page then
@@ -1099,7 +1109,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes */
VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
page);
- try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
+ try_to_migrate(page, 0);
page_was_mapped = 1;
}
@@ -1288,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
* page_mapping() set, hugetlbfs specific move page routine will not
* be called and we could leak usage counts for subpools.
*/
- if (page_private(hpage) && !page_mapping(hpage)) {
+ if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
rc = -EBUSY;
goto out_unlock;
}
@@ -1301,7 +1311,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_mapped(hpage)) {
bool mapping_locked = false;
- enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
+ enum ttu_flags ttu = 0;
if (!PageAnon(hpage)) {
/*
@@ -1318,7 +1328,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
ttu |= TTU_RMAP_LOCKED;
}
- try_to_unmap(hpage, ttu);
+ try_to_migrate(hpage, ttu);
page_was_mapped = 1;
if (mapping_locked)
@@ -1418,6 +1428,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
int swapwrite = current->flags & PF_SWAPWRITE;
int rc, nr_subpages;
LIST_HEAD(ret_pages);
+ bool nosplit = (reason == MR_NUMA_MISPLACED);
trace_mm_migrate_pages_start(mode, reason);
@@ -1489,8 +1500,9 @@ retry:
/*
* When memory is low, don't bother to try to migrate
* other pages, just exit.
+ * THP NUMA faulting doesn't split THP to retry.
*/
- if (is_thp) {
+ if (is_thp && !nosplit) {
if (!try_split_thp(page, &page2, from)) {
nr_thp_split++;
goto retry;
@@ -2043,12 +2055,33 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
return newpage;
}
+static struct page *alloc_misplaced_dst_page_thp(struct page *page,
+ unsigned long data)
+{
+ int nid = (int) data;
+ struct page *newpage;
+
+ newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
+ HPAGE_PMD_ORDER);
+ if (!newpage)
+ goto out;
+
+ prep_transhuge_page(newpage);
+
+out:
+ return newpage;
+}
+
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+ /* Do not migrate THP mapped by multiple processes */
+ if (PageTransHuge(page) && total_mapcount(page) > 1)
+ return 0;
+
/* Avoid migrating to a node that is nearly full */
if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
return 0;
@@ -2056,18 +2089,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
if (isolate_lru_page(page))
return 0;
- /*
- * migrate_misplaced_transhuge_page() skips page migration's usual
- * check on page_count(), so we must do it here, now that the page
- * has been isolated: a GUP pin, or any other pin, prevents migration.
- * The expected page count is 3: 1 for page's mapcount and 1 for the
- * caller's pin and 1 for the reference taken by isolate_lru_page().
- */
- if (PageTransHuge(page) && page_count(page) != 3) {
- putback_lru_page(page);
- return 0;
- }
-
page_lru = page_is_file_lru(page);
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
thp_nr_pages(page));
@@ -2081,12 +2102,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
return 1;
}
-bool pmd_trans_migrating(pmd_t pmd)
-{
- struct page *page = pmd_page(pmd);
- return PageLocked(page);
-}
-
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -2099,6 +2114,21 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
int isolated;
int nr_remaining;
LIST_HEAD(migratepages);
+ new_page_t *new;
+ bool compound;
+ unsigned int nr_pages = thp_nr_pages(page);
+
+ /*
+ * PTE mapped THP or HugeTLB page can't reach here so the page could
+ * be either base page or THP. And it must be head page if it is
+ * THP.
+ */
+ compound = PageTransHuge(page);
+
+ if (compound)
+ new = alloc_misplaced_dst_page_thp;
+ else
+ new = alloc_misplaced_dst_page;
/*
* Don't migrate file pages that are mapped in multiple processes
@@ -2120,19 +2150,18 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
goto out;
list_add(&page->lru, &migratepages);
- nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
- NULL, node, MIGRATE_ASYNC,
- MR_NUMA_MISPLACED);
+ nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
+ MIGRATE_ASYNC, MR_NUMA_MISPLACED);
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_lru(page), -nr_pages);
putback_lru_page(page);
}
isolated = 0;
} else
- count_vm_numa_event(NUMA_PAGE_MIGRATE);
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
BUG_ON(!list_empty(&migratepages));
return isolated;
@@ -2141,141 +2170,6 @@ out:
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
-
-#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-/*
- * Migrates a THP to a given target node. page must be locked and is unlocked
- * before returning.
- */
-int migrate_misplaced_transhuge_page(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pmd_t *pmd, pmd_t entry,
- unsigned long address,
- struct page *page, int node)
-{
- spinlock_t *ptl;
- pg_data_t *pgdat = NODE_DATA(node);
- int isolated = 0;
- struct page *new_page = NULL;
- int page_lru = page_is_file_lru(page);
- unsigned long start = address & HPAGE_PMD_MASK;
-
- new_page = alloc_pages_node(node,
- (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
- HPAGE_PMD_ORDER);
- if (!new_page)
- goto out_fail;
- prep_transhuge_page(new_page);
-
- isolated = numamigrate_isolate_page(pgdat, page);
- if (!isolated) {
- put_page(new_page);
- goto out_fail;
- }
-
- /* Prepare a page as a migration target */
- __SetPageLocked(new_page);
- if (PageSwapBacked(page))
- __SetPageSwapBacked(new_page);
-
- /* anon mapping, we can simply copy page->mapping to the new page: */
- new_page->mapping = page->mapping;
- new_page->index = page->index;
- /* flush the cache before copying using the kernel virtual address */
- flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
- migrate_page_copy(new_page, page);
- WARN_ON(PageLRU(new_page));
-
- /* Recheck the target PMD */
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
- spin_unlock(ptl);
-
- /* Reverse changes made by migrate_page_copy() */
- if (TestClearPageActive(new_page))
- SetPageActive(page);
- if (TestClearPageUnevictable(new_page))
- SetPageUnevictable(page);
-
- unlock_page(new_page);
- put_page(new_page); /* Free it */
-
- /* Retake the callers reference and putback on LRU */
- get_page(page);
- putback_lru_page(page);
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
-
- goto out_unlock;
- }
-
- entry = mk_huge_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-
- /*
- * Overwrite the old entry under pagetable lock and establish
- * the new PTE. Any parallel GUP will either observe the old
- * page blocking on the page lock, block on the page table
- * lock or observe the new page. The SetPageUptodate on the
- * new page and page_add_new_anon_rmap guarantee the copy is
- * visible before the pagetable update.
- */
- page_add_anon_rmap(new_page, vma, start, true);
- /*
- * At this point the pmd is numa/protnone (i.e. non present) and the TLB
- * has already been flushed globally. So no TLB can be currently
- * caching this non present pmd mapping. There's no need to clear the
- * pmd before doing set_pmd_at(), nor to flush the TLB after
- * set_pmd_at(). Clearing the pmd here would introduce a race
- * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
- * mmap_lock for reading. If the pmd is set to NULL at any given time,
- * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
- * pmd.
- */
- set_pmd_at(mm, start, pmd, entry);
- update_mmu_cache_pmd(vma, address, &entry);
-
- page_ref_unfreeze(page, 2);
- mlock_migrate_page(new_page, page);
- page_remove_rmap(page, true);
- set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
-
- spin_unlock(ptl);
-
- /* Take an "isolate" reference and put new page on the LRU. */
- get_page(new_page);
- putback_lru_page(new_page);
-
- unlock_page(new_page);
- unlock_page(page);
- put_page(page); /* Drop the rmap reference */
- put_page(page); /* Drop the LRU isolation reference */
-
- count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
- count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
-
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_lru,
- -HPAGE_PMD_NR);
- return isolated;
-
-out_fail:
- count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
- ptl = pmd_lock(mm, pmd);
- if (pmd_same(*pmd, entry)) {
- entry = pmd_modify(entry, vma->vm_page_prot);
- set_pmd_at(mm, start, pmd, entry);
- update_mmu_cache_pmd(vma, address, &entry);
- }
- spin_unlock(ptl);
-
-out_unlock:
- unlock_page(page);
- put_page(page);
- return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DEVICE_PRIVATE
@@ -2400,7 +2294,7 @@ again:
if (!is_device_private_entry(entry))
goto next;
- page = device_private_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
if (!(migrate->flags &
MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
page->pgmap->owner != migrate->pgmap_owner)
@@ -2408,7 +2302,7 @@ again:
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
- if (is_write_device_private_entry(entry))
+ if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
@@ -2454,8 +2348,12 @@ again:
ptep_get_and_clear(mm, addr, ptep);
/* Setup special migration page table entry */
- entry = make_migration_entry(page, mpfn &
- MIGRATE_PFN_WRITE);
+ if (mpfn & MIGRATE_PFN_WRITE)
+ entry = make_writable_migration_entry(
+ page_to_pfn(page));
+ else
+ entry = make_readable_migration_entry(
+ page_to_pfn(page));
swp_pte = swp_entry_to_pte(entry);
if (pte_present(pte)) {
if (pte_soft_dirty(pte))
@@ -2518,8 +2416,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
* that the registered device driver can skip invalidating device
* private page mappings that won't be migrated.
*/
- mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
- migrate->vma->vm_mm, migrate->start, migrate->end,
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
+ migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
@@ -2704,7 +2602,6 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
*/
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
unsigned long addr, i, restore = 0;
@@ -2716,7 +2613,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
continue;
if (page_mapped(page)) {
- try_to_unmap(page, flags);
+ try_to_migrate(page, 0);
if (page_mapped(page))
goto restore;
}
@@ -2928,7 +2825,12 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
if (is_device_private_page(page)) {
swp_entry_t swp_entry;
- swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
entry = swp_entry_to_pte(swp_entry);
} else {
/*
@@ -3025,9 +2927,9 @@ void migrate_vma_pages(struct migrate_vma *migrate)
if (!notified) {
notified = true;
- mmu_notifier_range_init_migrate(&range, 0,
- migrate->vma, migrate->vma->vm_mm,
- addr, migrate->end,
+ mmu_notifier_range_init_owner(&range,
+ MMU_NOTIFY_MIGRATE, 0, migrate->vma,
+ migrate->vma->vm_mm, addr, migrate->end,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
}
diff --git a/mm/mlock.c b/mm/mlock.c
index e338ebc4ad29..0d639bf48794 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -108,7 +108,7 @@ void mlock_vma_page(struct page *page)
/*
* Finish munlock after successful page isolation
*
- * Page must be locked. This is a wrapper for try_to_munlock()
+ * Page must be locked. This is a wrapper for page_mlock()
* and putback_lru_page() with munlock accounting.
*/
static void __munlock_isolated_page(struct page *page)
@@ -118,7 +118,7 @@ static void __munlock_isolated_page(struct page *page)
* and we don't need to check all the other vmas.
*/
if (page_mapcount(page) > 1)
- try_to_munlock(page);
+ page_mlock(page);
/* Did try_to_unlock() succeed or punt? */
if (!PageMlocked(page))
@@ -158,7 +158,7 @@ static void __munlock_isolation_failed(struct page *page)
* munlock()ed or munmap()ed, we want to check whether other vmas hold the
* page locked so that we can leave it on the unevictable lru list and not
* bother vmscan with it. However, to walk the page's rmap list in
- * try_to_munlock() we must isolate the page from the LRU. If some other
+ * page_mlock() we must isolate the page from the LRU. If some other
* task has removed the page from the LRU, we won't be able to do that.
* So we clear the PageMlocked as we might not get another chance. If we
* can't isolate the page, we leave it for putback_lru_page() and vmscan
@@ -168,7 +168,7 @@ unsigned int munlock_vma_page(struct page *page)
{
int nr_pages;
- /* For try_to_munlock() and to serialize with page migration */
+ /* For page_mlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -205,7 +205,7 @@ static int __mlock_posix_error_return(long retval)
*
* The fast path is available only for evictable pages with single mapping.
* Then we can bypass the per-cpu pvec and get better performance.
- * when mapcount > 1 we need try_to_munlock() which can fail.
+ * when mapcount > 1 we need page_mlock() which can fail.
* when !page_evictable(), we need the full redo logic of putback_lru_page to
* avoid leaving evictable page in unevictable list.
*
@@ -414,7 +414,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
*
* We don't save and restore VM_LOCKED here because pages are
* still on lru. In unmap path, pages might be scanned by reclaim
- * and re-mlocked by try_to_{munlock|unmap} before we unmap and
+ * and re-mlocked by page_mlock/try_to_unmap before we unmap and
* free them. This will result in freeing mlocked pages.
*/
void munlock_vma_pages_range(struct vm_area_struct *vma,
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 2ae3f33b85b1..f5852a058ce0 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -153,6 +153,37 @@ static inline void put_memcg_path_buf(void)
rcu_read_unlock();
}
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ do { \
+ const char *memcg_path; \
+ preempt_disable(); \
+ memcg_path = get_mm_memcg_path(mm); \
+ trace_mmap_lock_##type(mm, \
+ memcg_path != NULL ? memcg_path : "", \
+ ##__VA_ARGS__); \
+ if (likely(memcg_path != NULL)) \
+ put_memcg_path_buf(); \
+ preempt_enable(); \
+ } while (0)
+
+#else /* !CONFIG_MEMCG */
+
+int trace_mmap_lock_reg(void)
+{
+ return 0;
+}
+
+void trace_mmap_lock_unreg(void)
+{
+}
+
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
+
+#endif /* CONFIG_MEMCG */
+
+#ifdef CONFIG_TRACING
+#ifdef CONFIG_MEMCG
/*
* Write the given mm_struct's memcg path to a percpu buffer, and return a
* pointer to it. If the path cannot be determined, or no buffer was available
@@ -187,33 +218,6 @@ out:
return buf;
}
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- do { \
- const char *memcg_path; \
- local_lock(&memcg_paths.lock); \
- memcg_path = get_mm_memcg_path(mm); \
- trace_mmap_lock_##type(mm, \
- memcg_path != NULL ? memcg_path : "", \
- ##__VA_ARGS__); \
- if (likely(memcg_path != NULL)) \
- put_memcg_path_buf(); \
- local_unlock(&memcg_paths.lock); \
- } while (0)
-
-#else /* !CONFIG_MEMCG */
-
-int trace_mmap_lock_reg(void)
-{
- return 0;
-}
-
-void trace_mmap_lock_unreg(void)
-{
-}
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
-
#endif /* CONFIG_MEMCG */
/*
@@ -239,3 +243,4 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
TRACE_MMAP_LOCK_EVENT(released, mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
+#endif /* CONFIG_TRACING */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e7a443157988..883e2cc85cad 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -143,26 +143,36 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry = pte_to_swp_entry(oldpte);
pte_t newpte;
- if (is_write_migration_entry(entry)) {
+ if (is_writable_migration_entry(entry)) {
/*
* A protection check is difficult so
* just be safe and disable write
*/
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
- } else if (is_write_device_private_entry(entry)) {
+ } else if (is_writable_device_private_entry(entry)) {
/*
* We do not preserve soft-dirtiness. See
* copy_one_pte() for explanation.
*/
- make_device_private_entry_read(&entry);
+ entry = make_readable_device_private_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (is_writable_device_exclusive_entry(entry)) {
+ entry = make_readable_device_exclusive_entry(
+ swp_offset(entry));
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(oldpte))
+ newpte = pte_swp_mksoft_dirty(newpte);
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
} else {
newpte = oldpte;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index affda71641ca..3a93d4054810 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -223,7 +223,7 @@ long vread(char *buf, char *addr, unsigned long count)
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
+ return __vmalloc(size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc);
@@ -241,7 +241,7 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+ return __vmalloc(size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vzalloc);
@@ -1501,7 +1501,6 @@ erase_whole_vma:
delete_vma(mm, vma);
return 0;
}
-EXPORT_SYMBOL(do_munmap);
int vm_munmap(unsigned long addr, size_t len)
{
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eefd3f5fde46..fcc29e9a3064 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -104,7 +104,7 @@ static bool oom_cpuset_eligible(struct task_struct *start,
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
- ret = mempolicy_nodemask_intersects(tsk, mask);
+ ret = mempolicy_in_oom_domain(tsk, mask);
} else {
/*
* This is not a mempolicy constrained oom, so only
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0817d88383d5..d6e94cc8066c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -749,7 +749,6 @@ void prep_compound_page(struct page *page, unsigned int order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- set_page_count(p, 0);
p->mapping = TAIL_MAPPING;
set_compound_head(p, page);
}
@@ -3193,7 +3192,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
int cpu;
/*
- * Allocate in the BSS so we wont require allocation in
+ * Allocate in the BSS so we won't require allocation in
* direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
*/
static cpumask_t cpus_with_pcps;
@@ -3832,7 +3831,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
-noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index a4435311754b..f7b331081791 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -41,7 +41,8 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
/* Handle un-addressable ZONE_DEVICE memory */
entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_device_private_entry(entry))
+ if (!is_device_private_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
} else if (!pte_present(*pvmw->pte))
return false;
@@ -93,19 +94,21 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
return false;
entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_migration_entry(entry))
+ if (!is_migration_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
- pfn = migration_entry_to_pfn(entry);
+ pfn = swp_offset(entry);
} else if (is_swap_pte(*pvmw->pte)) {
swp_entry_t entry;
/* Handle un-addressable ZONE_DEVICE memory */
entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_device_private_entry(entry))
+ if (!is_device_private_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
- pfn = device_private_entry_to_pfn(entry);
+ pfn = swp_offset(entry);
} else {
if (!pte_present(*pvmw->pte))
return false;
@@ -233,7 +236,7 @@ restart:
return not_found(pvmw);
entry = pmd_to_swp_entry(pmde);
if (!is_migration_entry(entry) ||
- migration_entry_to_page(entry) != page)
+ pfn_swap_entry_to_page(entry) != page)
return not_found(pvmw);
return true;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index e05c300048e6..37c24672125c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1405,24 +1405,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and page_remove_rmap(),
- * try_to_unmap() may return false when it is about to become true,
+ * try_to_unmap() may return before page_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
- /* munlock has nothing to gain from examining un-locked vmas */
- if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
- return true;
-
- if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
- is_zone_device_page(page) && !is_device_private_page(page))
- return true;
-
- if (flags & TTU_SPLIT_HUGE_PMD) {
- split_huge_pmd_address(vma, address,
- flags & TTU_SPLIT_FREEZE, page);
- }
+ if (flags & TTU_SPLIT_HUGE_PMD)
+ split_huge_pmd_address(vma, address, false, page);
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
@@ -1447,16 +1437,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- /* PMD-mapped THP migration entry */
- if (!pvmw.pte && (flags & TTU_MIGRATION)) {
- VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
-
- set_pmd_migration_entry(&pvmw, page);
- continue;
- }
-#endif
-
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
@@ -1476,8 +1456,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
- if (flags & TTU_MUNLOCK)
- continue;
}
/* Unexpected PMD-mapped THP? */
@@ -1520,46 +1498,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
}
- if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & TTU_MIGRATION) &&
- is_zone_device_page(page)) {
- swp_entry_t entry;
- pte_t swp_pte;
-
- pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = make_migration_entry(page, 0);
- swp_pte = swp_entry_to_pte(entry);
-
- /*
- * pteval maps a zone device page and is therefore
- * a swap pte.
- */
- if (pte_swp_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- *
- * The assignment to subpage above was computed from a
- * swap PTE which results in an invalid pointer.
- * Since only PAGE_SIZE pages can currently be
- * migrated, just set it to page. This will need to be
- * changed when hugepage migrations to device private
- * memory are supported.
- */
- subpage = page;
- goto discard;
- }
-
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
if (should_defer_flush(mm, flags)) {
@@ -1612,35 +1550,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
- } else if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
- swp_entry_t entry;
- pte_t swp_pte;
-
- if (arch_unmap_one(mm, vma, address, pteval) < 0) {
- set_pte_at(mm, address, pvmw.pte, pteval);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = make_migration_entry(subpage,
- pte_write(pteval));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, address, pvmw.pte, swp_pte);
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- */
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(subpage) };
pte_t swp_pte;
@@ -1756,9 +1665,10 @@ static int page_not_mapped(struct page *page)
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
*
- * If unmap is successful, return true. Otherwise, false.
+ * It is the caller's responsibility to check if the page is still
+ * mapped when needed (use TTU_SYNC to prevent accounting races).
*/
-bool try_to_unmap(struct page *page, enum ttu_flags flags)
+void try_to_unmap(struct page *page, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
@@ -1767,6 +1677,277 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
.anon_lock = page_lock_anon_vma_read,
};
+ if (flags & TTU_RMAP_LOCKED)
+ rmap_walk_locked(page, &rwc);
+ else
+ rmap_walk(page, &rwc);
+}
+
+/*
+ * @arg: enum ttu_flags will be passed to this argument.
+ *
+ * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
+ * containing migration entries. This and TTU_RMAP_LOCKED are the only supported
+ * flags.
+ */
+static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
+ pte_t pteval;
+ struct page *subpage;
+ bool ret = true;
+ struct mmu_notifier_range range;
+ enum ttu_flags flags = (enum ttu_flags)(long)arg;
+
+ if (is_zone_device_page(page) && !is_device_private_page(page))
+ return true;
+
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_migrate() may return before page_mapped() has become false,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ if (flags & TTU_SYNC)
+ pvmw.flags = PVMW_SYNC;
+
+ /*
+ * unmap_page() in mm/huge_memory.c is the only user of migration with
+ * TTU_SPLIT_HUGE_PMD and it wants to freeze.
+ */
+ if (flags & TTU_SPLIT_HUGE_PMD)
+ split_huge_pmd_address(vma, address, true, page);
+
+ /*
+ * For THP, we have to assume the worse case ie pmd for invalidation.
+ * For hugetlb, it could be much worse if we need to do pud
+ * invalidation in the case of pmd sharing.
+ *
+ * Note that the page can not be free in this function as call of
+ * try_to_unmap() must hold a reference on the page.
+ */
+ range.end = PageKsm(page) ?
+ address + PAGE_SIZE : vma_address_end(page, vma);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address, range.end);
+ if (PageHuge(page)) {
+ /*
+ * If sharing is possible, start and end will be adjusted
+ * accordingly.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
+ }
+ mmu_notifier_invalidate_range_start(&range);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte) {
+ VM_BUG_ON_PAGE(PageHuge(page) ||
+ !PageTransCompound(page), page);
+
+ set_pmd_migration_entry(&pvmw, page);
+ continue;
+ }
+#endif
+
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ address = pvmw.address;
+
+ if (PageHuge(page) && !PageAnon(page)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+ /*
+ * huge_pmd_unshare unmapped an entire PMD
+ * page. There is no way of knowing exactly
+ * which PMDs may be cached for this mm, so
+ * we must flush them all. start/end were
+ * already adjusted above to cover this range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+
+ /* Move the dirty bit to the page. Now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
+ if (is_zone_device_page(page)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_readable_migration_entry(
+ page_to_pfn(page));
+ swp_pte = swp_entry_to_pte(entry);
+
+ /*
+ * pteval maps a zone device page and is therefore
+ * a swap pte.
+ */
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ /*
+ * No need to invalidate here it will synchronize on
+ * against the special swap migration pte.
+ *
+ * The assignment to subpage above was computed from a
+ * swap PTE which results in an invalid pointer.
+ * Since only PAGE_SIZE pages can currently be
+ * migrated, just set it to page. This will need to be
+ * changed when hugepage migrations to device private
+ * memory are supported.
+ */
+ subpage = page;
+ } else if (PageHWPoison(page)) {
+ pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
+ if (PageHuge(page)) {
+ hugetlb_count_sub(compound_nr(page), mm);
+ set_huge_swap_pte_at(mm, address,
+ pvmw.pte, pteval,
+ vma_mmu_pagesize(vma));
+ } else {
+ dec_mm_counter(mm, mm_counter(page));
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ }
+
+ } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ /*
+ * The guest indicated that the page content is of no
+ * interest anymore. Simply discard the pte, vmscan
+ * will take care of the rest.
+ * A future reference will then fault in a new zero
+ * page. When userfaultfd is active, we must not drop
+ * this page though, as its main user (postcopy
+ * migration) will not expect userfaults on already
+ * copied pages.
+ */
+ dec_mm_counter(mm, mm_counter(page));
+ /* We have to invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
+ } else {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ if (pte_write(pteval))
+ entry = make_writable_migration_entry(
+ page_to_pfn(subpage));
+ else
+ entry = make_readable_migration_entry(
+ page_to_pfn(subpage));
+
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+ /*
+ * No need to invalidate here it will synchronize on
+ * against the special swap migration pte.
+ */
+ }
+
+ /*
+ * No need to call mmu_notifier_invalidate_range() it has be
+ * done above for all cases requiring it to happen under page
+ * table lock before mmu_notifier_invalidate_range_end()
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ page_remove_rmap(subpage, PageHuge(page));
+ put_page(page);
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return ret;
+}
+
+/**
+ * try_to_migrate - try to replace all page table mappings with swap entries
+ * @page: the page to replace page table entries for
+ * @flags: action and flags
+ *
+ * Tries to remove all the page table entries which are mapping this page and
+ * replace them with special swap entries. Caller must hold the page lock.
+ *
+ * If is successful, return true. Otherwise, false.
+ */
+void try_to_migrate(struct page *page, enum ttu_flags flags)
+{
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_migrate_one,
+ .arg = (void *)flags,
+ .done = page_not_mapped,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+
+ /*
+ * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
+ * TTU_SPLIT_HUGE_PMD and TTU_SYNC flags.
+ */
+ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
+ TTU_SYNC)))
+ return;
+
/*
* During exec, a temporary VMA is setup and later moved.
* The VMA is moved under the anon_vma lock but not the
@@ -1775,38 +1956,67 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
* locking requirements of exec(), migration skips
* temporary VMAs until after exec() completes.
*/
- if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
- && !PageKsm(page) && PageAnon(page))
+ if (!PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)
rmap_walk_locked(page, &rwc);
else
rmap_walk(page, &rwc);
+}
- /*
- * When racing against e.g. zap_pte_range() on another cpu,
- * in between its ptep_get_and_clear_full() and page_remove_rmap(),
- * try_to_unmap() may return false when it is about to become true,
- * if page table locking is skipped: use TTU_SYNC to wait for that.
- */
- return !page_mapcount(page);
+/*
+ * Walks the vma's mapping a page and mlocks the page if any locked vma's are
+ * found. Once one is found the page is locked and the scan can be terminated.
+ */
+static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *unused)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
+
+ /* An un-locked vma doesn't have any pages to lock, continue the scan */
+ if (!(vma->vm_flags & VM_LOCKED))
+ return true;
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ /*
+ * Need to recheck under the ptl to serialise with
+ * __munlock_pagevec_fill() after VM_LOCKED is cleared in
+ * munlock_vma_pages_range().
+ */
+ if (vma->vm_flags & VM_LOCKED) {
+ /* PTE-mapped THP are never mlocked */
+ if (!PageTransCompound(page))
+ mlock_vma_page(page);
+ page_vma_mapped_walk_done(&pvmw);
+ }
+
+ /*
+ * no need to continue scanning other vma's if the page has
+ * been locked.
+ */
+ return false;
+ }
+
+ return true;
}
/**
- * try_to_munlock - try to munlock a page
- * @page: the page to be munlocked
+ * page_mlock - try to mlock a page
+ * @page: the page to be mlocked
*
- * Called from munlock code. Checks all of the VMAs mapping the page
- * to make sure nobody else has this page mlocked. The page will be
- * returned with PG_mlocked cleared if no other vmas have it mlocked.
+ * Called from munlock code. Checks all of the VMAs mapping the page and mlocks
+ * the page if any are found. The page will be returned with PG_mlocked cleared
+ * if it is not mapped by any locked vmas.
*/
-
-void try_to_munlock(struct page *page)
+void page_mlock(struct page *page)
{
struct rmap_walk_control rwc = {
- .rmap_one = try_to_unmap_one,
- .arg = (void *)TTU_MUNLOCK,
+ .rmap_one = page_mlock_one,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
@@ -1818,6 +2028,192 @@ void try_to_munlock(struct page *page)
rmap_walk(page, &rwc);
}
+#ifdef CONFIG_DEVICE_PRIVATE
+struct make_exclusive_args {
+ struct mm_struct *mm;
+ unsigned long address;
+ void *owner;
+ bool valid;
+};
+
+static bool page_make_device_exclusive_one(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, void *priv)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
+ struct make_exclusive_args *args = priv;
+ pte_t pteval;
+ struct page *subpage;
+ bool ret = true;
+ struct mmu_notifier_range range;
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+ vma->vm_mm, address, min(vma->vm_end,
+ address + page_size(page)), args->owner);
+ mmu_notifier_invalidate_range_start(&range);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+ if (!pte_present(*pvmw.pte)) {
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ address = pvmw.address;
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+
+ /* Move the dirty bit to the page. Now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ /*
+ * Check that our target page is still mapped at the expected
+ * address.
+ */
+ if (args->mm == mm && args->address == address &&
+ pte_write(pteval))
+ args->valid = true;
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ if (pte_write(pteval))
+ entry = make_writable_device_exclusive_entry(
+ page_to_pfn(subpage));
+ else
+ entry = make_readable_device_exclusive_entry(
+ page_to_pfn(subpage));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+
+ /*
+ * There is a reference on the page for the swap entry which has
+ * been removed, so shouldn't take another.
+ */
+ page_remove_rmap(subpage, false);
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return ret;
+}
+
+/**
+ * page_make_device_exclusive - mark the page exclusively owned by a device
+ * @page: the page to replace page table entries for
+ * @mm: the mm_struct where the page is expected to be mapped
+ * @address: address where the page is expected to be mapped
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
+ *
+ * Tries to remove all the page table entries which are mapping this page and
+ * replace them with special device exclusive swap entries to grant a device
+ * exclusive access to the page. Caller must hold the page lock.
+ *
+ * Returns false if the page is still mapped, or if it could not be unmapped
+ * from the expected address. Otherwise returns true (success).
+ */
+static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm,
+ unsigned long address, void *owner)
+{
+ struct make_exclusive_args args = {
+ .mm = mm,
+ .address = address,
+ .owner = owner,
+ .valid = false,
+ };
+ struct rmap_walk_control rwc = {
+ .rmap_one = page_make_device_exclusive_one,
+ .done = page_not_mapped,
+ .anon_lock = page_lock_anon_vma_read,
+ .arg = &args,
+ };
+
+ /*
+ * Restrict to anonymous pages for now to avoid potential writeback
+ * issues. Also tail pages shouldn't be passed to rmap_walk so skip
+ * those.
+ */
+ if (!PageAnon(page) || PageTail(page))
+ return false;
+
+ rmap_walk(page, &rwc);
+
+ return args.valid && !page_mapcount(page);
+}
+
+/**
+ * make_device_exclusive_range() - Mark a range for exclusive use by a device
+ * @mm: mm_struct of assoicated target process
+ * @start: start of the region to mark for exclusive device access
+ * @end: end address of region
+ * @pages: returns the pages which were successfully marked for exclusive access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ *
+ * Returns: number of pages found in the range by GUP. A page is marked for
+ * exclusive access only if the page pointer is non-NULL.
+ *
+ * This function finds ptes mapping page(s) to the given address range, locks
+ * them and replaces mappings with special swap entries preventing userspace CPU
+ * access. On fault these entries are replaced with the original mapping after
+ * calling MMU notifiers.
+ *
+ * A driver using this to program access from a device must use a mmu notifier
+ * critical section to hold a device specific lock during programming. Once
+ * programming is complete it should drop the page lock and reference after
+ * which point CPU access to the page will revoke the exclusive access.
+ */
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct page **pages,
+ void *owner)
+{
+ long npages = (end - start) >> PAGE_SHIFT;
+ long i;
+
+ npages = get_user_pages_remote(mm, start, npages,
+ FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+ pages, NULL, NULL);
+ if (npages < 0)
+ return npages;
+
+ for (i = 0; i < npages; i++, start += PAGE_SIZE) {
+ if (!trylock_page(pages[i])) {
+ put_page(pages[i]);
+ pages[i] = NULL;
+ continue;
+ }
+
+ if (!page_make_device_exclusive(pages[i], mm, start, owner)) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ pages[i] = NULL;
+ }
+ }
+
+ return npages;
+}
+EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+#endif
+
void __put_anon_vma(struct anon_vma *anon_vma)
{
struct anon_vma *root = anon_vma->root;
@@ -1858,7 +2254,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
*
- * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
+ * When called from page_mlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
@@ -1911,7 +2307,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
*
- * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
+ * When called from page_mlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
diff --git a/mm/shmem.c b/mm/shmem.c
index 6268b9b4e41a..70d9ce294bb4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1797,7 +1797,7 @@ unlock:
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * vmf and fault_type are only supplied by shmem_fault:
+ * vma, vmf, and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1832,6 +1832,16 @@ repeat:
page = pagecache_get_page(mapping, index,
FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
+
+ if (page && vma && userfaultfd_minor(vma)) {
+ if (!xa_is_value(page)) {
+ unlock_page(page);
+ put_page(page);
+ }
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
+ return 0;
+ }
+
if (xa_is_value(page)) {
error = shmem_swapin_page(inode, index, &page,
sgp, gfp, vma, fault_type);
@@ -2352,27 +2362,25 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
return inode;
}
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- bool zeropage,
- struct page **pagep)
+#ifdef CONFIG_USERFAULTFD
+int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ bool zeropage,
+ struct page **pagep)
{
struct inode *inode = file_inode(dst_vma->vm_file);
struct shmem_inode_info *info = SHMEM_I(inode);
struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
- spinlock_t *ptl;
void *page_kaddr;
struct page *page;
- pte_t _dst_pte, *dst_pte;
int ret;
- pgoff_t offset, max_off;
+ pgoff_t max_off;
- ret = -ENOMEM;
if (!shmem_inode_acct_block(inode, 1)) {
/*
* We may have got a page, returned -ENOENT triggering a retry,
@@ -2383,15 +2391,16 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
put_page(*pagep);
*pagep = NULL;
}
- goto out;
+ return -ENOMEM;
}
if (!*pagep) {
+ ret = -ENOMEM;
page = shmem_alloc_page(gfp, info, pgoff);
if (!page)
goto out_unacct_blocks;
- if (!zeropage) { /* mcopy_atomic */
+ if (!zeropage) { /* COPY */
page_kaddr = kmap_atomic(page);
ret = copy_from_user(page_kaddr,
(const void __user *)src_addr,
@@ -2401,11 +2410,11 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
*pagep = page;
- shmem_inode_unacct_blocks(inode, 1);
+ ret = -ENOENT;
/* don't free the page */
- return -ENOENT;
+ goto out_unacct_blocks;
}
- } else { /* mfill_zeropage_atomic */
+ } else { /* ZEROPAGE */
clear_highpage(page);
}
} else {
@@ -2413,15 +2422,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
*pagep = NULL;
}
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
+ VM_BUG_ON(PageLocked(page));
+ VM_BUG_ON(PageSwapBacked(page));
__SetPageLocked(page);
__SetPageSwapBacked(page);
__SetPageUptodate(page);
ret = -EFAULT;
- offset = linear_page_index(dst_vma, dst_addr);
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
+ if (unlikely(pgoff >= max_off))
goto out_release;
ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
@@ -2429,32 +2438,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (ret)
goto out_release;
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- if (dst_vma->vm_flags & VM_WRITE)
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
- else {
- /*
- * We don't set the pte dirty if the vma has no
- * VM_WRITE permission, so mark the page dirty or it
- * could be freed from under us. We could do it
- * unconditionally before unlock_page(), but doing it
- * only if VM_WRITE is not set is faster.
- */
- set_page_dirty(page);
- }
-
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
-
- ret = -EFAULT;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
- goto out_release_unlock;
-
- ret = -EEXIST;
- if (!pte_none(*dst_pte))
- goto out_release_unlock;
-
- lru_cache_add(page);
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, true, false);
+ if (ret)
+ goto out_delete_from_cache;
spin_lock_irq(&info->lock);
info->alloced++;
@@ -2462,50 +2449,19 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
- inc_mm_counter(dst_mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
- pte_unmap_unlock(dst_pte, ptl);
+ SetPageDirty(page);
unlock_page(page);
- ret = 0;
-out:
- return ret;
-out_release_unlock:
- pte_unmap_unlock(dst_pte, ptl);
- ClearPageDirty(page);
+ return 0;
+out_delete_from_cache:
delete_from_page_cache(page);
out_release:
unlock_page(page);
put_page(page);
out_unacct_blocks:
shmem_inode_unacct_blocks(inode, 1);
- goto out;
-}
-
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep)
-{
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, false, pagep);
-}
-
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
-{
- struct page *page = NULL;
-
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, 0, true, &page);
+ return ret;
}
+#endif /* CONFIG_USERFAULTFD */
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
@@ -4040,8 +3996,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
loff_t i_size;
pgoff_t off;
- if ((vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ if (!transhuge_vma_enabled(vma, vma->vm_flags))
return false;
if (shmem_huge == SHMEM_HUGE_FORCE)
return true;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 16183d85a7d5..bdce883f9286 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,8 +27,362 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/pgtable.h>
+#include <linux/bootmem_info.h>
+
#include <asm/dma.h>
#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+/**
+ * struct vmemmap_remap_walk - walk vmemmap page table
+ *
+ * @remap_pte: called for each lowest-level entry (PTE).
+ * @nr_walked: the number of walked pte.
+ * @reuse_page: the page which is reused for the tail vmemmap pages.
+ * @reuse_addr: the virtual address of the @reuse_page page.
+ * @vmemmap_pages: the list head of the vmemmap pages that can be freed
+ * or is mapped from.
+ */
+struct vmemmap_remap_walk {
+ void (*remap_pte)(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk);
+ unsigned long nr_walked;
+ struct page *reuse_page;
+ unsigned long reuse_addr;
+ struct list_head *vmemmap_pages;
+};
+
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
+ struct vmemmap_remap_walk *walk)
+{
+ pmd_t __pmd;
+ int i;
+ unsigned long addr = start;
+ struct page *page = pmd_page(*pmd);
+ pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
+
+ if (!pgtable)
+ return -ENOMEM;
+
+ pmd_populate_kernel(&init_mm, &__pmd, pgtable);
+
+ for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ pgprot_t pgprot = PAGE_KERNEL;
+
+ entry = mk_pte(page + i, pgprot);
+ pte = pte_offset_kernel(&__pmd, addr);
+ set_pte_at(&init_mm, addr, pte, entry);
+ }
+
+ /* Make pte visible before pmd. See comment in __pte_alloc(). */
+ smp_wmb();
+ pmd_populate_kernel(&init_mm, pmd, pgtable);
+
+ flush_tlb_kernel_range(start, start + PMD_SIZE);
+
+ return 0;
+}
+
+static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pte_t *pte = pte_offset_kernel(pmd, addr);
+
+ /*
+ * The reuse_page is found 'first' in table walk before we start
+ * remapping (which is calling @walk->remap_pte).
+ */
+ if (!walk->reuse_page) {
+ walk->reuse_page = pte_page(*pte);
+ /*
+ * Because the reuse address is part of the range that we are
+ * walking, skip the reuse address range.
+ */
+ addr += PAGE_SIZE;
+ pte++;
+ walk->nr_walked++;
+ }
+
+ for (; addr != end; addr += PAGE_SIZE, pte++) {
+ walk->remap_pte(pte, addr, walk);
+ walk->nr_walked++;
+ }
+}
+
+static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ if (pmd_leaf(*pmd)) {
+ int ret;
+
+ ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
+ if (ret)
+ return ret;
+ }
+ next = pmd_addr_end(addr, end);
+ vmemmap_pte_range(pmd, addr, next, walk);
+ } while (pmd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(p4d, addr);
+ do {
+ int ret;
+
+ next = pud_addr_end(addr, end);
+ ret = vmemmap_pmd_range(pud, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pud++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ int ret;
+
+ next = p4d_addr_end(addr, end);
+ ret = vmemmap_pud_range(p4d, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (p4d++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_remap_range(unsigned long start, unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ unsigned long addr = start;
+ unsigned long next;
+ pgd_t *pgd;
+
+ VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
+ VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
+
+ pgd = pgd_offset_k(addr);
+ do {
+ int ret;
+
+ next = pgd_addr_end(addr, end);
+ ret = vmemmap_p4d_range(pgd, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pgd++, addr = next, addr != end);
+
+ /*
+ * We only change the mapping of the vmemmap virtual address range
+ * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
+ * belongs to the range.
+ */
+ flush_tlb_kernel_range(start + PAGE_SIZE, end);
+
+ return 0;
+}
+
+/*
+ * Free a vmemmap page. A vmemmap page can be allocated from the memblock
+ * allocator or buddy allocator. If the PG_reserved flag is set, it means
+ * that it allocated from the memblock allocator, just free it via the
+ * free_bootmem_page(). Otherwise, use __free_page().
+ */
+static inline void free_vmemmap_page(struct page *page)
+{
+ if (PageReserved(page))
+ free_bootmem_page(page);
+ else
+ __free_page(page);
+}
+
+/* Free a list of the vmemmap pages */
+static void free_vmemmap_page_list(struct list_head *list)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ list_del(&page->lru);
+ free_vmemmap_page(page);
+ }
+}
+
+static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ /*
+ * Remap the tail pages as read-only to catch illegal write operation
+ * to the tail pages.
+ */
+ pgprot_t pgprot = PAGE_KERNEL_RO;
+ pte_t entry = mk_pte(walk->reuse_page, pgprot);
+ struct page *page = pte_page(*pte);
+
+ list_add_tail(&page->lru, walk->vmemmap_pages);
+ set_pte_at(&init_mm, addr, pte, entry);
+}
+
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ pgprot_t pgprot = PAGE_KERNEL;
+ struct page *page;
+ void *to;
+
+ BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+ page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+ list_del(&page->lru);
+ to = page_to_virt(page);
+ copy_page(to, (void *)walk->reuse_addr);
+
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+/**
+ * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
+ * to the page which @reuse is mapped to, then free vmemmap
+ * which the range are mapped to.
+ * @start: start address of the vmemmap virtual address range that we want
+ * to remap.
+ * @end: end address of the vmemmap virtual address range that we want to
+ * remap.
+ * @reuse: reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int vmemmap_remap_free(unsigned long start, unsigned long end,
+ unsigned long reuse)
+{
+ int ret;
+ LIST_HEAD(vmemmap_pages);
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = vmemmap_remap_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ /*
+ * In order to make remapping routine most efficient for the huge pages,
+ * the routine of vmemmap page table walking has the following rules
+ * (see more details from the vmemmap_pte_range()):
+ *
+ * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
+ * should be continuous.
+ * - The @reuse address is part of the range [@reuse, @end) that we are
+ * walking which is passed to vmemmap_remap_range().
+ * - The @reuse address is the first in the complete range.
+ *
+ * So we need to make sure that @start and @reuse meet the above rules.
+ */
+ BUG_ON(start - reuse != PAGE_SIZE);
+
+ mmap_write_lock(&init_mm);
+ ret = vmemmap_remap_range(reuse, end, &walk);
+ mmap_write_downgrade(&init_mm);
+
+ if (ret && walk.nr_walked) {
+ end = reuse + walk.nr_walked * PAGE_SIZE;
+ /*
+ * vmemmap_pages contains pages from the previous
+ * vmemmap_remap_range call which failed. These
+ * are pages which were removed from the vmemmap.
+ * They will be restored in the following call.
+ */
+ walk = (struct vmemmap_remap_walk) {
+ .remap_pte = vmemmap_restore_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ vmemmap_remap_range(reuse, end, &walk);
+ }
+ mmap_read_unlock(&init_mm);
+
+ free_vmemmap_page_list(&vmemmap_pages);
+
+ return ret;
+}
+
+static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+ gfp_t gfp_mask, struct list_head *list)
+{
+ unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
+ int nid = page_to_nid((struct page *)start);
+ struct page *page, *next;
+
+ while (nr_pages--) {
+ page = alloc_pages_node(nid, gfp_mask, 0);
+ if (!page)
+ goto out;
+ list_add_tail(&page->lru, list);
+ }
+
+ return 0;
+out:
+ list_for_each_entry_safe(page, next, list, lru)
+ __free_pages(page, 0);
+ return -ENOMEM;
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
+ * to the page which is from the @vmemmap_pages
+ * respectively.
+ * @start: start address of the vmemmap virtual address range that we want
+ * to remap.
+ * @end: end address of the vmemmap virtual address range that we want to
+ * remap.
+ * @reuse: reuse address.
+ * @gfp_mask: GFP flag for allocating vmemmap pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+ unsigned long reuse, gfp_t gfp_mask)
+{
+ LIST_HEAD(vmemmap_pages);
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = vmemmap_restore_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ /* See the comment in the vmemmap_remap_free(). */
+ BUG_ON(start - reuse != PAGE_SIZE);
+
+ if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+ return -ENOMEM;
+
+ mmap_read_lock(&init_mm);
+ vmemmap_remap_range(reuse, end, &walk);
+ mmap_read_unlock(&init_mm);
+
+ return 0;
+}
/*
* Allocate a block of memory to be used to back the virtual memory map
diff --git a/mm/sparse.c b/mm/sparse.c
index 7272f7a1449d..6326cdf36c4f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -13,6 +13,7 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/bootmem_info.h>
#include "internal.h"
#include <asm/dma.h>
diff --git a/mm/swap.c b/mm/swap.c
index 6c11db780467..19600430e536 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -554,7 +554,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
} else {
/*
* The page's writeback ends up during pagevec
- * We moves tha page into tail of inactive.
+ * We move that page into tail of inactive.
*/
add_page_to_lru_list_tail(page, lruvec);
__count_vm_events(PGROTATED, nr_pages);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e898c879a434..1e07d1c776f2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2967,7 +2967,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
return 0;
}
- /* swap partition endianess hack... */
+ /* swap partition endianness hack... */
if (swab32(swap_header->info.version) == 1) {
swab32s(&swap_header->info.version);
swab32s(&swap_header->info.last_page);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 63a73e164d55..0e2132834bc7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -48,6 +48,78 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
return dst_vma;
}
+/*
+ * Install PTEs, to map dst_addr (within dst_vma) to page.
+ *
+ * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
+ * and anon, and for both shared and private VMAs.
+ */
+int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, struct page *page,
+ bool newly_allocated, bool wp_copy)
+{
+ int ret;
+ pte_t _dst_pte, *dst_pte;
+ bool writable = dst_vma->vm_flags & VM_WRITE;
+ bool vm_shared = dst_vma->vm_flags & VM_SHARED;
+ bool page_in_cache = page->mapping;
+ spinlock_t *ptl;
+ struct inode *inode;
+ pgoff_t offset, max_off;
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ if (page_in_cache && !vm_shared)
+ writable = false;
+ if (writable || !page_in_cache)
+ _dst_pte = pte_mkdirty(_dst_pte);
+ if (writable) {
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
+ else
+ _dst_pte = pte_mkwrite(_dst_pte);
+ }
+
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+
+ if (vma_is_shmem(dst_vma)) {
+ /* serialize against truncate with the page table lock */
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ ret = -EFAULT;
+ if (unlikely(offset >= max_off))
+ goto out_unlock;
+ }
+
+ ret = -EEXIST;
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+
+ if (page_in_cache)
+ page_add_file_rmap(page, false);
+ else
+ page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+
+ /*
+ * Must happen after rmap, as mm_counter() checks mapping (via
+ * PageAnon()), which is set by __page_set_anon_rmap().
+ */
+ inc_mm_counter(dst_mm, mm_counter(page));
+
+ if (newly_allocated)
+ lru_cache_add_inactive_or_unevictable(page, dst_vma);
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ return ret;
+}
+
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
@@ -56,13 +128,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep,
bool wp_copy)
{
- pte_t _dst_pte, *dst_pte;
- spinlock_t *ptl;
void *page_kaddr;
int ret;
struct page *page;
- pgoff_t offset, max_off;
- struct inode *inode;
if (!*pagep) {
ret = -ENOMEM;
@@ -99,43 +167,12 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
goto out_release;
- _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
- if (dst_vma->vm_flags & VM_WRITE) {
- if (wp_copy)
- _dst_pte = pte_mkuffd_wp(_dst_pte);
- else
- _dst_pte = pte_mkwrite(_dst_pte);
- }
-
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
- if (dst_vma->vm_file) {
- /* the shmem MAP_PRIVATE case requires checking the i_size */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_release_uncharge_unlock;
- }
- ret = -EEXIST;
- if (!pte_none(*dst_pte))
- goto out_release_uncharge_unlock;
-
- inc_mm_counter(dst_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
-
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
-
- pte_unmap_unlock(dst_pte, ptl);
- ret = 0;
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, true, wp_copy);
+ if (ret)
+ goto out_release;
out:
return ret;
-out_release_uncharge_unlock:
- pte_unmap_unlock(dst_pte, ptl);
out_release:
put_page(page);
goto out;
@@ -176,6 +213,41 @@ out_unlock:
return ret;
}
+/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
+static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ bool wp_copy)
+{
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct page *page;
+ int ret;
+
+ ret = shmem_getpage(inode, pgoff, &page, SGP_READ);
+ if (ret)
+ goto out;
+ if (!page) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, false, wp_copy);
+ if (ret)
+ goto out_release;
+
+ unlock_page(page);
+ ret = 0;
+out:
+ return ret;
+out_release:
+ unlock_page(page);
+ put_page(page);
+ goto out;
+}
+
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
@@ -209,7 +281,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long len,
enum mcopy_atomic_mode mode)
{
- int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
@@ -308,7 +379,6 @@ retry:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
- vm_alloc_shared = vm_shared;
cond_resched();
@@ -346,54 +416,8 @@ retry:
out_unlock:
mmap_read_unlock(dst_mm);
out:
- if (page) {
- /*
- * We encountered an error and are about to free a newly
- * allocated huge page.
- *
- * Reservation handling is very subtle, and is different for
- * private and shared mappings. See the routine
- * restore_reserve_on_error for details. Unfortunately, we
- * can not call restore_reserve_on_error now as it would
- * require holding mmap_lock.
- *
- * If a reservation for the page existed in the reservation
- * map of a private mapping, the map was modified to indicate
- * the reservation was consumed when the page was allocated.
- * We clear the HPageRestoreReserve flag now so that the global
- * reserve count will not be incremented in free_huge_page.
- * The reservation map will still indicate the reservation
- * was consumed and possibly prevent later page allocation.
- * This is better than leaking a global reservation. If no
- * reservation existed, it is still safe to clear
- * HPageRestoreReserve as no adjustments to reservation counts
- * were made during allocation.
- *
- * The reservation map for shared mappings indicates which
- * pages have reservations. When a huge page is allocated
- * for an address with a reservation, no change is made to
- * the reserve map. In this case HPageRestoreReserve will be
- * set to indicate that the global reservation count should be
- * incremented when the page is freed. This is the desired
- * behavior. However, when a huge page is allocated for an
- * address without a reservation a reservation entry is added
- * to the reservation map, and HPageRestoreReserve will not be
- * set. When the page is freed, the global reserve count will
- * NOT be incremented and it will appear as though we have
- * leaked reserved page. In this case, set HPageRestoreReserve
- * so that the global reserve count will be incremented to
- * match the reservation map entry which was created.
- *
- * Note that vm_alloc_shared is based on the flags of the vma
- * for which the page was originally allocated. dst_vma could
- * be different or NULL on error.
- */
- if (vm_alloc_shared)
- SetHPageRestoreReserve(page);
- else
- ClearHPageRestoreReserve(page);
+ if (page)
put_page(page);
- }
BUG_ON(copied < 0);
BUG_ON(err > 0);
BUG_ON(!copied && !err);
@@ -415,11 +439,16 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
unsigned long dst_addr,
unsigned long src_addr,
struct page **page,
- bool zeropage,
+ enum mcopy_atomic_mode mode,
bool wp_copy)
{
ssize_t err;
+ if (mode == MCOPY_ATOMIC_CONTINUE) {
+ return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ wp_copy);
+ }
+
/*
* The normal page fault path for a shmem will invoke the
* fault, fill the hole in the file and COW it right away. The
@@ -431,7 +460,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
* and not in the radix tree.
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (!zeropage)
+ if (mode == MCOPY_ATOMIC_NORMAL)
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr, page,
wp_copy);
@@ -440,13 +469,10 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
dst_vma, dst_addr);
} else {
VM_WARN_ON_ONCE(wp_copy);
- if (!zeropage)
- err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr,
- src_addr, page);
- else
- err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr);
+ err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr,
+ mode != MCOPY_ATOMIC_NORMAL,
+ page);
}
return err;
@@ -467,7 +493,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
long copied;
struct page *page;
bool wp_copy;
- bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
/*
* Sanitize the command parameters:
@@ -530,7 +555,7 @@ retry:
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
- if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+ if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
goto out_unlock;
/*
@@ -578,7 +603,7 @@ retry:
BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, zeropage, wp_copy);
+ src_addr, &page, mcopy_mode, wp_copy);
cond_resched();
if (unlikely(err == -ENOENT)) {
diff --git a/mm/util.c b/mm/util.c
index a8bf17f18a81..a034525e7ba2 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1010,3 +1010,43 @@ void mem_dump_obj(void *object)
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif
+
+/*
+ * A driver might set a page logically offline -- PageOffline() -- and
+ * turn the page inaccessible in the hypervisor; after that, access to page
+ * content can be fatal.
+ *
+ * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
+ * pages after checking PageOffline(); however, these PFN walkers can race
+ * with drivers that set PageOffline().
+ *
+ * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
+ * synchronize with such drivers, achieving that a page cannot be set
+ * PageOffline() while frozen.
+ *
+ * page_offline_begin()/page_offline_end() is used by drivers that care about
+ * such races when setting a page PageOffline().
+ */
+static DECLARE_RWSEM(page_offline_rwsem);
+
+void page_offline_freeze(void)
+{
+ down_read(&page_offline_rwsem);
+}
+
+void page_offline_thaw(void)
+{
+ up_read(&page_offline_rwsem);
+}
+
+void page_offline_begin(void)
+{
+ down_write(&page_offline_rwsem);
+}
+EXPORT_SYMBOL(page_offline_begin);
+
+void page_offline_end(void)
+{
+ up_write(&page_offline_rwsem);
+}
+EXPORT_SYMBOL(page_offline_end);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b2ec7f751bd0..d5cd52805149 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -25,6 +25,7 @@
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
+#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
@@ -36,6 +37,7 @@
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/uaccess.h>
+#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -83,10 +85,11 @@ static void free_work(struct work_struct *w)
/*** Page table manipulation functions ***/
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
pte_t *pte;
u64 pfn;
+ unsigned long size = PAGE_SIZE;
pfn = phys_addr >> PAGE_SHIFT;
pte = pte_alloc_kernel_track(pmd, addr, mask);
@@ -94,9 +97,22 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return -ENOMEM;
do {
BUG_ON(!pte_none(*pte));
+
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
+ if (size != PAGE_SIZE) {
+ pte_t entry = pfn_pte(pfn, prot);
+
+ entry = pte_mkhuge(entry);
+ entry = arch_make_huge_pte(entry, ilog2(size), 0);
+ set_huge_pte_at(&init_mm, addr, pte, entry);
+ pfn += PFN_DOWN(size);
+ continue;
+ }
+#endif
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += PFN_DOWN(size), addr += size, addr != end);
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -145,7 +161,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
continue;
}
- if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
return -ENOMEM;
} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
@@ -1592,6 +1608,7 @@ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
+#ifdef CONFIG_X86_64
/*
* called before a call to iounmap() if the caller wants vm_area_struct's
* immediately freed.
@@ -1600,6 +1617,7 @@ void set_iounmap_nonlazy(void)
{
atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}
+#endif /* CONFIG_X86_64 */
/*
* Purges all lazily-freed vmap areas.
@@ -2912,8 +2930,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return NULL;
}
- if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
- arch_vmap_pmd_supported(prot)) {
+ if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) {
unsigned long size_per_node;
/*
@@ -2926,11 +2943,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
size_per_node = size;
if (node == NUMA_NO_NODE)
size_per_node /= num_online_nodes();
- if (size_per_node >= PMD_SIZE) {
+ if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
shift = PMD_SHIFT;
- align = max(real_align, 1UL << shift);
- size = ALIGN(real_size, 1UL << shift);
- }
+ else
+ shift = arch_vmap_pte_supported_shift(size_per_node);
+
+ align = max(real_align, 1UL << shift);
+ size = ALIGN(real_size, 1UL << shift);
}
again:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d7c3cb8688dd..4620df62f0ff 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1499,7 +1499,8 @@ static unsigned int shrink_page_list(struct list_head *page_list,
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
- if (!try_to_unmap(page, flags)) {
+ try_to_unmap(page, flags);
+ if (page_mapped(page)) {
stat->nr_unmap_fail += nr_pages;
if (!was_swapbacked && PageSwapBacked(page))
stat->nr_lazyfree_fail += nr_pages;
@@ -1701,6 +1702,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
unsigned int nr_reclaimed;
struct page *page, *next;
LIST_HEAD(clean_pages);
+ unsigned int noreclaim_flag;
list_for_each_entry_safe(page, next, page_list, lru) {
if (!PageHuge(page) && page_is_file_lru(page) &&
@@ -1711,8 +1713,17 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
}
}
+ /*
+ * We should be safe here since we are only dealing with file pages and
+ * we are not kswapd and therefore cannot write dirty file pages. But
+ * call memalloc_noreclaim_save() anyway, just in case these conditions
+ * change in the future.
+ */
+ noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
&stat, true);
+ memalloc_noreclaim_restore(noreclaim_flag);
+
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)nr_reclaimed);
@@ -1810,7 +1821,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
}
-/**
+/*
* Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
*
* lruvec->lru_lock is heavily contended. Some of the functions that
@@ -2306,6 +2317,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
LIST_HEAD(node_page_list);
struct reclaim_stat dummy_stat;
struct page *page;
+ unsigned int noreclaim_flag;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.priority = DEF_PRIORITY,
@@ -2314,6 +2326,8 @@ unsigned long reclaim_pages(struct list_head *page_list)
.may_swap = 1,
};
+ noreclaim_flag = memalloc_noreclaim_save();
+
while (!list_empty(page_list)) {
page = lru_to_page(page_list);
if (nid == NUMA_NO_NODE) {
@@ -2350,6 +2364,8 @@ unsigned long reclaim_pages(struct list_head *page_list)
}
}
+ memalloc_noreclaim_restore(noreclaim_flag);
+
return nr_reclaimed;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 4f7a306ce75a..5ba3e42446fa 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -168,8 +168,10 @@
* refault distance will immediately activate the refaulting page.
*/
+#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
- 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
+ WORKINGSET_SHIFT + NODES_SHIFT + \
+ MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
@@ -189,7 +191,7 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
- eviction = (eviction << 1) | workingset;
+ eviction = (eviction << WORKINGSET_SHIFT) | workingset;
return xa_mk_value(eviction);
}
@@ -201,8 +203,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
int memcgid, nid;
bool workingset;
- workingset = entry & 1;
- entry >>= 1;
+ workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
+ entry >>= WORKINGSET_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 7fe7adaaad01..b3c0577b8095 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -62,7 +62,7 @@
#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
-#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+#define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS)
#define BUDDY_MASK (0x3)
#define BUDDY_SHIFT 2
@@ -144,6 +144,8 @@ struct z3fold_header {
* @c_handle: cache for z3fold_buddy_slots allocation
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
+ * @zpool: zpool driver
+ * @zpool_ops: zpool operations structure with an evict callback
* @compact_wq: workqueue for page layout background optimization
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
@@ -253,9 +255,8 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
spin_unlock(&zhdr->page_lock);
}
-
-static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
- bool lock)
+/* return locked z3fold page if it's not headless */
+static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
{
struct z3fold_buddy_slots *slots;
struct z3fold_header *zhdr;
@@ -269,13 +270,12 @@ static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
read_lock(&slots->lock);
addr = *(unsigned long *)handle;
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
- if (lock)
- locked = z3fold_page_trylock(zhdr);
+ locked = z3fold_page_trylock(zhdr);
read_unlock(&slots->lock);
if (locked)
break;
cpu_relax();
- } while (lock);
+ } while (true);
} else {
zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
}
@@ -283,18 +283,6 @@ static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
return zhdr;
}
-/* Returns the z3fold page where a given handle is stored */
-static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
-{
- return __get_z3fold_header(h, false);
-}
-
-/* return locked z3fold page if it's not headless */
-static inline struct z3fold_header *get_z3fold_header(unsigned long h)
-{
- return __get_z3fold_header(h, true);
-}
-
static inline void put_z3fold_header(struct z3fold_header *zhdr)
{
struct page *page = virt_to_page(zhdr);
@@ -998,7 +986,8 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
- pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
+ pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
+ __alignof__(struct list_head));
if (!pool->unbuddied)
goto out_pool;
for_each_possible_cpu(cpu) {
@@ -1059,6 +1048,7 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
destroy_workqueue(pool->compact_wq);
destroy_workqueue(pool->release_wq);
z3fold_unregister_migration(pool);
+ free_percpu(pool->unbuddied);
kfree(pool);
}
@@ -1382,7 +1372,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
if (zhdr->foreign_handles ||
test_and_set_bit(PAGE_CLAIMED, &page->private)) {
if (kref_put(&zhdr->refcount,
- release_z3fold_page))
+ release_z3fold_page_locked))
atomic64_dec(&pool->pages_nr);
else
z3fold_page_unlock(zhdr);
@@ -1803,8 +1793,11 @@ static int __init init_z3fold(void)
{
int ret;
- /* Make sure the z3fold header is not larger than the page size */
- BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
+ /*
+ * Make sure the z3fold header is not larger than the page size and
+ * there has remaining spaces for its buddy.
+ */
+ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
ret = z3fold_mount();
if (ret)
return ret;
diff --git a/mm/zbud.c b/mm/zbud.c
index 7ec5f27a68b0..6348932430b8 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -51,7 +51,6 @@
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/zbud.h>
#include <linux/zpool.h>
/*****************
@@ -73,6 +72,12 @@
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+struct zbud_pool;
+
+struct zbud_ops {
+ int (*evict)(struct zbud_pool *pool, unsigned long handle);
+};
+
/**
* struct zbud_pool - stores metadata for each zbud pool
* @lock: protects all pool fields and first|last_chunk fields of any
@@ -87,21 +92,27 @@
* @pages_nr: number of zbud pages in the pool.
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
+ * @zpool: zpool driver
+ * @zpool_ops: zpool operations structure with an evict callback
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular zbud pool.
*/
struct zbud_pool {
spinlock_t lock;
- struct list_head unbuddied[NCHUNKS];
- struct list_head buddied;
+ union {
+ /*
+ * Reuse unbuddied[0] as buddied on the ground that
+ * unbuddied[0] is unused.
+ */
+ struct list_head buddied;
+ struct list_head unbuddied[NCHUNKS];
+ };
struct list_head lru;
u64 pages_nr;
const struct zbud_ops *ops;
-#ifdef CONFIG_ZPOOL
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
-#endif
};
/*
@@ -121,104 +132,6 @@ struct zbud_header {
};
/*****************
- * zpool
- ****************/
-
-#ifdef CONFIG_ZPOOL
-
-static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
-{
- if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
- return pool->zpool_ops->evict(pool->zpool, handle);
- else
- return -ENOENT;
-}
-
-static const struct zbud_ops zbud_zpool_ops = {
- .evict = zbud_zpool_evict
-};
-
-static void *zbud_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
-{
- struct zbud_pool *pool;
-
- pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
- if (pool) {
- pool->zpool = zpool;
- pool->zpool_ops = zpool_ops;
- }
- return pool;
-}
-
-static void zbud_zpool_destroy(void *pool)
-{
- zbud_destroy_pool(pool);
-}
-
-static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- return zbud_alloc(pool, size, gfp, handle);
-}
-static void zbud_zpool_free(void *pool, unsigned long handle)
-{
- zbud_free(pool, handle);
-}
-
-static int zbud_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- unsigned int total = 0;
- int ret = -EINVAL;
-
- while (total < pages) {
- ret = zbud_reclaim_page(pool, 8);
- if (ret < 0)
- break;
- total++;
- }
-
- if (reclaimed)
- *reclaimed = total;
-
- return ret;
-}
-
-static void *zbud_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
-{
- return zbud_map(pool, handle);
-}
-static void zbud_zpool_unmap(void *pool, unsigned long handle)
-{
- zbud_unmap(pool, handle);
-}
-
-static u64 zbud_zpool_total_size(void *pool)
-{
- return zbud_get_pool_size(pool) * PAGE_SIZE;
-}
-
-static struct zpool_driver zbud_zpool_driver = {
- .type = "zbud",
- .sleep_mapped = true,
- .owner = THIS_MODULE,
- .create = zbud_zpool_create,
- .destroy = zbud_zpool_destroy,
- .malloc = zbud_zpool_malloc,
- .free = zbud_zpool_free,
- .shrink = zbud_zpool_shrink,
- .map = zbud_zpool_map,
- .unmap = zbud_zpool_unmap,
- .total_size = zbud_zpool_total_size,
-};
-
-MODULE_ALIAS("zpool-zbud");
-#endif /* CONFIG_ZPOOL */
-
-/*****************
* Helpers
*****************/
/* Just to make the code easier to read */
@@ -304,7 +217,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
* Return: pointer to the new zbud pool or NULL if the metadata allocation
* failed.
*/
-struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
+static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
{
struct zbud_pool *pool;
int i;
@@ -328,7 +241,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
*
* The pool should be emptied before this function is called.
*/
-void zbud_destroy_pool(struct zbud_pool *pool)
+static void zbud_destroy_pool(struct zbud_pool *pool)
{
kfree(pool);
}
@@ -352,7 +265,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
* gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
* a new page.
*/
-int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
+static int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
int chunks, i, freechunks;
@@ -427,7 +340,7 @@ found:
* only sets the first|last_chunks to 0. The page is actually freed
* once both buddies are evicted (see zbud_reclaim_page() below).
*/
-void zbud_free(struct zbud_pool *pool, unsigned long handle)
+static void zbud_free(struct zbud_pool *pool, unsigned long handle)
{
struct zbud_header *zhdr;
int freechunks;
@@ -499,7 +412,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
* no pages to evict or an eviction handler is not registered, -EAGAIN if
* the retry limit was hit.
*/
-int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
+static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
{
int i, ret, freechunks;
struct zbud_header *zhdr;
@@ -581,7 +494,7 @@ next:
*
* Returns: a pointer to the mapped allocation
*/
-void *zbud_map(struct zbud_pool *pool, unsigned long handle)
+static void *zbud_map(struct zbud_pool *pool, unsigned long handle)
{
return (void *)(handle);
}
@@ -591,7 +504,7 @@ void *zbud_map(struct zbud_pool *pool, unsigned long handle)
* @pool: pool in which the allocation resides
* @handle: handle associated with the allocation to be unmapped
*/
-void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
+static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
{
}
@@ -602,30 +515,120 @@ void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
* Returns: size in pages of the given pool. The pool lock need not be
* taken to access pages_nr.
*/
-u64 zbud_get_pool_size(struct zbud_pool *pool)
+static u64 zbud_get_pool_size(struct zbud_pool *pool)
{
return pool->pages_nr;
}
+/*****************
+ * zpool
+ ****************/
+
+static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
+{
+ if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+ return pool->zpool_ops->evict(pool->zpool, handle);
+ else
+ return -ENOENT;
+}
+
+static const struct zbud_ops zbud_zpool_ops = {
+ .evict = zbud_zpool_evict
+};
+
+static void *zbud_zpool_create(const char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
+{
+ struct zbud_pool *pool;
+
+ pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+ if (pool) {
+ pool->zpool = zpool;
+ pool->zpool_ops = zpool_ops;
+ }
+ return pool;
+}
+
+static void zbud_zpool_destroy(void *pool)
+{
+ zbud_destroy_pool(pool);
+}
+
+static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ return zbud_alloc(pool, size, gfp, handle);
+}
+static void zbud_zpool_free(void *pool, unsigned long handle)
+{
+ zbud_free(pool, handle);
+}
+
+static int zbud_zpool_shrink(void *pool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ unsigned int total = 0;
+ int ret = -EINVAL;
+
+ while (total < pages) {
+ ret = zbud_reclaim_page(pool, 8);
+ if (ret < 0)
+ break;
+ total++;
+ }
+
+ if (reclaimed)
+ *reclaimed = total;
+
+ return ret;
+}
+
+static void *zbud_zpool_map(void *pool, unsigned long handle,
+ enum zpool_mapmode mm)
+{
+ return zbud_map(pool, handle);
+}
+static void zbud_zpool_unmap(void *pool, unsigned long handle)
+{
+ zbud_unmap(pool, handle);
+}
+
+static u64 zbud_zpool_total_size(void *pool)
+{
+ return zbud_get_pool_size(pool) * PAGE_SIZE;
+}
+
+static struct zpool_driver zbud_zpool_driver = {
+ .type = "zbud",
+ .sleep_mapped = true,
+ .owner = THIS_MODULE,
+ .create = zbud_zpool_create,
+ .destroy = zbud_zpool_destroy,
+ .malloc = zbud_zpool_malloc,
+ .free = zbud_zpool_free,
+ .shrink = zbud_zpool_shrink,
+ .map = zbud_zpool_map,
+ .unmap = zbud_zpool_unmap,
+ .total_size = zbud_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-zbud");
+
static int __init init_zbud(void)
{
/* Make sure the zbud header will fit in one chunk */
BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
pr_info("loaded\n");
-#ifdef CONFIG_ZPOOL
zpool_register_driver(&zbud_zpool_driver);
-#endif
return 0;
}
static void __exit exit_zbud(void)
{
-#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zbud_zpool_driver);
-#endif
-
pr_info("unloaded\n");
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 19b563bc6c48..68e8831068f4 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1471,7 +1471,6 @@ static void obj_free(struct size_class *class, unsigned long obj)
unsigned int f_objidx;
void *vaddr;
- obj &= ~OBJ_ALLOCATED_TAG;
obj_to_location(obj, &f_page, &f_objidx);
f_offset = (class->size * f_objidx) & ~PAGE_MASK;
zspage = get_zspage(f_page);
@@ -2163,7 +2162,7 @@ static void async_free_zspage(struct work_struct *work)
VM_BUG_ON(fullness != ZS_EMPTY);
class = pool->size_class[class_idx];
spin_lock(&class->lock);
- __free_zspage(pool, pool->size_class[class_idx], zspage);
+ __free_zspage(pool, class, zspage);
spin_unlock(&class->lock);
}
};
diff --git a/mm/zswap.c b/mm/zswap.c
index 20763267a219..7944e3e57e78 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -967,6 +967,13 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
spin_unlock(&tree->lock);
BUG_ON(offset != entry->offset);
+ src = (u8 *)zhdr + sizeof(struct zswap_header);
+ if (!zpool_can_sleep_mapped(pool)) {
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+ zpool_unmap_handle(pool, handle);
+ }
+
/* try to allocate swap cache page */
switch (zswap_get_swap_cache_page(swpentry, &page)) {
case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
@@ -982,17 +989,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
/* decompress */
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-
dlen = PAGE_SIZE;
- src = (u8 *)zhdr + sizeof(struct zswap_header);
-
- if (!zpool_can_sleep_mapped(pool)) {
-
- memcpy(tmp, src, entry->length);
- src = tmp;
-
- zpool_unmap_handle(pool, handle);
- }
mutex_lock(acomp_ctx->mutex);
sg_init_one(&input, src, entry->length);
@@ -1203,7 +1200,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_reject_alloc_fail++;
goto put_dstmem;
}
- buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
+ buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
memcpy(buf, &zhdr, hlen);
memcpy(buf + hlen, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
@@ -1427,18 +1424,11 @@ static int __init zswap_debugfs_init(void)
return 0;
}
-
-static void __exit zswap_debugfs_exit(void)
-{
- debugfs_remove_recursive(zswap_debugfs_root);
-}
#else
static int __init zswap_debugfs_init(void)
{
return 0;
}
-
-static void __exit zswap_debugfs_exit(void) { }
#endif
/*********************************
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 23697a6b1eaa..461d4221e4a4 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -1084,10 +1084,10 @@ sub is_maintained_obsolete {
sub is_SPDX_License_valid {
my ($license) = @_;
- return 1 if (!$tree || which("python") eq "" || !(-e "$root/scripts/spdxcheck.py") || !(-e "$gitroot"));
+ return 1 if (!$tree || which("python3") eq "" || !(-x "$root/scripts/spdxcheck.py") || !(-e "$gitroot"));
my $root_path = abs_path($root);
- my $status = `cd "$root_path"; echo "$license" | python scripts/spdxcheck.py -`;
+ my $status = `cd "$root_path"; echo "$license" | scripts/spdxcheck.py -`;
return 0 if ($status ne "");
return 1;
}
@@ -5361,9 +5361,13 @@ sub process {
}
}
-#goto labels aren't indented, allow a single space however
- if ($line=~/^.\s+[A-Za-z\d_]+:(?![0-9]+)/ and
- !($line=~/^. [A-Za-z\d_]+:/) and !($line=~/^.\s+default:/)) {
+# check that goto labels aren't indented (allow a single space indentation)
+# and ignore bitfield definitions like foo:1
+# Strictly, labels can have whitespace after the identifier and before the :
+# but this is not allowed here as many ?: uses would appear to be labels
+ if ($sline =~ /^.\s+[A-Za-z_][A-Za-z\d_]*:(?!\s*\d+)/ &&
+ $sline !~ /^. [A-Za-z\d_][A-Za-z\d_]*:/ &&
+ $sline !~ /^.\s+default:/) {
if (WARN("INDENTED_LABEL",
"labels should not be indented\n" . $herecurr) &&
$fix) {
@@ -5458,7 +5462,7 @@ sub process {
# Return of what appears to be an errno should normally be negative
if ($sline =~ /\breturn(?:\s*\(+\s*|\s+)(E[A-Z]+)(?:\s*\)+\s*|\s*)[;:,]/) {
my $name = $1;
- if ($name ne 'EOF' && $name ne 'ERROR') {
+ if ($name ne 'EOF' && $name ne 'ERROR' && $name !~ /^EPOLL/) {
WARN("USE_NEGATIVE_ERRNO",
"return of an errno should typically be negative (ie: return -$1)\n" . $herecurr);
}
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 1f651e85ed60..d683a49d07d5 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -12,6 +12,9 @@ mremap_test
on-fault-limit
transhuge-stress
protection_keys
+protection_keys_32
+protection_keys_64
+madv_populate
userfaultfd
mlock-intersect-test
mlock-random-test
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 73e1cc96d7c2..812bc03e3142 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -31,6 +31,7 @@ TEST_GEN_FILES += hmm-tests
TEST_GEN_FILES += hugepage-mmap
TEST_GEN_FILES += hugepage-shm
TEST_GEN_FILES += khugepaged
+TEST_GEN_FILES += madv_populate
TEST_GEN_FILES += map_fixed_noreplace
TEST_GEN_FILES += map_hugetlb
TEST_GEN_FILES += map_populate
@@ -100,7 +101,7 @@ $(1) $(1)_64: $(OUTPUT)/$(1)_64
endef
ifeq ($(CAN_BUILD_I386),1)
-$(BINARIES_32): CFLAGS += -m32
+$(BINARIES_32): CFLAGS += -m32 -mxsave
$(BINARIES_32): LDLIBS += -lrt -ldl -lm
$(BINARIES_32): $(OUTPUT)/%_32: %.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
@@ -108,7 +109,7 @@ $(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t))))
endif
ifeq ($(CAN_BUILD_X86_64),1)
-$(BINARIES_64): CFLAGS += -m64
+$(BINARIES_64): CFLAGS += -m64 -mxsave
$(BINARIES_64): LDLIBS += -lrt -ldl
$(BINARIES_64): $(OUTPUT)/%_64: %.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 5d1ac691b9f4..864f126ffd78 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1485,4 +1485,162 @@ TEST_F(hmm2, double_map)
hmm_buffer_free(buffer);
}
+/*
+ * Basic check of exclusive faulting.
+ */
+TEST_F(hmm, exclusive)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Map memory exclusively for device access. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Fault pages back to system memory and check them. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i]++, i);
+
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i+1);
+
+ /* Check atomic access revoked */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+
+ hmm_buffer_free(buffer);
+}
+
+TEST_F(hmm, exclusive_mprotect)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Map memory exclusively for device access. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ ret = mprotect(buffer->ptr, size, PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Simulate a device writing system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+ ASSERT_EQ(ret, -EPERM);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Check copy-on-write works.
+ */
+TEST_F(hmm, exclusive_cow)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Map memory exclusively for device access. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ fork();
+
+ /* Fault pages back to system memory and check them. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i]++, i);
+
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i+1);
+
+ hmm_buffer_free(buffer);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
index 8b75821302a7..155120b67a16 100644
--- a/tools/testing/selftests/vm/khugepaged.c
+++ b/tools/testing/selftests/vm/khugepaged.c
@@ -86,7 +86,6 @@ struct settings {
enum thp_enabled thp_enabled;
enum thp_defrag thp_defrag;
enum shmem_enabled shmem_enabled;
- bool debug_cow;
bool use_zero_page;
struct khugepaged_settings khugepaged;
};
@@ -95,7 +94,6 @@ static struct settings default_settings = {
.thp_enabled = THP_MADVISE,
.thp_defrag = THP_DEFRAG_ALWAYS,
.shmem_enabled = SHMEM_NEVER,
- .debug_cow = 0,
.use_zero_page = 0,
.khugepaged = {
.defrag = 1,
@@ -268,7 +266,6 @@ static void write_settings(struct settings *settings)
write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
write_string("shmem_enabled",
shmem_enabled_strings[settings->shmem_enabled]);
- write_num("debug_cow", settings->debug_cow);
write_num("use_zero_page", settings->use_zero_page);
write_num("khugepaged/defrag", khugepaged->defrag);
@@ -304,7 +301,6 @@ static void save_settings(void)
.thp_defrag = read_string("defrag", thp_defrag_strings),
.shmem_enabled =
read_string("shmem_enabled", shmem_enabled_strings),
- .debug_cow = read_num("debug_cow"),
.use_zero_page = read_num("use_zero_page"),
};
saved_settings.khugepaged = (struct khugepaged_settings) {
diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c
new file mode 100644
index 000000000000..b959e4ebdad4
--- /dev/null
+++ b/tools/testing/selftests/vm/madv_populate.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
+ *
+ * Copyright 2021, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#include "../kselftest.h"
+
+#if defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE)
+
+/*
+ * For now, we're using 2 MiB of private anonymous memory for all tests.
+ */
+#define SIZE (2 * 1024 * 1024)
+
+static size_t pagesize;
+
+static uint64_t pagemap_get_entry(int fd, char *start)
+{
+ const unsigned long pfn = (unsigned long)start / pagesize;
+ uint64_t entry;
+ int ret;
+
+ ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry));
+ if (ret != sizeof(entry))
+ ksft_exit_fail_msg("reading pagemap failed\n");
+ return entry;
+}
+
+static bool pagemap_is_populated(int fd, char *start)
+{
+ uint64_t entry = pagemap_get_entry(fd, start);
+
+ /* Present or swapped. */
+ return entry & 0xc000000000000000ull;
+}
+
+static bool pagemap_is_softdirty(int fd, char *start)
+{
+ uint64_t entry = pagemap_get_entry(fd, start);
+
+ return entry & 0x0080000000000000ull;
+}
+
+static void sense_support(void)
+{
+ char *addr;
+ int ret;
+
+ addr = mmap(0, pagesize, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (!addr)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ ret = madvise(addr, pagesize, MADV_POPULATE_READ);
+ if (ret)
+ ksft_exit_skip("MADV_POPULATE_READ is not available\n");
+
+ ret = madvise(addr, pagesize, MADV_POPULATE_WRITE);
+ if (ret)
+ ksft_exit_skip("MADV_POPULATE_WRITE is not available\n");
+
+ munmap(addr, pagesize);
+}
+
+static void test_prot_read(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+ ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+ ksft_test_result(ret == -1 && errno == EINVAL,
+ "MADV_POPULATE_WRITE with PROT_READ\n");
+
+ munmap(addr, SIZE);
+}
+
+static void test_prot_write(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+ ksft_test_result(ret == -1 && errno == EINVAL,
+ "MADV_POPULATE_READ with PROT_WRITE\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+ ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n");
+
+ munmap(addr, SIZE);
+}
+
+static void test_holes(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+ ret = munmap(addr + pagesize, pagesize);
+ if (ret)
+ ksft_exit_fail_msg("munmap failed\n");
+
+ /* Hole in the middle */
+ ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_READ with holes in the middle\n");
+ ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_WRITE with holes in the middle\n");
+
+ /* Hole at end */
+ ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_READ with holes at the end\n");
+ ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_WRITE with holes at the end\n");
+
+ /* Hole at beginning */
+ ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_READ with holes at the beginning\n");
+ ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_WRITE with holes at the beginning\n");
+
+ munmap(addr, SIZE);
+}
+
+static bool range_is_populated(char *start, ssize_t size)
+{
+ int fd = open("/proc/self/pagemap", O_RDONLY);
+ bool ret = true;
+
+ if (fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+ for (; size > 0 && ret; size -= pagesize, start += pagesize)
+ if (!pagemap_is_populated(fd, start))
+ ret = false;
+ close(fd);
+ return ret;
+}
+
+static bool range_is_not_populated(char *start, ssize_t size)
+{
+ int fd = open("/proc/self/pagemap", O_RDONLY);
+ bool ret = true;
+
+ if (fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+ for (; size > 0 && ret; size -= pagesize, start += pagesize)
+ if (pagemap_is_populated(fd, start))
+ ret = false;
+ close(fd);
+ return ret;
+}
+
+static void test_populate_read(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+ ksft_test_result(range_is_not_populated(addr, SIZE),
+ "range initially not populated\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+ ksft_test_result(!ret, "MADV_POPULATE_READ\n");
+ ksft_test_result(range_is_populated(addr, SIZE),
+ "range is populated\n");
+
+ munmap(addr, SIZE);
+}
+
+static void test_populate_write(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+ ksft_test_result(range_is_not_populated(addr, SIZE),
+ "range initially not populated\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+ ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
+ ksft_test_result(range_is_populated(addr, SIZE),
+ "range is populated\n");
+
+ munmap(addr, SIZE);
+}
+
+static bool range_is_softdirty(char *start, ssize_t size)
+{
+ int fd = open("/proc/self/pagemap", O_RDONLY);
+ bool ret = true;
+
+ if (fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+ for (; size > 0 && ret; size -= pagesize, start += pagesize)
+ if (!pagemap_is_softdirty(fd, start))
+ ret = false;
+ close(fd);
+ return ret;
+}
+
+static bool range_is_not_softdirty(char *start, ssize_t size)
+{
+ int fd = open("/proc/self/pagemap", O_RDONLY);
+ bool ret = true;
+
+ if (fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+ for (; size > 0 && ret; size -= pagesize, start += pagesize)
+ if (pagemap_is_softdirty(fd, start))
+ ret = false;
+ close(fd);
+ return ret;
+}
+
+static void clear_softdirty(void)
+{
+ int fd = open("/proc/self/clear_refs", O_WRONLY);
+ const char *ctrl = "4";
+ int ret;
+
+ if (fd < 0)
+ ksft_exit_fail_msg("opening clear_refs failed\n");
+ ret = write(fd, ctrl, strlen(ctrl));
+ if (ret != strlen(ctrl))
+ ksft_exit_fail_msg("writing clear_refs failed\n");
+ close(fd);
+}
+
+static void test_softdirty(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ /* Clear any softdirty bits. */
+ clear_softdirty();
+ ksft_test_result(range_is_not_softdirty(addr, SIZE),
+ "range is not softdirty\n");
+
+ /* Populating READ should set softdirty. */
+ ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+ ksft_test_result(!ret, "MADV_POPULATE_READ\n");
+ ksft_test_result(range_is_not_softdirty(addr, SIZE),
+ "range is not softdirty\n");
+
+ /* Populating WRITE should set softdirty. */
+ ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+ ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
+ ksft_test_result(range_is_softdirty(addr, SIZE),
+ "range is softdirty\n");
+
+ munmap(addr, SIZE);
+}
+
+int main(int argc, char **argv)
+{
+ int err;
+
+ pagesize = getpagesize();
+
+ ksft_print_header();
+ ksft_set_plan(21);
+
+ sense_support();
+ test_prot_read();
+ test_prot_write();
+ test_holes();
+ test_populate_read();
+ test_populate_write();
+ test_softdirty();
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ return ksft_exit_pass();
+}
+
+#else /* defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) */
+
+#warning "missing MADV_POPULATE_READ or MADV_POPULATE_WRITE definition"
+
+int main(int argc, char **argv)
+{
+ ksft_print_header();
+ ksft_exit_skip("MADV_POPULATE_READ or MADV_POPULATE_WRITE not defined\n");
+}
+
+#endif /* defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) */
diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h
index 3be20f5d5275..e4a4ce2b826d 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -126,6 +126,7 @@ static inline u32 pkey_bit_position(int pkey)
#define XSTATE_PKEY_BIT (9)
#define XSTATE_PKEY 0x200
+#define XSTATE_BV_OFFSET 512
int pkey_reg_xstate_offset(void)
{
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c
index fdbb602ecf32..2d0ae88665db 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -510,7 +510,7 @@ int alloc_pkey(void)
" shadow: 0x%016llx\n",
__func__, __LINE__, ret, __read_pkey_reg(),
shadow_pkey_reg);
- if (ret) {
+ if (ret > 0) {
/* clear both the bits: */
shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
~PKEY_MASK);
@@ -561,7 +561,6 @@ int alloc_random_pkey(void)
int nr_alloced = 0;
int random_index;
memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
- srand((unsigned int)time(NULL));
/* allocate every possible key and make a note of which ones we got */
max_nr_pkey_allocs = NR_PKEYS;
@@ -1278,6 +1277,78 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
}
}
+void arch_force_pkey_reg_init(void)
+{
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ u64 *buf;
+
+ /*
+ * All keys should be allocated and set to allow reads and
+ * writes, so the register should be all 0. If not, just
+ * skip the test.
+ */
+ if (read_pkey_reg())
+ return;
+
+ /*
+ * Just allocate an absurd about of memory rather than
+ * doing the XSAVE size enumeration dance.
+ */
+ buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+
+ /* These __builtins require compiling with -mxsave */
+
+ /* XSAVE to build a valid buffer: */
+ __builtin_ia32_xsave(buf, XSTATE_PKEY);
+ /* Clear XSTATE_BV[PKRU]: */
+ buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY;
+ /* XRSTOR will likely get PKRU back to the init state: */
+ __builtin_ia32_xrstor(buf, XSTATE_PKEY);
+
+ munmap(buf, 1*MB);
+#endif
+}
+
+
+/*
+ * This is mostly useless on ppc for now. But it will not
+ * hurt anything and should give some better coverage as
+ * a long-running test that continually checks the pkey
+ * register.
+ */
+void test_pkey_init_state(int *ptr, u16 pkey)
+{
+ int err;
+ int allocated_pkeys[NR_PKEYS] = {0};
+ int nr_allocated_pkeys = 0;
+ int i;
+
+ for (i = 0; i < NR_PKEYS; i++) {
+ int new_pkey = alloc_pkey();
+
+ if (new_pkey < 0)
+ continue;
+ allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+ }
+
+ dprintf3("%s()::%d\n", __func__, __LINE__);
+
+ arch_force_pkey_reg_init();
+
+ /*
+ * Loop for a bit, hoping to get exercise the kernel
+ * context switch code.
+ */
+ for (i = 0; i < 1000000; i++)
+ read_pkey_reg();
+
+ for (i = 0; i < nr_allocated_pkeys; i++) {
+ err = sys_pkey_free(allocated_pkeys[i]);
+ pkey_assert(!err);
+ read_pkey_reg(); /* for shadow checking */
+ }
+}
+
/*
* pkey 0 is special. It is allocated by default, so you do not
* have to call pkey_alloc() to use it first. Make sure that it
@@ -1449,6 +1520,13 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
pkey_assert(!ret);
+ /*
+ * Reset the shadow, assuming that the above mprotect()
+ * correctly changed PKRU, but to an unknown value since
+ * the actual alllocated pkey is unknown.
+ */
+ shadow_pkey_reg = __read_pkey_reg();
+
dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
/* Make sure this is an *instruction* fault */
@@ -1502,6 +1580,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = {
test_implicit_mprotect_exec_only_memory,
test_mprotect_with_pkey_0,
test_ptrace_of_child,
+ test_pkey_init_state,
test_pkey_syscalls_on_non_allocated_pkey,
test_pkey_syscalls_bad_args,
test_pkey_alloc_exhaust,
@@ -1552,6 +1631,8 @@ int main(void)
int nr_iterations = 22;
int pkeys_supported = is_pkeys_supported();
+ srand((unsigned int)time(NULL));
+
setup_handlers();
printf("has pkeys: %d\n", pkeys_supported);
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index e953f3cd9664..955782d138ab 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -346,4 +346,20 @@ else
exitcode=1
fi
+echo "--------------------------------------------------------"
+echo "running MADV_POPULATE_READ and MADV_POPULATE_WRITE tests"
+echo "--------------------------------------------------------"
+./madv_populate
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
exit $exitcode
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index f5ab5e0312e7..e363bdaff59d 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -85,10 +85,12 @@ static bool test_uffdio_wp = false;
static bool test_uffdio_minor = false;
static bool map_shared;
+static int shm_fd;
static int huge_fd;
static char *huge_fd_off0;
static unsigned long long *count_verify;
-static int uffd, uffd_flags, finished, *pipefd;
+static int uffd = -1;
+static int uffd_flags, finished, *pipefd;
static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
static char *zeropage;
pthread_attr_t attr;
@@ -140,11 +142,18 @@ static void usage(void)
exit(1);
}
-#define uffd_error(code, fmt, ...) \
- do { \
- fprintf(stderr, fmt, ##__VA_ARGS__); \
- fprintf(stderr, ": %" PRId64 "\n", (int64_t)(code)); \
- exit(1); \
+#define _err(fmt, ...) \
+ do { \
+ int ret = errno; \
+ fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \
+ fprintf(stderr, " (errno=%d, line=%d)\n", \
+ ret, __LINE__); \
+ } while (0)
+
+#define err(fmt, ...) \
+ do { \
+ _err(fmt, ##__VA_ARGS__); \
+ exit(1); \
} while (0)
static void uffd_stats_reset(struct uffd_stats *uffd_stats,
@@ -171,56 +180,50 @@ static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
minor_total += stats[i].minor_faults;
}
- printf("userfaults: %llu missing (", miss_total);
- for (i = 0; i < n_cpus; i++)
- printf("%lu+", stats[i].missing_faults);
- printf("\b), %llu wp (", wp_total);
- for (i = 0; i < n_cpus; i++)
- printf("%lu+", stats[i].wp_faults);
- printf("\b), %llu minor (", minor_total);
- for (i = 0; i < n_cpus; i++)
- printf("%lu+", stats[i].minor_faults);
- printf("\b)\n");
+ printf("userfaults: ");
+ if (miss_total) {
+ printf("%llu missing (", miss_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", stats[i].missing_faults);
+ printf("\b) ");
+ }
+ if (wp_total) {
+ printf("%llu wp (", wp_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", stats[i].wp_faults);
+ printf("\b) ");
+ }
+ if (minor_total) {
+ printf("%llu minor (", minor_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", stats[i].minor_faults);
+ printf("\b)");
+ }
+ printf("\n");
}
-static int anon_release_pages(char *rel_area)
+static void anon_release_pages(char *rel_area)
{
- int ret = 0;
-
- if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
- perror("madvise");
- ret = 1;
- }
-
- return ret;
+ if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
}
static void anon_allocate_area(void **alloc_area)
{
- if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
- fprintf(stderr, "out of memory\n");
- *alloc_area = NULL;
- }
+ if (posix_memalign(alloc_area, page_size, nr_pages * page_size))
+ err("posix_memalign() failed");
}
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
}
-/* HugeTLB memory */
-static int hugetlb_release_pages(char *rel_area)
+static void hugetlb_release_pages(char *rel_area)
{
- int ret = 0;
-
if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
- rel_area == huge_fd_off0 ? 0 :
- nr_pages * page_size,
- nr_pages * page_size)) {
- perror("fallocate");
- ret = 1;
- }
-
- return ret;
+ rel_area == huge_fd_off0 ? 0 : nr_pages * page_size,
+ nr_pages * page_size))
+ err("fallocate() failed");
}
static void hugetlb_allocate_area(void **alloc_area)
@@ -233,20 +236,16 @@ static void hugetlb_allocate_area(void **alloc_area)
MAP_HUGETLB,
huge_fd, *alloc_area == area_src ? 0 :
nr_pages * page_size);
- if (*alloc_area == MAP_FAILED) {
- perror("mmap of hugetlbfs file failed");
- goto fail;
- }
+ if (*alloc_area == MAP_FAILED)
+ err("mmap of hugetlbfs file failed");
if (map_shared) {
area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_HUGETLB,
huge_fd, *alloc_area == area_src ? 0 :
nr_pages * page_size);
- if (area_alias == MAP_FAILED) {
- perror("mmap of hugetlb file alias failed");
- goto fail_munmap;
- }
+ if (area_alias == MAP_FAILED)
+ err("mmap of hugetlb file alias failed");
}
if (*alloc_area == area_src) {
@@ -257,16 +256,6 @@ static void hugetlb_allocate_area(void **alloc_area)
}
if (area_alias)
*alloc_area_alias = area_alias;
-
- return;
-
-fail_munmap:
- if (munmap(*alloc_area, nr_pages * page_size) < 0) {
- perror("hugetlb munmap");
- exit(1);
- }
-fail:
- *alloc_area = NULL;
}
static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
@@ -282,33 +271,43 @@ static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset
*start = (unsigned long) area_dst_alias + offset;
}
-/* Shared memory */
-static int shmem_release_pages(char *rel_area)
+static void shmem_release_pages(char *rel_area)
{
- int ret = 0;
-
- if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
- perror("madvise");
- ret = 1;
- }
-
- return ret;
+ if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
+ err("madvise(MADV_REMOVE) failed");
}
static void shmem_allocate_area(void **alloc_area)
{
+ void *area_alias = NULL;
+ bool is_src = alloc_area == (void **)&area_src;
+ unsigned long offset = is_src ? 0 : nr_pages * page_size;
+
*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_SHARED, -1, 0);
- if (*alloc_area == MAP_FAILED) {
- fprintf(stderr, "shared memory mmap failed\n");
- *alloc_area = NULL;
- }
+ MAP_SHARED, shm_fd, offset);
+ if (*alloc_area == MAP_FAILED)
+ err("mmap of memfd failed");
+
+ area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, shm_fd, offset);
+ if (area_alias == MAP_FAILED)
+ err("mmap of memfd alias failed");
+
+ if (is_src)
+ area_src_alias = area_alias;
+ else
+ area_dst_alias = area_alias;
+}
+
+static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+ *start = (unsigned long)area_dst_alias + offset;
}
struct uffd_test_ops {
unsigned long expected_ioctls;
void (*allocate_area)(void **alloc_area);
- int (*release_pages)(char *rel_area);
+ void (*release_pages)(char *rel_area);
void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};
@@ -332,7 +331,7 @@ static struct uffd_test_ops shmem_uffd_test_ops = {
.expected_ioctls = SHMEM_EXPECTED_IOCTLS,
.allocate_area = shmem_allocate_area,
.release_pages = shmem_release_pages,
- .alias_mapping = noop_alias_mapping,
+ .alias_mapping = shmem_alias_mapping,
};
static struct uffd_test_ops hugetlb_uffd_test_ops = {
@@ -344,6 +343,111 @@ static struct uffd_test_ops hugetlb_uffd_test_ops = {
static struct uffd_test_ops *uffd_test_ops;
+static void userfaultfd_open(uint64_t *features)
+{
+ struct uffdio_api uffdio_api;
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
+ if (uffd < 0)
+ err("userfaultfd syscall not available in this kernel");
+ uffd_flags = fcntl(uffd, F_GETFD, NULL);
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = *features;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api))
+ err("UFFDIO_API failed.\nPlease make sure to "
+ "run with either root or ptrace capability.");
+ if (uffdio_api.api != UFFD_API)
+ err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
+
+ *features = uffdio_api.features;
+}
+
+static inline void munmap_area(void **area)
+{
+ if (*area)
+ if (munmap(*area, nr_pages * page_size))
+ err("munmap");
+
+ *area = NULL;
+}
+
+static void uffd_test_ctx_clear(void)
+{
+ size_t i;
+
+ if (pipefd) {
+ for (i = 0; i < nr_cpus * 2; ++i) {
+ if (close(pipefd[i]))
+ err("close pipefd");
+ }
+ free(pipefd);
+ pipefd = NULL;
+ }
+
+ if (count_verify) {
+ free(count_verify);
+ count_verify = NULL;
+ }
+
+ if (uffd != -1) {
+ if (close(uffd))
+ err("close uffd");
+ uffd = -1;
+ }
+
+ huge_fd_off0 = NULL;
+ munmap_area((void **)&area_src);
+ munmap_area((void **)&area_src_alias);
+ munmap_area((void **)&area_dst);
+ munmap_area((void **)&area_dst_alias);
+}
+
+static void uffd_test_ctx_init_ext(uint64_t *features)
+{
+ unsigned long nr, cpu;
+
+ uffd_test_ctx_clear();
+
+ uffd_test_ops->allocate_area((void **)&area_src);
+ uffd_test_ops->allocate_area((void **)&area_dst);
+
+ uffd_test_ops->release_pages(area_src);
+ uffd_test_ops->release_pages(area_dst);
+
+ userfaultfd_open(features);
+
+ count_verify = malloc(nr_pages * sizeof(unsigned long long));
+ if (!count_verify)
+ err("count_verify");
+
+ for (nr = 0; nr < nr_pages; nr++) {
+ *area_mutex(area_src, nr) =
+ (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
+ count_verify[nr] = *area_count(area_src, nr) = 1;
+ /*
+ * In the transition between 255 to 256, powerpc will
+ * read out of order in my_bcmp and see both bytes as
+ * zero, so leave a placeholder below always non-zero
+ * after the count, to avoid my_bcmp to trigger false
+ * positives.
+ */
+ *(area_count(area_src, nr) + 1) = 1;
+ }
+
+ pipefd = malloc(sizeof(int) * nr_cpus * 2);
+ if (!pipefd)
+ err("pipefd");
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
+ err("pipe");
+}
+
+static inline void uffd_test_ctx_init(uint64_t features)
+{
+ uffd_test_ctx_init_ext(&features);
+}
+
static int my_bcmp(char *str1, char *str2, size_t n)
{
unsigned long i;
@@ -363,27 +467,33 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
/* Undo write-protect, do wakeup after that */
prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
- if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) {
- fprintf(stderr, "clear WP failed for address 0x%" PRIx64 "\n",
- (uint64_t)start);
- exit(1);
- }
+ if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
+ err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
}
static void continue_range(int ufd, __u64 start, __u64 len)
{
struct uffdio_continue req;
+ int ret;
req.range.start = start;
req.range.len = len;
req.mode = 0;
- if (ioctl(ufd, UFFDIO_CONTINUE, &req)) {
- fprintf(stderr,
- "UFFDIO_CONTINUE failed for address 0x%" PRIx64 "\n",
- (uint64_t)start);
- exit(1);
- }
+ if (ioctl(ufd, UFFDIO_CONTINUE, &req))
+ err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
+ (uint64_t)start);
+
+ /*
+ * Error handling within the kernel for continue is subtly different
+ * from copy or zeropage, so it may be a source of bugs. Trigger an
+ * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
+ */
+ req.mapped = 0;
+ ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
+ if (ret >= 0 || req.mapped != -EEXIST)
+ err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
+ ret, (int64_t) req.mapped);
}
static void *locking_thread(void *arg)
@@ -395,7 +505,6 @@ static void *locking_thread(void *arg)
unsigned long long count;
char randstate[64];
unsigned int seed;
- time_t start;
if (bounces & BOUNCE_RANDOM) {
seed = (unsigned int) time(NULL) - bounces;
@@ -403,10 +512,8 @@ static void *locking_thread(void *arg)
seed += cpu;
bzero(&rand, sizeof(rand));
bzero(&randstate, sizeof(randstate));
- if (initstate_r(seed, randstate, sizeof(randstate), &rand)) {
- fprintf(stderr, "srandom_r error\n");
- exit(1);
- }
+ if (initstate_r(seed, randstate, sizeof(randstate), &rand))
+ err("initstate_r failed");
} else {
page_nr = -bounces;
if (!(bounces & BOUNCE_RACINGFAULTS))
@@ -415,92 +522,26 @@ static void *locking_thread(void *arg)
while (!finished) {
if (bounces & BOUNCE_RANDOM) {
- if (random_r(&rand, &rand_nr)) {
- fprintf(stderr, "random_r 1 error\n");
- exit(1);
- }
+ if (random_r(&rand, &rand_nr))
+ err("random_r failed");
page_nr = rand_nr;
if (sizeof(page_nr) > sizeof(rand_nr)) {
- if (random_r(&rand, &rand_nr)) {
- fprintf(stderr, "random_r 2 error\n");
- exit(1);
- }
+ if (random_r(&rand, &rand_nr))
+ err("random_r failed");
page_nr |= (((unsigned long) rand_nr) << 16) <<
16;
}
} else
page_nr += 1;
page_nr %= nr_pages;
-
- start = time(NULL);
- if (bounces & BOUNCE_VERIFY) {
- count = *area_count(area_dst, page_nr);
- if (!count) {
- fprintf(stderr,
- "page_nr %lu wrong count %Lu %Lu\n",
- page_nr, count,
- count_verify[page_nr]);
- exit(1);
- }
-
-
- /*
- * We can't use bcmp (or memcmp) because that
- * returns 0 erroneously if the memory is
- * changing under it (even if the end of the
- * page is never changing and always
- * different).
- */
-#if 1
- if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
- page_size)) {
- fprintf(stderr,
- "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
- page_nr, count, count_verify[page_nr]);
- exit(1);
- }
-#else
- unsigned long loops;
-
- loops = 0;
- /* uncomment the below line to test with mutex */
- /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
- while (!bcmp(area_dst + page_nr * page_size, zeropage,
- page_size)) {
- loops += 1;
- if (loops > 10)
- break;
- }
- /* uncomment below line to test with mutex */
- /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
- if (loops) {
- fprintf(stderr,
- "page_nr %lu all zero thread %lu %p %lu\n",
- page_nr, cpu, area_dst + page_nr * page_size,
- loops);
- if (loops > 10)
- exit(1);
- }
-#endif
- }
-
pthread_mutex_lock(area_mutex(area_dst, page_nr));
count = *area_count(area_dst, page_nr);
- if (count != count_verify[page_nr]) {
- fprintf(stderr,
- "page_nr %lu memory corruption %Lu %Lu\n",
- page_nr, count,
- count_verify[page_nr]); exit(1);
- }
+ if (count != count_verify[page_nr])
+ err("page_nr %lu memory corruption %llu %llu",
+ page_nr, count, count_verify[page_nr]);
count++;
*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
pthread_mutex_unlock(area_mutex(area_dst, page_nr));
-
- if (time(NULL) - start > 1)
- fprintf(stderr,
- "userfault too slow %ld "
- "possible false positive with overcommit\n",
- time(NULL) - start);
}
return NULL;
@@ -514,22 +555,21 @@ static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
offset);
if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
/* real retval in ufdio_copy.copy */
- if (uffdio_copy->copy != -EEXIST) {
- uffd_error(uffdio_copy->copy,
- "UFFDIO_COPY retry error");
- }
- } else
- uffd_error(uffdio_copy->copy, "UFFDIO_COPY retry unexpected");
+ if (uffdio_copy->copy != -EEXIST)
+ err("UFFDIO_COPY retry error: %"PRId64,
+ (int64_t)uffdio_copy->copy);
+ } else {
+ err("UFFDIO_COPY retry unexpected: %"PRId64,
+ (int64_t)uffdio_copy->copy);
+ }
}
static int __copy_page(int ufd, unsigned long offset, bool retry)
{
struct uffdio_copy uffdio_copy;
- if (offset >= nr_pages * page_size) {
- fprintf(stderr, "unexpected offset %lu\n", offset);
- exit(1);
- }
+ if (offset >= nr_pages * page_size)
+ err("unexpected offset %lu\n", offset);
uffdio_copy.dst = (unsigned long) area_dst + offset;
uffdio_copy.src = (unsigned long) area_src + offset;
uffdio_copy.len = page_size;
@@ -541,9 +581,10 @@ static int __copy_page(int ufd, unsigned long offset, bool retry)
if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
/* real retval in ufdio_copy.copy */
if (uffdio_copy.copy != -EEXIST)
- uffd_error(uffdio_copy.copy, "UFFDIO_COPY error");
+ err("UFFDIO_COPY error: %"PRId64,
+ (int64_t)uffdio_copy.copy);
} else if (uffdio_copy.copy != page_size) {
- uffd_error(uffdio_copy.copy, "UFFDIO_COPY unexpected copy");
+ err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
} else {
if (test_uffdio_copy_eexist && retry) {
test_uffdio_copy_eexist = false;
@@ -572,11 +613,10 @@ static int uffd_read_msg(int ufd, struct uffd_msg *msg)
if (ret < 0) {
if (errno == EAGAIN)
return 1;
- perror("blocking read error");
+ err("blocking read error");
} else {
- fprintf(stderr, "short read\n");
+ err("short read");
}
- exit(1);
}
return 0;
@@ -587,10 +627,8 @@ static void uffd_handle_page_fault(struct uffd_msg *msg,
{
unsigned long offset;
- if (msg->event != UFFD_EVENT_PAGEFAULT) {
- fprintf(stderr, "unexpected msg event %u\n", msg->event);
- exit(1);
- }
+ if (msg->event != UFFD_EVENT_PAGEFAULT)
+ err("unexpected msg event %u", msg->event);
if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
/* Write protect page faults */
@@ -621,11 +659,8 @@ static void uffd_handle_page_fault(struct uffd_msg *msg,
stats->minor_faults++;
} else {
/* Missing page faults */
- if (bounces & BOUNCE_VERIFY &&
- msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) {
- fprintf(stderr, "unexpected write fault\n");
- exit(1);
- }
+ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ err("unexpected write fault");
offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
offset &= ~(page_size-1);
@@ -652,32 +687,20 @@ static void *uffd_poll_thread(void *arg)
for (;;) {
ret = poll(pollfd, 2, -1);
- if (!ret) {
- fprintf(stderr, "poll error %d\n", ret);
- exit(1);
- }
- if (ret < 0) {
- perror("poll");
- exit(1);
- }
+ if (ret <= 0)
+ err("poll error: %d", ret);
if (pollfd[1].revents & POLLIN) {
- if (read(pollfd[1].fd, &tmp_chr, 1) != 1) {
- fprintf(stderr, "read pipefd error\n");
- exit(1);
- }
+ if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+ err("read pipefd error");
break;
}
- if (!(pollfd[0].revents & POLLIN)) {
- fprintf(stderr, "pollfd[0].revents %d\n",
- pollfd[0].revents);
- exit(1);
- }
+ if (!(pollfd[0].revents & POLLIN))
+ err("pollfd[0].revents %d", pollfd[0].revents);
if (uffd_read_msg(uffd, &msg))
continue;
switch (msg.event) {
default:
- fprintf(stderr, "unexpected msg event %u\n",
- msg.event); exit(1);
+ err("unexpected msg event %u\n", msg.event);
break;
case UFFD_EVENT_PAGEFAULT:
uffd_handle_page_fault(&msg, stats);
@@ -691,10 +714,8 @@ static void *uffd_poll_thread(void *arg)
uffd_reg.range.start = msg.arg.remove.start;
uffd_reg.range.len = msg.arg.remove.end -
msg.arg.remove.start;
- if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) {
- fprintf(stderr, "remove failure\n");
- exit(1);
- }
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
+ err("remove failure");
break;
case UFFD_EVENT_REMAP:
area_dst = (char *)(unsigned long)msg.arg.remap.to;
@@ -797,9 +818,7 @@ static int stress(struct uffd_stats *uffd_stats)
* UFFDIO_COPY without writing zero pages into area_dst
* because the background threads already completed).
*/
- if (uffd_test_ops->release_pages(area_src))
- return 1;
-
+ uffd_test_ops->release_pages(area_src);
finished = 1;
for (cpu = 0; cpu < nr_cpus; cpu++)
@@ -809,10 +828,8 @@ static int stress(struct uffd_stats *uffd_stats)
for (cpu = 0; cpu < nr_cpus; cpu++) {
char c;
if (bounces & BOUNCE_POLL) {
- if (write(pipefd[cpu*2+1], &c, 1) != 1) {
- fprintf(stderr, "pipefd write error\n");
- return 1;
- }
+ if (write(pipefd[cpu*2+1], &c, 1) != 1)
+ err("pipefd write error");
if (pthread_join(uffd_threads[cpu],
(void *)&uffd_stats[cpu]))
return 1;
@@ -827,40 +844,6 @@ static int stress(struct uffd_stats *uffd_stats)
return 0;
}
-static int userfaultfd_open_ext(uint64_t *features)
-{
- struct uffdio_api uffdio_api;
-
- uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
- if (uffd < 0) {
- fprintf(stderr,
- "userfaultfd syscall not available in this kernel\n");
- return 1;
- }
- uffd_flags = fcntl(uffd, F_GETFD, NULL);
-
- uffdio_api.api = UFFD_API;
- uffdio_api.features = *features;
- if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
- fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to "
- "run with either root or ptrace capability.\n");
- return 1;
- }
- if (uffdio_api.api != UFFD_API) {
- fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n",
- (uint64_t)uffdio_api.api);
- return 1;
- }
-
- *features = uffdio_api.features;
- return 0;
-}
-
-static int userfaultfd_open(uint64_t features)
-{
- return userfaultfd_open_ext(&features);
-}
-
sigjmp_buf jbuf, *sigbuf;
static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
@@ -912,10 +895,8 @@ static int faulting_process(int signal_test)
memset(&act, 0, sizeof(act));
act.sa_sigaction = sighndl;
act.sa_flags = SA_SIGINFO;
- if (sigaction(SIGBUS, &act, 0)) {
- perror("sigaction");
- return 1;
- }
+ if (sigaction(SIGBUS, &act, 0))
+ err("sigaction");
lastnr = (unsigned long)-1;
}
@@ -925,10 +906,8 @@ static int faulting_process(int signal_test)
if (signal_test) {
if (sigsetjmp(*sigbuf, 1) != 0) {
- if (steps == 1 && nr == lastnr) {
- fprintf(stderr, "Signal repeated\n");
- return 1;
- }
+ if (steps == 1 && nr == lastnr)
+ err("Signal repeated");
lastnr = nr;
if (signal_test == 1) {
@@ -953,12 +932,9 @@ static int faulting_process(int signal_test)
}
count = *area_count(area_dst, nr);
- if (count != count_verify[nr]) {
- fprintf(stderr,
- "nr %lu memory corruption %Lu %Lu\n",
- nr, count,
- count_verify[nr]);
- }
+ if (count != count_verify[nr])
+ err("nr %lu memory corruption %llu %llu\n",
+ nr, count, count_verify[nr]);
/*
* Trigger write protection if there is by writing
* the same value back.
@@ -974,18 +950,16 @@ static int faulting_process(int signal_test)
area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size,
MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
- if (area_dst == MAP_FAILED) {
- perror("mremap");
- exit(1);
- }
+ if (area_dst == MAP_FAILED)
+ err("mremap");
+ /* Reset area_src since we just clobbered it */
+ area_src = NULL;
for (; nr < nr_pages; nr++) {
count = *area_count(area_dst, nr);
if (count != count_verify[nr]) {
- fprintf(stderr,
- "nr %lu memory corruption %Lu %Lu\n",
- nr, count,
- count_verify[nr]); exit(1);
+ err("nr %lu memory corruption %llu %llu\n",
+ nr, count, count_verify[nr]);
}
/*
* Trigger write protection if there is by writing
@@ -994,15 +968,11 @@ static int faulting_process(int signal_test)
*area_count(area_dst, nr) = count;
}
- if (uffd_test_ops->release_pages(area_dst))
- return 1;
+ uffd_test_ops->release_pages(area_dst);
- for (nr = 0; nr < nr_pages; nr++) {
- if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) {
- fprintf(stderr, "nr %lu is not zero\n", nr);
- exit(1);
- }
- }
+ for (nr = 0; nr < nr_pages; nr++)
+ if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
+ err("nr %lu is not zero", nr);
return 0;
}
@@ -1015,13 +985,12 @@ static void retry_uffdio_zeropage(int ufd,
uffdio_zeropage->range.len,
offset);
if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
- if (uffdio_zeropage->zeropage != -EEXIST) {
- uffd_error(uffdio_zeropage->zeropage,
- "UFFDIO_ZEROPAGE retry error");
- }
+ if (uffdio_zeropage->zeropage != -EEXIST)
+ err("UFFDIO_ZEROPAGE error: %"PRId64,
+ (int64_t)uffdio_zeropage->zeropage);
} else {
- uffd_error(uffdio_zeropage->zeropage,
- "UFFDIO_ZEROPAGE retry unexpected");
+ err("UFFDIO_ZEROPAGE error: %"PRId64,
+ (int64_t)uffdio_zeropage->zeropage);
}
}
@@ -1034,10 +1003,8 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
- if (offset >= nr_pages * page_size) {
- fprintf(stderr, "unexpected offset %lu\n", offset);
- exit(1);
- }
+ if (offset >= nr_pages * page_size)
+ err("unexpected offset %lu", offset);
uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
uffdio_zeropage.range.len = page_size;
uffdio_zeropage.mode = 0;
@@ -1045,14 +1012,13 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
res = uffdio_zeropage.zeropage;
if (ret) {
/* real retval in ufdio_zeropage.zeropage */
- if (has_zeropage) {
- uffd_error(res, "UFFDIO_ZEROPAGE %s",
- res == -EEXIST ? "-EEXIST" : "error");
- } else if (res != -EINVAL)
- uffd_error(res, "UFFDIO_ZEROPAGE not -EINVAL");
+ if (has_zeropage)
+ err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
+ else if (res != -EINVAL)
+ err("UFFDIO_ZEROPAGE not -EINVAL");
} else if (has_zeropage) {
if (res != page_size) {
- uffd_error(res, "UFFDIO_ZEROPAGE unexpected");
+ err("UFFDIO_ZEROPAGE unexpected size");
} else {
if (test_uffdio_zeropage_eexist && retry) {
test_uffdio_zeropage_eexist = false;
@@ -1062,7 +1028,7 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
return 1;
}
} else
- uffd_error(res, "UFFDIO_ZEROPAGE succeeded");
+ err("UFFDIO_ZEROPAGE succeeded");
return 0;
}
@@ -1081,37 +1047,24 @@ static int userfaultfd_zeropage_test(void)
printf("testing UFFDIO_ZEROPAGE: ");
fflush(stdout);
- if (uffd_test_ops->release_pages(area_dst))
- return 1;
+ uffd_test_ctx_init(0);
- if (userfaultfd_open(0))
- return 1;
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (test_uffdio_wp)
uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
- fprintf(stderr, "register failure\n");
- exit(1);
- }
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ err("register failure");
expected_ioctls = uffd_test_ops->expected_ioctls;
- if ((uffdio_register.ioctls & expected_ioctls) !=
- expected_ioctls) {
- fprintf(stderr,
- "unexpected missing ioctl for anon memory\n");
- exit(1);
- }
+ if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
+ err("unexpected missing ioctl for anon memory");
- if (uffdio_zeropage(uffd, 0)) {
- if (my_bcmp(area_dst, zeropage, page_size)) {
- fprintf(stderr, "zeropage is not zero\n");
- exit(1);
- }
- }
+ if (uffdio_zeropage(uffd, 0))
+ if (my_bcmp(area_dst, zeropage, page_size))
+ err("zeropage is not zero");
- close(uffd);
printf("done.\n");
return 0;
}
@@ -1129,13 +1082,10 @@ static int userfaultfd_events_test(void)
printf("testing events (fork, remap, remove): ");
fflush(stdout);
- if (uffd_test_ops->release_pages(area_dst))
- return 1;
-
features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
UFFD_FEATURE_EVENT_REMOVE;
- if (userfaultfd_open(features))
- return 1;
+ uffd_test_ctx_init(features);
+
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
uffdio_register.range.start = (unsigned long) area_dst;
@@ -1143,46 +1093,31 @@ static int userfaultfd_events_test(void)
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (test_uffdio_wp)
uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
- fprintf(stderr, "register failure\n");
- exit(1);
- }
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ err("register failure");
expected_ioctls = uffd_test_ops->expected_ioctls;
- if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
- fprintf(stderr, "unexpected missing ioctl for anon memory\n");
- exit(1);
- }
+ if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
+ err("unexpected missing ioctl for anon memory");
- if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
- perror("uffd_poll_thread create");
- exit(1);
- }
+ if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
+ err("uffd_poll_thread create");
pid = fork();
- if (pid < 0) {
- perror("fork");
- exit(1);
- }
+ if (pid < 0)
+ err("fork");
if (!pid)
exit(faulting_process(0));
waitpid(pid, &err, 0);
- if (err) {
- fprintf(stderr, "faulting process failed\n");
- exit(1);
- }
-
- if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
- perror("pipe write");
- exit(1);
- }
+ if (err)
+ err("faulting process failed");
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
if (pthread_join(uffd_mon, NULL))
return 1;
- close(uffd);
-
uffd_stats_report(&stats, 1);
return stats.missing_faults != nr_pages;
@@ -1202,12 +1137,9 @@ static int userfaultfd_sig_test(void)
printf("testing signal delivery: ");
fflush(stdout);
- if (uffd_test_ops->release_pages(area_dst))
- return 1;
-
features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
- if (userfaultfd_open(features))
- return 1;
+ uffd_test_ctx_init(features);
+
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
uffdio_register.range.start = (unsigned long) area_dst;
@@ -1215,57 +1147,40 @@ static int userfaultfd_sig_test(void)
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (test_uffdio_wp)
uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
- fprintf(stderr, "register failure\n");
- exit(1);
- }
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ err("register failure");
expected_ioctls = uffd_test_ops->expected_ioctls;
- if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
- fprintf(stderr, "unexpected missing ioctl for anon memory\n");
- exit(1);
- }
+ if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
+ err("unexpected missing ioctl for anon memory");
- if (faulting_process(1)) {
- fprintf(stderr, "faulting process failed\n");
- exit(1);
- }
+ if (faulting_process(1))
+ err("faulting process failed");
- if (uffd_test_ops->release_pages(area_dst))
- return 1;
+ uffd_test_ops->release_pages(area_dst);
- if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
- perror("uffd_poll_thread create");
- exit(1);
- }
+ if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
+ err("uffd_poll_thread create");
pid = fork();
- if (pid < 0) {
- perror("fork");
- exit(1);
- }
+ if (pid < 0)
+ err("fork");
if (!pid)
exit(faulting_process(2));
waitpid(pid, &err, 0);
- if (err) {
- fprintf(stderr, "faulting process failed\n");
- exit(1);
- }
-
- if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
- perror("pipe write");
- exit(1);
- }
+ if (err)
+ err("faulting process failed");
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
if (pthread_join(uffd_mon, (void **)&userfaults))
return 1;
printf("done.\n");
if (userfaults)
- fprintf(stderr, "Signal test failed, userfaults: %ld\n",
- userfaults);
- close(uffd);
+ err("Signal test failed, userfaults: %ld", userfaults);
+
return userfaults != 0;
}
@@ -1279,7 +1194,7 @@ static int userfaultfd_minor_test(void)
void *expected_page;
char c;
struct uffd_stats stats = { 0 };
- uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS;
+ uint64_t req_features, features_out;
if (!test_uffdio_minor)
return 0;
@@ -1287,13 +1202,17 @@ static int userfaultfd_minor_test(void)
printf("testing minor faults: ");
fflush(stdout);
- if (uffd_test_ops->release_pages(area_dst))
+ if (test_type == TEST_HUGETLB)
+ req_features = UFFD_FEATURE_MINOR_HUGETLBFS;
+ else if (test_type == TEST_SHMEM)
+ req_features = UFFD_FEATURE_MINOR_SHMEM;
+ else
return 1;
- if (userfaultfd_open_ext(&features))
- return 1;
- /* If kernel reports the feature isn't supported, skip the test. */
- if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) {
+ features_out = req_features;
+ uffd_test_ctx_init_ext(&features_out);
+ /* If kernel reports required features aren't supported, skip test. */
+ if ((features_out & req_features) != req_features) {
printf("skipping test due to lack of feature support\n");
fflush(stdout);
return 0;
@@ -1302,17 +1221,13 @@ static int userfaultfd_minor_test(void)
uffdio_register.range.start = (unsigned long)area_dst_alias;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
- fprintf(stderr, "register failure\n");
- exit(1);
- }
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ err("register failure");
expected_ioctls = uffd_test_ops->expected_ioctls;
expected_ioctls |= 1 << _UFFDIO_CONTINUE;
- if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
- fprintf(stderr, "unexpected missing ioctl(s)\n");
- exit(1);
- }
+ if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
+ err("unexpected missing ioctl(s)");
/*
* After registering with UFFD, populate the non-UFFD-registered side of
@@ -1323,10 +1238,8 @@ static int userfaultfd_minor_test(void)
page_size);
}
- if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
- perror("uffd_poll_thread create");
- exit(1);
- }
+ if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
+ err("uffd_poll_thread create");
/*
* Read each of the pages back using the UFFD-registered mapping. We
@@ -1335,92 +1248,173 @@ static int userfaultfd_minor_test(void)
* page's contents, and then issuing a CONTINUE ioctl.
*/
- if (posix_memalign(&expected_page, page_size, page_size)) {
- fprintf(stderr, "out of memory\n");
- return 1;
- }
+ if (posix_memalign(&expected_page, page_size, page_size))
+ err("out of memory");
for (p = 0; p < nr_pages; ++p) {
expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
memset(expected_page, expected_byte, page_size);
if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
- page_size)) {
- fprintf(stderr,
- "unexpected page contents after minor fault\n");
- exit(1);
- }
+ page_size))
+ err("unexpected page contents after minor fault");
}
- if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
- perror("pipe write");
- exit(1);
- }
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
if (pthread_join(uffd_mon, NULL))
return 1;
- close(uffd);
-
uffd_stats_report(&stats, 1);
return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
}
-static int userfaultfd_stress(void)
+#define BIT_ULL(nr) (1ULL << (nr))
+#define PM_SOFT_DIRTY BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_UFFD_WP BIT_ULL(57)
+#define PM_FILE BIT_ULL(61)
+#define PM_SWAP BIT_ULL(62)
+#define PM_PRESENT BIT_ULL(63)
+
+static int pagemap_open(void)
{
- void *area;
- char *tmp_area;
- unsigned long nr;
- struct uffdio_register uffdio_register;
- unsigned long cpu;
- int err;
- struct uffd_stats uffd_stats[nr_cpus];
+ int fd = open("/proc/self/pagemap", O_RDONLY);
- uffd_test_ops->allocate_area((void **)&area_src);
- if (!area_src)
- return 1;
- uffd_test_ops->allocate_area((void **)&area_dst);
- if (!area_dst)
- return 1;
+ if (fd < 0)
+ err("open pagemap");
- if (userfaultfd_open(0))
- return 1;
+ return fd;
+}
- count_verify = malloc(nr_pages * sizeof(unsigned long long));
- if (!count_verify) {
- perror("count_verify");
- return 1;
- }
+static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
+{
+ uint64_t value;
+ int ret;
- for (nr = 0; nr < nr_pages; nr++) {
- *area_mutex(area_src, nr) = (pthread_mutex_t)
- PTHREAD_MUTEX_INITIALIZER;
- count_verify[nr] = *area_count(area_src, nr) = 1;
+ ret = pread(fd, &value, sizeof(uint64_t),
+ ((uint64_t)vaddr >> 12) * sizeof(uint64_t));
+ if (ret != sizeof(uint64_t))
+ err("pread() on pagemap failed");
+
+ return value;
+}
+
+/* This macro let __LINE__ works in err() */
+#define pagemap_check_wp(value, wp) do { \
+ if (!!(value & PM_UFFD_WP) != wp) \
+ err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
+ } while (0)
+
+static int pagemap_test_fork(bool present)
+{
+ pid_t child = fork();
+ uint64_t value;
+ int fd, result;
+
+ if (!child) {
+ /* Open the pagemap fd of the child itself */
+ fd = pagemap_open();
+ value = pagemap_read_vaddr(fd, area_dst);
/*
- * In the transition between 255 to 256, powerpc will
- * read out of order in my_bcmp and see both bytes as
- * zero, so leave a placeholder below always non-zero
- * after the count, to avoid my_bcmp to trigger false
- * positives.
+ * After fork() uffd-wp bit should be gone as long as we're
+ * without UFFD_FEATURE_EVENT_FORK
*/
- *(area_count(area_src, nr) + 1) = 1;
+ pagemap_check_wp(value, false);
+ /* Succeed */
+ exit(0);
}
+ waitpid(child, &result, 0);
+ return result;
+}
- pipefd = malloc(sizeof(int) * nr_cpus * 2);
- if (!pipefd) {
- perror("pipefd");
- return 1;
- }
- for (cpu = 0; cpu < nr_cpus; cpu++) {
- if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
- perror("pipe");
- return 1;
- }
- }
+static void userfaultfd_pagemap_test(unsigned int test_pgsize)
+{
+ struct uffdio_register uffdio_register;
+ int pagemap_fd;
+ uint64_t value;
- if (posix_memalign(&area, page_size, page_size)) {
- fprintf(stderr, "out of memory\n");
- return 1;
+ /* Pagemap tests uffd-wp only */
+ if (!test_uffdio_wp)
+ return;
+
+ /* Not enough memory to test this page size */
+ if (test_pgsize > nr_pages * page_size)
+ return;
+
+ printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
+ /* Flush so it doesn't flush twice in parent/child later */
+ fflush(stdout);
+
+ uffd_test_ctx_init(0);
+
+ if (test_pgsize > page_size) {
+ /* This is a thp test */
+ if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
+ err("madvise(MADV_HUGEPAGE) failed");
+ } else if (test_pgsize == page_size) {
+ /* This is normal page test; force no thp */
+ if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
+ err("madvise(MADV_NOHUGEPAGE) failed");
}
+
+ uffdio_register.range.start = (unsigned long) area_dst;
+ uffdio_register.range.len = nr_pages * page_size;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ err("register failed");
+
+ pagemap_fd = pagemap_open();
+
+ /* Touch the page */
+ *area_dst = 1;
+ wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
+ value = pagemap_read_vaddr(pagemap_fd, area_dst);
+ pagemap_check_wp(value, true);
+ /* Make sure uffd-wp bit dropped when fork */
+ if (pagemap_test_fork(true))
+ err("Detected stall uffd-wp bit in child");
+
+ /* Exclusive required or PAGEOUT won't work */
+ if (!(value & PM_MMAP_EXCLUSIVE))
+ err("multiple mapping detected: 0x%"PRIx64, value);
+
+ if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
+ err("madvise(MADV_PAGEOUT) failed");
+
+ /* Uffd-wp should persist even swapped out */
+ value = pagemap_read_vaddr(pagemap_fd, area_dst);
+ pagemap_check_wp(value, true);
+ /* Make sure uffd-wp bit dropped when fork */
+ if (pagemap_test_fork(false))
+ err("Detected stall uffd-wp bit in child");
+
+ /* Unprotect; this tests swap pte modifications */
+ wp_range(uffd, (uint64_t)area_dst, page_size, false);
+ value = pagemap_read_vaddr(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+
+ /* Fault in the page from disk */
+ *area_dst = 2;
+ value = pagemap_read_vaddr(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+
+ close(pagemap_fd);
+ printf("done\n");
+}
+
+static int userfaultfd_stress(void)
+{
+ void *area;
+ char *tmp_area;
+ unsigned long nr;
+ struct uffdio_register uffdio_register;
+ struct uffd_stats uffd_stats[nr_cpus];
+
+ uffd_test_ctx_init(0);
+
+ if (posix_memalign(&area, page_size, page_size))
+ err("out of memory");
zeropage = area;
bzero(zeropage, page_size);
@@ -1429,7 +1423,6 @@ static int userfaultfd_stress(void)
pthread_attr_init(&attr);
pthread_attr_setstacksize(&attr, 16*1024*1024);
- err = 0;
while (bounces--) {
unsigned long expected_ioctls;
@@ -1458,25 +1451,18 @@ static int userfaultfd_stress(void)
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (test_uffdio_wp)
uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
- fprintf(stderr, "register failure\n");
- return 1;
- }
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ err("register failure");
expected_ioctls = uffd_test_ops->expected_ioctls;
if ((uffdio_register.ioctls & expected_ioctls) !=
- expected_ioctls) {
- fprintf(stderr,
- "unexpected missing ioctl for anon memory\n");
- return 1;
- }
+ expected_ioctls)
+ err("unexpected missing ioctl for anon memory");
if (area_dst_alias) {
uffdio_register.range.start = (unsigned long)
area_dst_alias;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
- fprintf(stderr, "register failure alias\n");
- return 1;
- }
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ err("register failure alias");
}
/*
@@ -1503,8 +1489,7 @@ static int userfaultfd_stress(void)
* MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
* required to MADV_DONTNEED here.
*/
- if (uffd_test_ops->release_pages(area_dst))
- return 1;
+ uffd_test_ops->release_pages(area_dst);
uffd_stats_reset(uffd_stats, nr_cpus);
@@ -1518,33 +1503,22 @@ static int userfaultfd_stress(void)
nr_pages * page_size, false);
/* unregister */
- if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
- fprintf(stderr, "unregister failure\n");
- return 1;
- }
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
+ err("unregister failure");
if (area_dst_alias) {
uffdio_register.range.start = (unsigned long) area_dst;
if (ioctl(uffd, UFFDIO_UNREGISTER,
- &uffdio_register.range)) {
- fprintf(stderr, "unregister failure alias\n");
- return 1;
- }
+ &uffdio_register.range))
+ err("unregister failure alias");
}
/* verification */
- if (bounces & BOUNCE_VERIFY) {
- for (nr = 0; nr < nr_pages; nr++) {
- if (*area_count(area_dst, nr) != count_verify[nr]) {
- fprintf(stderr,
- "error area_count %Lu %Lu %lu\n",
- *area_count(area_src, nr),
- count_verify[nr],
- nr);
- err = 1;
- bounces = 0;
- }
- }
- }
+ if (bounces & BOUNCE_VERIFY)
+ for (nr = 0; nr < nr_pages; nr++)
+ if (*area_count(area_dst, nr) != count_verify[nr])
+ err("error area_count %llu %llu %lu\n",
+ *area_count(area_src, nr),
+ count_verify[nr], nr);
/* prepare next bounce */
tmp_area = area_src;
@@ -1558,10 +1532,21 @@ static int userfaultfd_stress(void)
uffd_stats_report(uffd_stats, nr_cpus);
}
- if (err)
- return err;
+ if (test_type == TEST_ANON) {
+ /*
+ * shmem/hugetlb won't be able to run since they have different
+ * behavior on fork() (file-backed memory normally drops ptes
+ * directly when fork), meanwhile the pagemap test will verify
+ * pgtable entry of fork()ed child.
+ */
+ userfaultfd_pagemap_test(page_size);
+ /*
+ * Hard-code for x86_64 for now for 2M THP, as x86_64 is
+ * currently the only one that supports uffd-wp
+ */
+ userfaultfd_pagemap_test(page_size * 512);
+ }
- close(uffd);
return userfaultfd_zeropage_test() || userfaultfd_sig_test()
|| userfaultfd_events_test() || userfaultfd_minor_test();
}
@@ -1610,8 +1595,9 @@ static void set_test_type(const char *type)
map_shared = true;
test_type = TEST_SHMEM;
uffd_test_ops = &shmem_uffd_test_ops;
+ test_uffdio_minor = true;
} else {
- fprintf(stderr, "Unknown test type: %s\n", type); exit(1);
+ err("Unknown test type: %s", type);
}
if (test_type == TEST_HUGETLB)
@@ -1619,15 +1605,11 @@ static void set_test_type(const char *type)
else
page_size = sysconf(_SC_PAGE_SIZE);
- if (!page_size) {
- fprintf(stderr, "Unable to determine page size\n");
- exit(2);
- }
+ if (!page_size)
+ err("Unable to determine page size");
if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
- > page_size) {
- fprintf(stderr, "Impossible to run this test\n");
- exit(2);
- }
+ > page_size)
+ err("Impossible to run this test");
}
static void sigalrm(int sig)
@@ -1644,10 +1626,8 @@ int main(int argc, char **argv)
if (argc < 4)
usage();
- if (signal(SIGALRM, sigalrm) == SIG_ERR) {
- fprintf(stderr, "failed to arm SIGALRM");
- exit(1);
- }
+ if (signal(SIGALRM, sigalrm) == SIG_ERR)
+ err("failed to arm SIGALRM");
alarm(ALARM_INTERVAL_SECS);
set_test_type(argv[1]);
@@ -1656,13 +1636,13 @@ int main(int argc, char **argv)
nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
nr_cpus;
if (!nr_pages_per_cpu) {
- fprintf(stderr, "invalid MiB\n");
+ _err("invalid MiB");
usage();
}
bounces = atoi(argv[3]);
if (bounces <= 0) {
- fprintf(stderr, "invalid bounces\n");
+ _err("invalid bounces");
usage();
}
nr_pages = nr_pages_per_cpu * nr_cpus;
@@ -1671,16 +1651,20 @@ int main(int argc, char **argv)
if (argc < 5)
usage();
huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
- if (huge_fd < 0) {
- fprintf(stderr, "Open of %s failed", argv[3]);
- perror("open");
- exit(1);
- }
- if (ftruncate(huge_fd, 0)) {
- fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
- perror("ftruncate");
- exit(1);
- }
+ if (huge_fd < 0)
+ err("Open of %s failed", argv[4]);
+ if (ftruncate(huge_fd, 0))
+ err("ftruncate %s to size 0 failed", argv[4]);
+ } else if (test_type == TEST_SHMEM) {
+ shm_fd = memfd_create(argv[0], 0);
+ if (shm_fd < 0)
+ err("memfd_create");
+ if (ftruncate(shm_fd, nr_pages * page_size * 2))
+ err("ftruncate");
+ if (fallocate(shm_fd,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
+ nr_pages * page_size * 2))
+ err("fallocate");
}
printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
nr_pages, nr_pages_per_cpu);