summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kdump/vmcoreinfo.rst6
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt2
-rw-r--r--Documentation/admin-guide/mm/userfaultfd.rst25
-rw-r--r--Documentation/core-api/printk-formats.rst16
-rw-r--r--Documentation/filesystems/locking.rst4
-rw-r--r--Documentation/filesystems/proc.rst8
-rw-r--r--Documentation/filesystems/tmpfs.rst66
-rw-r--r--Documentation/mm/active_mm.rst6
-rw-r--r--Documentation/mm/arch_pgtable_helpers.rst2
-rw-r--r--Documentation/mm/multigen_lru.rst44
-rw-r--r--Documentation/mm/unevictable-lru.rst2
-rw-r--r--MAINTAINERS5
-rw-r--r--arch/Kconfig32
-rw-r--r--arch/arc/Kconfig4
-rw-r--r--arch/arm/Kconfig9
-rw-r--r--arch/arm/configs/imx_v6_v7_defconfig2
-rw-r--r--arch/arm/configs/milbeaut_m10v_defconfig2
-rw-r--r--arch/arm/configs/oxnas_v6_defconfig2
-rw-r--r--arch/arm/configs/pxa_defconfig2
-rw-r--r--arch/arm/configs/sama7_defconfig2
-rw-r--r--arch/arm/configs/sp7021_defconfig2
-rw-r--r--arch/arm/mach-rpc/ecard.c2
-rw-r--r--arch/arm64/Kconfig28
-rw-r--r--arch/arm64/include/asm/memory.h8
-rw-r--r--arch/arm64/include/asm/mte-kasan.h81
-rw-r--r--arch/arm64/include/asm/mte.h12
-rw-r--r--arch/arm64/include/asm/pgtable.h2
-rw-r--r--arch/arm64/include/asm/sparsemem.h2
-rw-r--r--arch/arm64/include/asm/uaccess.h66
-rw-r--r--arch/arm64/include/asm/word-at-a-time.h4
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/gfp.h2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/page_alloc.c10
-rw-r--r--arch/arm64/mm/fault.c36
-rw-r--r--arch/csky/Kconfig2
-rw-r--r--arch/ia64/Kconfig8
-rw-r--r--arch/ia64/include/asm/sparsemem.h4
-rw-r--r--arch/ia64/mm/hugetlbpage.c2
-rw-r--r--arch/loongarch/Kconfig12
-rw-r--r--arch/m68k/Kconfig.cpu5
-rw-r--r--arch/mips/Kconfig15
-rw-r--r--arch/mips/include/asm/fixmap.h2
-rw-r--r--arch/mips/include/asm/pgtable.h3
-rw-r--r--arch/nios2/Kconfig7
-rw-r--r--arch/powerpc/Kconfig28
-rw-r--r--arch/powerpc/configs/85xx/ge_imp3a_defconfig2
-rw-r--r--arch/powerpc/configs/fsl-emb-nonhw.config2
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush.h3
-rw-r--r--arch/powerpc/kernel/smp.c2
-rw-r--r--arch/powerpc/mm/book3s64/iommu_api.c2
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c4
-rw-r--r--arch/powerpc/mm/fault.c37
-rw-r--r--arch/powerpc/mm/hugetlbpage.c2
-rw-r--r--arch/powerpc/platforms/powernv/Kconfig1
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c2
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig1
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/include/asm/pgtable.h12
-rw-r--r--arch/s390/mm/fault.c24
-rw-r--r--arch/sh/configs/ecovec24_defconfig2
-rw-r--r--arch/sh/mm/Kconfig17
-rw-r--r--arch/sparc/Kconfig5
-rw-r--r--arch/sparc/kernel/pci_sun4v.c2
-rw-r--r--arch/sparc/kernel/traps_64.c2
-rw-r--r--arch/sparc/mm/tsb.c4
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/string_64.h23
-rw-r--r--arch/x86/mm/fault.c36
-rw-r--r--arch/x86/mm/pat/memtype.c12
-rw-r--r--arch/xtensa/Kconfig5
-rw-r--r--drivers/base/regmap/regmap-debugfs.c8
-rw-r--r--drivers/block/floppy.c2
-rw-r--r--drivers/crypto/ccp/sev-dev.c2
-rw-r--r--drivers/crypto/hisilicon/sgl.c6
-rw-r--r--drivers/dma-buf/heaps/system_heap.c5
-rw-r--r--drivers/gpu/drm/i915/gem/selftests/huge_pages.c2
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo_vm.c2
-rw-r--r--drivers/gpu/drm/ttm/ttm_pool.c22
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h2
-rw-r--r--drivers/iommu/dma-iommu.c4
-rw-r--r--drivers/irqchip/irq-gic-v3-its.c4
-rw-r--r--drivers/md/dm-bufio.c2
-rw-r--r--drivers/md/dm-cache-metadata.c2
-rw-r--r--drivers/md/dm-thin-metadata.c2
-rw-r--r--drivers/misc/genwqe/card_utils.c2
-rw-r--r--drivers/net/ethernet/hisilicon/hns3/hns3_enet.c2
-rw-r--r--drivers/net/ethernet/ibm/ibmvnic.h2
-rw-r--r--drivers/video/fbdev/hyperv_fb.c4
-rw-r--r--drivers/video/fbdev/vermilion/vermilion.c2
-rw-r--r--drivers/virtio/virtio_balloon.c2
-rw-r--r--drivers/virtio/virtio_mem.c12
-rw-r--r--fs/afs/dir.c10
-rw-r--r--fs/afs/dir_edit.c2
-rw-r--r--fs/afs/file.c14
-rw-r--r--fs/afs/inode.c27
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/write.c4
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/ext4/move_extent.c8
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/iomap/buffered-io.c11
-rw-r--r--fs/jfs/jfs_metapage.c39
-rw-r--r--fs/netfs/buffered_read.c4
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nilfs2/page.c6
-rw-r--r--fs/ocfs2/refcounttree.c9
-rw-r--r--fs/proc/kcore.c85
-rw-r--r--fs/proc/meminfo.c13
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/super.c2
-rw-r--r--fs/ufs/dir.c29
-rw-r--r--fs/userfaultfd.c45
-rw-r--r--fs/xfs/xfs_file.c17
-rw-r--r--include/asm-generic/pgalloc.h4
-rw-r--r--include/drm/ttm/ttm_pool.h2
-rw-r--r--include/linux/gfp.h7
-rw-r--r--include/linux/gfp_types.h30
-rw-r--r--include/linux/highmem.h8
-rw-r--r--include/linux/huge_mm.h41
-rw-r--r--include/linux/hugetlb.h29
-rw-r--r--include/linux/io-mapping.h20
-rw-r--r--include/linux/memblock.h2
-rw-r--r--include/linux/memcontrol.h1
-rw-r--r--include/linux/mm.h168
-rw-r--r--include/linux/mm_inline.h6
-rw-r--r--include/linux/mm_types.h46
-rw-r--r--include/linux/mmap_lock.h37
-rw-r--r--include/linux/mmzone.h21
-rw-r--r--include/linux/page-flags.h16
-rw-r--r--include/linux/page_ext.h2
-rw-r--r--include/linux/pageblock-flags.h4
-rw-r--r--include/linux/pagemap.h21
-rw-r--r--include/linux/pgtable.h9
-rw-r--r--include/linux/sched/mm.h28
-rw-r--r--include/linux/shmem_fs.h10
-rw-r--r--include/linux/slab.h7
-rw-r--r--include/linux/swap.h20
-rw-r--r--include/linux/uio.h2
-rw-r--r--include/linux/userfaultfd_k.h92
-rw-r--r--include/linux/vm_event_item.h6
-rw-r--r--include/linux/vmalloc.h7
-rw-r--r--include/linux/vmstat.h6
-rw-r--r--include/trace/events/cma.h58
-rw-r--r--include/trace/events/ksm.h251
-rw-r--r--include/trace/events/mmflags.h94
-rw-r--r--include/uapi/linux/userfaultfd.h17
-rw-r--r--init/main.c74
-rw-r--r--kernel/cpu.c2
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/dma/pool.c6
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c163
-rw-r--r--kernel/kthread.c22
-rw-r--r--kernel/sched/core.c15
-rw-r--r--kernel/sched/fair.c57
-rw-r--r--lib/Kconfig.debug10
-rw-r--r--lib/iov_iter.c48
-rw-r--r--lib/maple_tree.c1
-rw-r--r--lib/stackdepot.c12
-rw-r--r--lib/test_printf.c26
-rw-r--r--lib/vsprintf.c21
-rw-r--r--mm/Kconfig31
-rw-r--r--mm/Kconfig.debug6
-rw-r--r--mm/Makefile1
-rw-r--r--mm/cma.c1
-rw-r--r--mm/cma_sysfs.c2
-rw-r--r--mm/compaction.c20
-rw-r--r--mm/damon/sysfs-schemes.c4
-rw-r--r--mm/debug.c7
-rw-r--r--mm/debug_vm_pgtable.c10
-rw-r--r--mm/dmapool_test.c147
-rw-r--r--mm/filemap.c27
-rw-r--r--mm/folio-compat.c4
-rw-r--r--mm/gup.c2
-rw-r--r--mm/huge_memory.c67
-rw-r--r--mm/hugetlb.c38
-rw-r--r--mm/hugetlb_vmemmap.c16
-rw-r--r--mm/init-mm.c3
-rw-r--r--mm/internal.h55
-rw-r--r--mm/kasan/hw_tags.c14
-rw-r--r--mm/kasan/kasan.h38
-rw-r--r--mm/kasan/kasan_test.c2
-rw-r--r--mm/kasan/report.c59
-rw-r--r--mm/khugepaged.c27
-rw-r--r--mm/kmsan/init.c6
-rw-r--r--mm/kmsan/kmsan_test.c97
-rw-r--r--mm/ksm.c21
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memcontrol.c32
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c269
-rw-r--r--mm/memory_hotplug.c4
-rw-r--r--mm/memtest.c6
-rw-r--r--mm/migrate.c1
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mm_init.c2548
-rw-r--r--mm/mmap.c270
-rw-r--r--mm/mmu_gather.c2
-rw-r--r--mm/mprotect.c51
-rw-r--r--mm/mremap.c23
-rw-r--r--mm/nommu.c10
-rw-r--r--mm/page-writeback.c40
-rw-r--r--mm/page_alloc.c2752
-rw-r--r--mm/page_isolation.c12
-rw-r--r--mm/page_owner.c6
-rw-r--r--mm/page_reporting.c4
-rw-r--r--mm/pgtable-generic.c2
-rw-r--r--mm/rmap.c38
-rw-r--r--mm/shmem.c117
-rw-r--r--mm/shrinker_debug.c39
-rw-r--r--mm/shuffle.h2
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slab.h1
-rw-r--r--mm/slub.c4
-rw-r--r--mm/swap_state.c19
-rw-r--r--mm/swapfile.c10
-rw-r--r--mm/truncate.c15
-rw-r--r--mm/userfaultfd.c209
-rw-r--r--mm/vmalloc.c239
-rw-r--r--mm/vmscan.c186
-rw-r--r--mm/vmstat.c20
-rw-r--r--mm/zsmalloc.c358
-rw-r--r--mm/zswap.c16
-rw-r--r--net/smc/smc_ib.c2
-rw-r--r--security/integrity/ima/ima_crypto.c2
-rw-r--r--tools/testing/memblock/linux/mmzone.h6
-rw-r--r--tools/testing/selftests/cgroup/test_memcontrol.c15
-rw-r--r--tools/testing/selftests/mm/run_vmtests.sh8
-rw-r--r--tools/testing/selftests/mm/split_huge_page_test.c6
-rw-r--r--tools/testing/selftests/mm/userfaultfd.c49
-rw-r--r--tools/testing/selftests/mm/virtual_address_range.c24
233 files changed, 6282 insertions, 4602 deletions
diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 86fd88492870..c18d94fa6470 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -172,7 +172,7 @@ variables.
Offset of the free_list's member. This value is used to compute the number
of free pages.
-Each zone has a free_area structure array called free_area[MAX_ORDER].
+Each zone has a free_area structure array called free_area[MAX_ORDER + 1].
The free_list represents a linked list of free page blocks.
(list_head, next|prev)
@@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific
information. Makedumpfile gets the start address of the vmalloc region
from this.
-(zone.free_area, MAX_ORDER)
----------------------------
+(zone.free_area, MAX_ORDER + 1)
+-------------------------------
Free areas descriptor. User-space tools use this value to iterate the
free_area ranges. MAX_ORDER is used by the zone buddy allocator.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6221a1d057dd..50da4f26fad5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3969,7 +3969,7 @@
[KNL] Minimal page reporting order
Format: <integer>
Adjust the minimal page reporting order. The page
- reporting is disabled when it exceeds (MAX_ORDER-1).
+ reporting is disabled when it exceeds MAX_ORDER.
panic= [KNL] Kernel behaviour on panic: delay <timeout>
timeout > 0: seconds before rebooting
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 7dc823b56ca4..7c304e432205 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -219,6 +219,31 @@ former will have ``UFFD_PAGEFAULT_FLAG_WP`` set, the latter
you still need to supply a page when ``UFFDIO_REGISTER_MODE_MISSING`` was
used.
+Userfaultfd write-protect mode currently behaves differently on none ptes
+(when e.g. page is missing) over different types of memories.
+
+For anonymous memory, ``ioctl(UFFDIO_WRITEPROTECT)`` will ignore none ptes
+(e.g. when pages are missing and not populated). For file-backed memories
+like shmem and hugetlbfs, none ptes will be write protected just like a
+present pte. In other words, there will be a userfaultfd write fault
+message generated when writing to a missing page on file typed memories,
+as long as the page range was write-protected before. Such a message will
+not be generated on anonymous memories by default.
+
+If the application wants to be able to write protect none ptes on anonymous
+memory, one can pre-populate the memory with e.g. MADV_POPULATE_READ. On
+newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED
+and set the feature bit in advance to make sure none ptes will also be
+write protected even upon anonymous memory.
+
+When using ``UFFDIO_REGISTER_MODE_WP`` in combination with either
+``UFFDIO_REGISTER_MODE_MISSING`` or ``UFFDIO_REGISTER_MODE_MINOR``, when
+resolving missing / minor faults with ``UFFDIO_COPY`` or ``UFFDIO_CONTINUE``
+respectively, it may be desirable for the new page / mapping to be
+write-protected (so future writes will also result in a WP fault). These ioctls
+support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP``
+respectively) to configure the mapping this way.
+
QEMU/KVM
========
diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index dbe1aacc79d0..dfe7e75a71de 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -575,20 +575,26 @@ The field width is passed by value, the bitmap is passed by reference.
Helper macros cpumask_pr_args() and nodemask_pr_args() are available to ease
printing cpumask and nodemask.
-Flags bitfields such as page flags, gfp_flags
----------------------------------------------
+Flags bitfields such as page flags, page_type, gfp_flags
+--------------------------------------------------------
::
%pGp 0x17ffffc0002036(referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1fffff)
+ %pGt 0xffffff7f(buddy)
%pGg GFP_USER|GFP_DMA32|GFP_NOWARN
%pGv read|exec|mayread|maywrite|mayexec|denywrite
For printing flags bitfields as a collection of symbolic constants that
would construct the value. The type of flags is given by the third
-character. Currently supported are [p]age flags, [v]ma_flags (both
-expect ``unsigned long *``) and [g]fp_flags (expects ``gfp_t *``). The flag
-names and print order depends on the particular type.
+character. Currently supported are:
+
+ - p - [p]age flags, expects value of type (``unsigned long *``)
+ - t - page [t]ype, expects value of type (``unsigned int *``)
+ - v - [v]ma_flags, expects value of type (``unsigned long *``)
+ - g - [g]fp_flags, expects value of type (``gfp_t *``)
+
+The flag names and print order depend on the particular type.
Note that this format should not be used directly in the
:c:func:`TP_printk()` part of a tracepoint. Instead, use the show_*_flags()
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 7de7a7272a5e..aa1a233b0fa8 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -645,7 +645,7 @@ ops mmap_lock PageLocked(page)
open: yes
close: yes
fault: yes can return with page locked
-map_pages: yes
+map_pages: read
page_mkwrite: yes can return with page locked
pfn_mkwrite: yes
access: yes
@@ -661,7 +661,7 @@ locked. The VM will unlock the page.
->map_pages() is called when VM asks to map easy accessible pages.
Filesystem should find and map pages associated with offsets from "start_pgoff"
-till "end_pgoff". ->map_pages() is called with page table locked and must
+till "end_pgoff". ->map_pages() is called with the RCU lock held and must
not block. If it's not possible to reach a page without blocking,
filesystem should skip it. Filesystem should use do_set_pte() to setup
page table entry. Pointer to entry associated with the page is passed in
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 9d5fd9424e8b..8740362f31c6 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -996,6 +996,7 @@ Example output. You may not have all of these fields.
VmallocUsed: 40444 kB
VmallocChunk: 0 kB
Percpu: 29312 kB
+ EarlyMemtestBad: 0 kB
HardwareCorrupted: 0 kB
AnonHugePages: 4149248 kB
ShmemHugePages: 0 kB
@@ -1146,6 +1147,13 @@ VmallocChunk
Percpu
Memory allocated to the percpu allocator used to back percpu
allocations. This stat excludes the cost of metadata.
+EarlyMemtestBad
+ The amount of RAM/memory in kB, that was identified as corrupted
+ by early memtest. If memtest was not run, this field will not
+ be displayed at all. Size is never rounded down to 0 kB.
+ That means if 0 kB is reported, you can safely assume
+ there was at least one pass of memtest and none of the passes
+ found a single faulty byte of RAM.
HardwareCorrupted
The amount of RAM/memory in KB, the kernel identifies as
corrupted.
diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst
index 0408c245785e..f18f46be5c0c 100644
--- a/Documentation/filesystems/tmpfs.rst
+++ b/Documentation/filesystems/tmpfs.rst
@@ -13,17 +13,29 @@ everything stored therein is lost.
tmpfs puts everything into the kernel internal caches and grows and
shrinks to accommodate the files it contains and is able to swap
-unneeded pages out to swap space. It has maximum size limits which can
-be adjusted on the fly via 'mount -o remount ...'
-
-If you compare it to ramfs (which was the template to create tmpfs)
-you gain swapping and limit checking. Another similar thing is the RAM
-disk (/dev/ram*), which simulates a fixed size hard disk in physical
-RAM, where you have to create an ordinary filesystem on top. Ramdisks
-cannot swap and you do not have the possibility to resize them.
-
-Since tmpfs lives completely in the page cache and on swap, all tmpfs
-pages will be shown as "Shmem" in /proc/meminfo and "Shared" in
+unneeded pages out to swap space, if swap was enabled for the tmpfs
+mount. tmpfs also supports THP.
+
+tmpfs extends ramfs with a few userspace configurable options listed and
+explained further below, some of which can be reconfigured dynamically on the
+fly using a remount ('mount -o remount ...') of the filesystem. A tmpfs
+filesystem can be resized but it cannot be resized to a size below its current
+usage. tmpfs also supports POSIX ACLs, and extended attributes for the
+trusted.* and security.* namespaces. ramfs does not use swap and you cannot
+modify any parameter for a ramfs filesystem. The size limit of a ramfs
+filesystem is how much memory you have available, and so care must be taken
+when using it so as not to run out of memory.
+
+An alternative to tmpfs and ramfs is to use brd to create RAM disks
+(/dev/ram*), which allows you to simulate a block device disk in physical RAM.
+To write data you would then just need to create a regular filesystem on top
+of this ramdisk. As with ramfs, brd ramdisks cannot swap. brd ramdisks are also
+configured in size at initialization and you cannot dynamically resize them.
+Contrary to brd ramdisks, tmpfs has its own filesystem, it does not rely on the
+block layer at all.
+
+Since tmpfs lives completely in the page cache and optionally on swap,
+all tmpfs pages will be shown as "Shmem" in /proc/meminfo and "Shared" in
free(1). Notice that these counters also include shared memory
(shmem, see ipcs(1)). The most reliable way to get the count is
using df(1) and du(1).
@@ -72,6 +84,8 @@ nr_inodes The maximum number of inodes for this instance. The default
is half of the number of your physical RAM pages, or (on a
machine with highmem) the number of lowmem RAM pages,
whichever is the lower.
+noswap Disables swap. Remounts must respect the original settings.
+ By default swap is enabled.
========= ============================================================
These parameters accept a suffix k, m or g for kilo, mega and giga and
@@ -85,6 +99,36 @@ mount with such options, since it allows any user with write access to
use up all the memory on the machine; but enhances the scalability of
that instance in a system with many CPUs making intensive use of it.
+tmpfs also supports Transparent Huge Pages, which requires a kernel
+configured with CONFIG_TRANSPARENT_HUGEPAGE and with huge pages supported on
+your system (has_transparent_hugepage(), which is architecture specific).
+The mount options for this are:
+
+====== ============================================================
+huge=0 never: disables huge pages for the mount
+huge=1 always: enables huge pages for the mount
+huge=2 within_size: only allocate huge pages if the page will be
+ fully within i_size, also respect fadvise()/madvise() hints.
+huge=3 advise: only allocate huge pages if requested with
+ fadvise()/madvise()
+====== ============================================================
+
+There is a sysfs file which you can also use to control system wide THP
+configuration for all tmpfs mounts, the file is:
+
+/sys/kernel/mm/transparent_hugepage/shmem_enabled
+
+This sysfs file is placed on top of the THP sysfs directory and so is registered
+by THP code. It is however only used to control all tmpfs mounts with one
+single knob. Since it controls all tmpfs mounts it should only be used either
+for emergency or testing purposes. The values you can set for shmem_enabled are:
+
+== ============================================================
+-1 deny: disables huge on shm_mnt and all mounts, for
+ emergency use
+-2 force: enables huge on shm_mnt and all mounts, w/o needing
+ option, for testing
+== ============================================================
tmpfs has a mount option to set the NUMA memory allocation policy for
all files in that instance (if CONFIG_NUMA is enabled) - which can be
diff --git a/Documentation/mm/active_mm.rst b/Documentation/mm/active_mm.rst
index 45d89f8fb3a8..d096fc091e23 100644
--- a/Documentation/mm/active_mm.rst
+++ b/Documentation/mm/active_mm.rst
@@ -2,6 +2,12 @@
Active MM
=========
+Note, the mm_count refcount may no longer include the "lazy" users
+(running tasks with ->active_mm == mm && ->mm == NULL) on kernels
+with CONFIG_MMU_LAZY_TLB_REFCOUNT=n. Taking and releasing these lazy
+references must be done with mmgrab_lazy_tlb() and mmdrop_lazy_tlb()
+helpers, which abstract this config option.
+
::
List: linux-kernel
diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst
index 30d9a09f01f4..af3891f895b0 100644
--- a/Documentation/mm/arch_pgtable_helpers.rst
+++ b/Documentation/mm/arch_pgtable_helpers.rst
@@ -214,7 +214,7 @@ HugeTLB Page Table Helpers
+---------------------------+--------------------------------------------------+
| pte_huge | Tests a HugeTLB |
+---------------------------+--------------------------------------------------+
-| pte_mkhuge | Creates a HugeTLB |
+| arch_make_huge_pte | Creates a HugeTLB |
+---------------------------+--------------------------------------------------+
| huge_pte_dirty | Tests a dirty HugeTLB |
+---------------------------+--------------------------------------------------+
diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst
index 5f1f6ecbb79b..52ed5092022f 100644
--- a/Documentation/mm/multigen_lru.rst
+++ b/Documentation/mm/multigen_lru.rst
@@ -103,7 +103,8 @@ moving across tiers only involves atomic operations on
``folio->flags`` and therefore has a negligible cost. A feedback loop
modeled after the PID controller monitors refaults over all the tiers
from anon and file types and decides which tiers from which types to
-evict or protect.
+evict or protect. The desired effect is to balance refault percentages
+between anon and file types proportional to the swappiness level.
There are two conceptually independent procedures: the aging and the
eviction. They form a closed-loop system, i.e., the page reclaim.
@@ -156,6 +157,27 @@ This time-based approach has the following advantages:
and memory sizes.
2. It is more reliable because it is directly wired to the OOM killer.
+``mm_struct`` list
+------------------
+An ``mm_struct`` list is maintained for each memcg, and an
+``mm_struct`` follows its owner task to the new memcg when this task
+is migrated.
+
+A page table walker iterates ``lruvec_memcg()->mm_list`` and calls
+``walk_page_range()`` with each ``mm_struct`` on this list to scan
+PTEs. When multiple page table walkers iterate the same list, each of
+them gets a unique ``mm_struct``, and therefore they can run in
+parallel.
+
+Page table walkers ignore any misplaced pages, e.g., if an
+``mm_struct`` was migrated, pages left in the previous memcg will be
+ignored when the current memcg is under reclaim. Similarly, page table
+walkers will ignore pages from nodes other than the one under reclaim.
+
+This infrastructure also tracks the usage of ``mm_struct`` between
+context switches so that page table walkers can skip processes that
+have been sleeping since the last iteration.
+
Rmap/PT walk feedback
---------------------
Searching the rmap for PTEs mapping each page on an LRU list (to test
@@ -170,7 +192,7 @@ promotes hot pages. If the scan was done cacheline efficiently, it
adds the PMD entry pointing to the PTE table to the Bloom filter. This
forms a feedback loop between the eviction and the aging.
-Bloom Filters
+Bloom filters
-------------
Bloom filters are a space and memory efficient data structure for set
membership test, i.e., test if an element is not in the set or may be
@@ -186,6 +208,18 @@ is false positive, the cost is an additional scan of a range of PTEs,
which may yield hot pages anyway. Parameters of the filter itself can
control the false positive rate in the limit.
+PID controller
+--------------
+A feedback loop modeled after the Proportional-Integral-Derivative
+(PID) controller monitors refaults over anon and file types and
+decides which type to evict when both types are available from the
+same generation.
+
+The PID controller uses generations rather than the wall clock as the
+time domain because a CPU can scan pages at different rates under
+varying memory pressure. It calculates a moving average for each new
+generation to avoid being permanently locked in a suboptimal state.
+
Memcg LRU
---------
An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
@@ -223,9 +257,9 @@ parts:
* Generations
* Rmap walks
-* Page table walks
-* Bloom filters
-* PID controller
+* Page table walks via ``mm_struct`` list
+* Bloom filters for rmap/PT walk feedback
+* PID controller for refault feedback
The aging and the eviction form a producer-consumer model;
specifically, the latter drives the former by the sliding window over
diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst
index 92ac5dca420c..d5ac8511eb67 100644
--- a/Documentation/mm/unevictable-lru.rst
+++ b/Documentation/mm/unevictable-lru.rst
@@ -42,6 +42,8 @@ The unevictable list addresses the following classes of unevictable pages:
* Those owned by ramfs.
+ * Those owned by tmpfs with the noswap mount option.
+
* Those mapped into SHM_LOCK'd shared memory regions.
* Those mapped into VM_LOCKED [mlock()ed] VMAs.
diff --git a/MAINTAINERS b/MAINTAINERS
index 1dc8bd26b6cf..1b48ddff4159 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13368,13 +13368,14 @@ F: arch/powerpc/include/asm/membarrier.h
F: include/uapi/linux/membarrier.h
F: kernel/sched/membarrier.c
-MEMBLOCK
+MEMBLOCK AND MEMORY MANAGEMENT INITIALIZATION
M: Mike Rapoport <rppt@kernel.org>
L: linux-mm@kvack.org
S: Maintained
F: Documentation/core-api/boot-time-mm.rst
F: include/linux/memblock.h
F: mm/memblock.c
+F: mm/mm_init.c
F: tools/testing/memblock/
MEMORY CONTROLLER DRIVERS
@@ -13409,6 +13410,7 @@ F: include/linux/memory_hotplug.h
F: include/linux/mm.h
F: include/linux/mmzone.h
F: include/linux/pagewalk.h
+F: include/trace/events/ksm.h
F: mm/
F: tools/mm/
F: tools/testing/selftests/mm/
@@ -13417,6 +13419,7 @@ VMALLOC
M: Andrew Morton <akpm@linux-foundation.org>
R: Uladzislau Rezki <urezki@gmail.com>
R: Christoph Hellwig <hch@infradead.org>
+R: Lorenzo Stoakes <lstoakes@gmail.com>
L: linux-mm@kvack.org
S: Maintained
W: http://www.linux-mm.org
diff --git a/arch/Kconfig b/arch/Kconfig
index e3511afbb7f2..205fd23e0cad 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -465,6 +465,38 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
irqs disabled over activate_mm. Architectures that do IPI based TLB
shootdowns should enable this.
+# Use normal mm refcounting for MMU_LAZY_TLB kernel thread references.
+# MMU_LAZY_TLB_REFCOUNT=n can improve the scalability of context switching
+# to/from kernel threads when the same mm is running on a lot of CPUs (a large
+# multi-threaded application), by reducing contention on the mm refcount.
+#
+# This can be disabled if the architecture ensures no CPUs are using an mm as a
+# "lazy tlb" beyond its final refcount (i.e., by the time __mmdrop frees the mm
+# or its kernel page tables). This could be arranged by arch_exit_mmap(), or
+# final exit(2) TLB flush, for example.
+#
+# To implement this, an arch *must*:
+# Ensure the _lazy_tlb variants of mmgrab/mmdrop are used when manipulating
+# the lazy tlb reference of a kthread's ->active_mm (non-arch code has been
+# converted already).
+config MMU_LAZY_TLB_REFCOUNT
+ def_bool y
+ depends on !MMU_LAZY_TLB_SHOOTDOWN
+
+# This option allows MMU_LAZY_TLB_REFCOUNT=n. It ensures no CPUs are using an
+# mm as a lazy tlb beyond its last reference count, by shooting down these
+# users before the mm is deallocated. __mmdrop() first IPIs all CPUs that may
+# be using the mm as a lazy tlb, so that they may switch themselves to using
+# init_mm for their active mm. mm_cpumask(mm) is used to determine which CPUs
+# may be using mm as a lazy tlb mm.
+#
+# To implement this, an arch *must*:
+# - At the time of the final mmdrop of the mm, ensure mm_cpumask(mm) contains
+# at least all possible CPUs in which the mm is lazy.
+# - It must meet the requirements for MMU_LAZY_TLB_REFCOUNT=n (see above).
+config MMU_LAZY_TLB_SHOOTDOWN
+ bool
+
config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index d9a13ccf89a3..ab6d701365bb 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -556,7 +556,7 @@ endmenu # "ARC Architecture Configuration"
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
- default "12" if ARC_HUGEPAGE_16M
- default "11"
+ default "11" if ARC_HUGEPAGE_16M
+ default "10"
source "kernel/power/Kconfig"
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e24a9820e12f..929e646e84b9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1355,9 +1355,9 @@ config ARM_MODULE_PLTS
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
- default "12" if SOC_AM33XX
- default "9" if SA1111
- default "11"
+ default "11" if SOC_AM33XX
+ default "8" if SA1111
+ default "10"
help
The kernel memory allocator divides physically contiguous memory
blocks into "zones", where each zone is a power of two number of
@@ -1366,9 +1366,6 @@ config ARCH_FORCE_MAX_ORDER
blocks of physically contiguous memory, then you may need to
increase this value.
- This config option is actually maximum order plus one. For example,
- a value of 11 means that the largest free memory block is 2^10 pages.
-
config ALIGNMENT_TRAP
def_bool CPU_CP15_MMU
select HAVE_PROC_CPU if PROC_FS
diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig
index 6dc6fed12af8..345a67e67dbd 100644
--- a/arch/arm/configs/imx_v6_v7_defconfig
+++ b/arch/arm/configs/imx_v6_v7_defconfig
@@ -31,7 +31,7 @@ CONFIG_SOC_VF610=y
CONFIG_SMP=y
CONFIG_ARM_PSCI=y
CONFIG_HIGHMEM=y
-CONFIG_ARCH_FORCE_MAX_ORDER=14
+CONFIG_ARCH_FORCE_MAX_ORDER=13
CONFIG_CMDLINE="noinitrd console=ttymxc0,115200"
CONFIG_KEXEC=y
CONFIG_CPU_FREQ=y
diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig
index bd29e5012cb0..385ad0f391a8 100644
--- a/arch/arm/configs/milbeaut_m10v_defconfig
+++ b/arch/arm/configs/milbeaut_m10v_defconfig
@@ -26,7 +26,7 @@ CONFIG_THUMB2_KERNEL=y
# CONFIG_THUMB2_AVOID_R_ARM_THM_JUMP11 is not set
# CONFIG_ARM_PATCH_IDIV is not set
CONFIG_HIGHMEM=y
-CONFIG_ARCH_FORCE_MAX_ORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=11
CONFIG_SECCOMP=y
CONFIG_KEXEC=y
CONFIG_EFI=y
diff --git a/arch/arm/configs/oxnas_v6_defconfig b/arch/arm/configs/oxnas_v6_defconfig
index 70a67b3fc91b..90779812c6dd 100644
--- a/arch/arm/configs/oxnas_v6_defconfig
+++ b/arch/arm/configs/oxnas_v6_defconfig
@@ -12,7 +12,7 @@ CONFIG_ARCH_OXNAS=y
CONFIG_MACH_OX820=y
CONFIG_SMP=y
CONFIG_NR_CPUS=16
-CONFIG_ARCH_FORCE_MAX_ORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=11
CONFIG_SECCOMP=y
CONFIG_ARM_APPENDED_DTB=y
CONFIG_ARM_ATAG_DTB_COMPAT=y
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index e656d3af2266..b46e39369dbb 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -20,7 +20,7 @@ CONFIG_PXA_SHARPSL=y
CONFIG_MACH_AKITA=y
CONFIG_MACH_BORZOI=y
CONFIG_AEABI=y
-CONFIG_ARCH_FORCE_MAX_ORDER=9
+CONFIG_ARCH_FORCE_MAX_ORDER=8
CONFIG_CMDLINE="root=/dev/ram0 ro"
CONFIG_KEXEC=y
CONFIG_CPU_FREQ=y
diff --git a/arch/arm/configs/sama7_defconfig b/arch/arm/configs/sama7_defconfig
index 0d964c613d71..954112041403 100644
--- a/arch/arm/configs/sama7_defconfig
+++ b/arch/arm/configs/sama7_defconfig
@@ -19,7 +19,7 @@ CONFIG_ATMEL_CLOCKSOURCE_TCB=y
# CONFIG_CACHE_L2X0 is not set
# CONFIG_ARM_PATCH_IDIV is not set
# CONFIG_CPU_SW_DOMAIN_PAN is not set
-CONFIG_ARCH_FORCE_MAX_ORDER=15
+CONFIG_ARCH_FORCE_MAX_ORDER=14
CONFIG_UACCESS_WITH_MEMCPY=y
# CONFIG_ATAGS is not set
CONFIG_CMDLINE="console=ttyS0,115200 earlyprintk ignore_loglevel"
diff --git a/arch/arm/configs/sp7021_defconfig b/arch/arm/configs/sp7021_defconfig
index 5bca2eb59b86..c6448ac860b6 100644
--- a/arch/arm/configs/sp7021_defconfig
+++ b/arch/arm/configs/sp7021_defconfig
@@ -17,7 +17,7 @@ CONFIG_ARCH_SUNPLUS=y
# CONFIG_VDSO is not set
CONFIG_SMP=y
CONFIG_THUMB2_KERNEL=y
-CONFIG_ARCH_FORCE_MAX_ORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=11
CONFIG_VFP=y
CONFIG_NEON=y
CONFIG_MODULES=y
diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 53813f9464a2..c30df1097c52 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,7 +253,7 @@ static int ecard_init_mm(void)
current->mm = mm;
current->active_mm = mm;
activate_mm(active_mm, mm);
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
ecard_init_pgtables(mm);
return 0;
}
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1023e896d46b..e60baf7859d1 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -95,6 +95,7 @@ config ARM64
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_SUPPORTS_PAGE_TABLE_CHECK
+ select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
@@ -1476,22 +1477,22 @@ config XEN
# include/linux/mmzone.h requires the following to be true:
#
-# MAX_ORDER - 1 + PAGE_SHIFT <= SECTION_SIZE_BITS
+# MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS
#
-# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS + 1 - PAGE_SHIFT:
+# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT:
#
# | SECTION_SIZE_BITS | PAGE_SHIFT | max MAX_ORDER | default MAX_ORDER |
# ----+-------------------+--------------+-----------------+--------------------+
-# 4K | 27 | 12 | 16 | 11 |
-# 16K | 27 | 14 | 14 | 12 |
-# 64K | 29 | 16 | 14 | 14 |
+# 4K | 27 | 12 | 15 | 10 |
+# 16K | 27 | 14 | 13 | 11 |
+# 64K | 29 | 16 | 13 | 13 |
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order" if ARM64_4K_PAGES || ARM64_16K_PAGES
- default "14" if ARM64_64K_PAGES
- range 12 14 if ARM64_16K_PAGES
- default "12" if ARM64_16K_PAGES
- range 11 16 if ARM64_4K_PAGES
- default "11"
+ default "13" if ARM64_64K_PAGES
+ range 11 13 if ARM64_16K_PAGES
+ default "11" if ARM64_16K_PAGES
+ range 10 15 if ARM64_4K_PAGES
+ default "10"
help
The kernel memory allocator divides physically contiguous memory
blocks into "zones", where each zone is a power of two number of
@@ -1500,14 +1501,11 @@ config ARCH_FORCE_MAX_ORDER
blocks of physically contiguous memory, then you may need to
increase this value.
- This config option is actually maximum order plus one. For example,
- a value of 11 means that the largest free memory block is 2^10 pages.
-
We make sure that we can allocate up to a HugePage size for each configuration.
Hence we have :
- MAX_ORDER = (PMD_SHIFT - PAGE_SHIFT) + 1 => PAGE_SHIFT - 2
+ MAX_ORDER = PMD_SHIFT - PAGE_SHIFT => PAGE_SHIFT - 3
- However for 4K, we choose a higher default value, 11 as opposed to 10, giving us
+ However for 4K, we choose a higher default value, 10 as opposed to 9, giving us
4M allocations matching the default size used by generic code.
config UNMAP_KERNEL_AT_EL0
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 78e5163836a0..05e42bd3555f 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -261,9 +261,11 @@ static inline const void *__tag_set(const void *addr, u8 tag)
}
#ifdef CONFIG_KASAN_HW_TAGS
-#define arch_enable_tagging_sync() mte_enable_kernel_sync()
-#define arch_enable_tagging_async() mte_enable_kernel_async()
-#define arch_enable_tagging_asymm() mte_enable_kernel_asymm()
+#define arch_enable_tag_checks_sync() mte_enable_kernel_sync()
+#define arch_enable_tag_checks_async() mte_enable_kernel_async()
+#define arch_enable_tag_checks_asymm() mte_enable_kernel_asymm()
+#define arch_suppress_tag_checks_start() mte_enable_tco()
+#define arch_suppress_tag_checks_stop() mte_disable_tco()
#define arch_force_async_tag_fault() mte_check_tfsr_exit()
#define arch_get_random_tag() mte_get_random_tag()
#define arch_get_mem_tag(addr) mte_get_mem_tag(addr)
diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index 9f79425fc65a..2e98028c1965 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -13,9 +13,74 @@
#include <linux/types.h>
+#ifdef CONFIG_KASAN_HW_TAGS
+
+/* Whether the MTE asynchronous or asymmetric mode is enabled. */
+DECLARE_STATIC_KEY_FALSE(mte_async_or_asymm_mode);
+
+static inline bool system_uses_mte_async_or_asymm_mode(void)
+{
+ return static_branch_unlikely(&mte_async_or_asymm_mode);
+}
+
+#else /* CONFIG_KASAN_HW_TAGS */
+
+static inline bool system_uses_mte_async_or_asymm_mode(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_KASAN_HW_TAGS */
+
#ifdef CONFIG_ARM64_MTE
/*
+ * The Tag Check Flag (TCF) mode for MTE is per EL, hence TCF0
+ * affects EL0 and TCF affects EL1 irrespective of which TTBR is
+ * used.
+ * The kernel accesses TTBR0 usually with LDTR/STTR instructions
+ * when UAO is available, so these would act as EL0 accesses using
+ * TCF0.
+ * However futex.h code uses exclusives which would be executed as
+ * EL1, this can potentially cause a tag check fault even if the
+ * user disables TCF0.
+ *
+ * To address the problem we set the PSTATE.TCO bit in
+ * uaccess_enable_privileged() and reset it in uaccess_disable_privileged().
+ *
+ * The Tag Check Override (TCO) bit temporarily disables tag checking,
+ * preventing the issue.
+ */
+static inline void mte_disable_tco(void)
+{
+ asm volatile(ALTERNATIVE("nop", SET_PSTATE_TCO(0),
+ ARM64_MTE, CONFIG_KASAN_HW_TAGS));
+}
+
+static inline void mte_enable_tco(void)
+{
+ asm volatile(ALTERNATIVE("nop", SET_PSTATE_TCO(1),
+ ARM64_MTE, CONFIG_KASAN_HW_TAGS));
+}
+
+/*
+ * These functions disable tag checking only if in MTE async or asymm
+ * mode, since the sync mode generates exceptions synchronously and the
+ * nofault or load_unaligned_zeropad can handle them.
+ */
+static inline void __mte_disable_tco_async(void)
+{
+ if (system_uses_mte_async_or_asymm_mode())
+ mte_disable_tco();
+}
+
+static inline void __mte_enable_tco_async(void)
+{
+ if (system_uses_mte_async_or_asymm_mode())
+ mte_enable_tco();
+}
+
+/*
* These functions are meant to be only used from KASAN runtime through
* the arch_*() interface defined in asm/memory.h.
* These functions don't include system_supports_mte() checks,
@@ -138,6 +203,22 @@ void mte_enable_kernel_asymm(void);
#else /* CONFIG_ARM64_MTE */
+static inline void mte_disable_tco(void)
+{
+}
+
+static inline void mte_enable_tco(void)
+{
+}
+
+static inline void __mte_disable_tco_async(void)
+{
+}
+
+static inline void __mte_enable_tco_async(void)
+{
+}
+
static inline u8 mte_get_ptr_tag(void *ptr)
{
return 0xFF;
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 20dd06d70af5..c028afb1cd0b 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -178,14 +178,6 @@ static inline void mte_disable_tco_entry(struct task_struct *task)
}
#ifdef CONFIG_KASAN_HW_TAGS
-/* Whether the MTE asynchronous mode is enabled. */
-DECLARE_STATIC_KEY_FALSE(mte_async_or_asymm_mode);
-
-static inline bool system_uses_mte_async_or_asymm_mode(void)
-{
- return static_branch_unlikely(&mte_async_or_asymm_mode);
-}
-
void mte_check_tfsr_el1(void);
static inline void mte_check_tfsr_entry(void)
@@ -212,10 +204,6 @@ static inline void mte_check_tfsr_exit(void)
mte_check_tfsr_el1();
}
#else
-static inline bool system_uses_mte_async_or_asymm_mode(void)
-{
- return false;
-}
static inline void mte_check_tfsr_el1(void)
{
}
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b6ba466e2e8a..0bd18de9fd97 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -57,7 +57,7 @@ static inline bool arch_thp_swp_supported(void)
* fault on one CPU which has been handled concurrently by another CPU
* does not need to perform additional invalidation.
*/
-#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
+#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
/*
* ZERO_PAGE is a global shared page that is always zero: used
diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h
index 4b73463423c3..5f5437621029 100644
--- a/arch/arm64/include/asm/sparsemem.h
+++ b/arch/arm64/include/asm/sparsemem.h
@@ -10,7 +10,7 @@
/*
* Section size must be at least 512MB for 64K base
* page size config. Otherwise it will be less than
- * (MAX_ORDER - 1) and the build process will fail.
+ * MAX_ORDER and the build process will fail.
*/
#ifdef CONFIG_ARM64_64K_PAGES
#define SECTION_SIZE_BITS 29
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 5c7b2f9d5913..30ea7b5c3ccb 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -136,55 +136,9 @@ static inline void __uaccess_enable_hw_pan(void)
CONFIG_ARM64_PAN));
}
-/*
- * The Tag Check Flag (TCF) mode for MTE is per EL, hence TCF0
- * affects EL0 and TCF affects EL1 irrespective of which TTBR is
- * used.
- * The kernel accesses TTBR0 usually with LDTR/STTR instructions
- * when UAO is available, so these would act as EL0 accesses using
- * TCF0.
- * However futex.h code uses exclusives which would be executed as
- * EL1, this can potentially cause a tag check fault even if the
- * user disables TCF0.
- *
- * To address the problem we set the PSTATE.TCO bit in uaccess_enable()
- * and reset it in uaccess_disable().
- *
- * The Tag check override (TCO) bit disables temporarily the tag checking
- * preventing the issue.
- */
-static inline void __uaccess_disable_tco(void)
-{
- asm volatile(ALTERNATIVE("nop", SET_PSTATE_TCO(0),
- ARM64_MTE, CONFIG_KASAN_HW_TAGS));
-}
-
-static inline void __uaccess_enable_tco(void)
-{
- asm volatile(ALTERNATIVE("nop", SET_PSTATE_TCO(1),
- ARM64_MTE, CONFIG_KASAN_HW_TAGS));
-}
-
-/*
- * These functions disable tag checking only if in MTE async mode
- * since the sync mode generates exceptions synchronously and the
- * nofault or load_unaligned_zeropad can handle them.
- */
-static inline void __uaccess_disable_tco_async(void)
-{
- if (system_uses_mte_async_or_asymm_mode())
- __uaccess_disable_tco();
-}
-
-static inline void __uaccess_enable_tco_async(void)
-{
- if (system_uses_mte_async_or_asymm_mode())
- __uaccess_enable_tco();
-}
-
static inline void uaccess_disable_privileged(void)
{
- __uaccess_disable_tco();
+ mte_disable_tco();
if (uaccess_ttbr0_disable())
return;
@@ -194,7 +148,7 @@ static inline void uaccess_disable_privileged(void)
static inline void uaccess_enable_privileged(void)
{
- __uaccess_enable_tco();
+ mte_enable_tco();
if (uaccess_ttbr0_enable())
return;
@@ -302,8 +256,8 @@ do { \
#define get_user __get_user
/*
- * We must not call into the scheduler between __uaccess_enable_tco_async() and
- * __uaccess_disable_tco_async(). As `dst` and `src` may contain blocking
+ * We must not call into the scheduler between __mte_enable_tco_async() and
+ * __mte_disable_tco_async(). As `dst` and `src` may contain blocking
* functions, we must evaluate these outside of the critical section.
*/
#define __get_kernel_nofault(dst, src, type, err_label) \
@@ -312,10 +266,10 @@ do { \
__typeof__(src) __gkn_src = (src); \
int __gkn_err = 0; \
\
- __uaccess_enable_tco_async(); \
+ __mte_enable_tco_async(); \
__raw_get_mem("ldr", *((type *)(__gkn_dst)), \
(__force type *)(__gkn_src), __gkn_err, K); \
- __uaccess_disable_tco_async(); \
+ __mte_disable_tco_async(); \
\
if (unlikely(__gkn_err)) \
goto err_label; \
@@ -388,8 +342,8 @@ do { \
#define put_user __put_user
/*
- * We must not call into the scheduler between __uaccess_enable_tco_async() and
- * __uaccess_disable_tco_async(). As `dst` and `src` may contain blocking
+ * We must not call into the scheduler between __mte_enable_tco_async() and
+ * __mte_disable_tco_async(). As `dst` and `src` may contain blocking
* functions, we must evaluate these outside of the critical section.
*/
#define __put_kernel_nofault(dst, src, type, err_label) \
@@ -398,10 +352,10 @@ do { \
__typeof__(src) __pkn_src = (src); \
int __pkn_err = 0; \
\
- __uaccess_enable_tco_async(); \
+ __mte_enable_tco_async(); \
__raw_put_mem("str", *((type *)(__pkn_src)), \
(__force type *)(__pkn_dst), __pkn_err, K); \
- __uaccess_disable_tco_async(); \
+ __mte_disable_tco_async(); \
\
if (unlikely(__pkn_err)) \
goto err_label; \
diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h
index 1c8e4f2490bf..f3b151ed0d7a 100644
--- a/arch/arm64/include/asm/word-at-a-time.h
+++ b/arch/arm64/include/asm/word-at-a-time.h
@@ -55,7 +55,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr)
{
unsigned long ret;
- __uaccess_enable_tco_async();
+ __mte_enable_tco_async();
/* Load word from unaligned pointer addr */
asm(
@@ -65,7 +65,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr)
: "=&r" (ret)
: "r" (addr), "Q" (*(unsigned long *)addr));
- __uaccess_disable_tco_async();
+ __mte_disable_tco_async();
return ret;
}
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index 0a048dc06a7d..fe5472a184a3 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -16,7 +16,7 @@ struct hyp_pool {
* API at EL2.
*/
hyp_spinlock_t lock;
- struct list_head free_area[MAX_ORDER];
+ struct list_head free_area[MAX_ORDER + 1];
phys_addr_t range_start;
phys_addr_t range_end;
unsigned short max_order;
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index 803ba3222e75..b1e392186a0f 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -110,7 +110,7 @@ static void __hyp_attach_page(struct hyp_pool *pool,
* after coalescing, so make sure to mark it HYP_NO_ORDER proactively.
*/
p->order = HYP_NO_ORDER;
- for (; (order + 1) < pool->max_order; order++) {
+ for (; (order + 1) <= pool->max_order; order++) {
buddy = __find_buddy_avail(pool, p, order);
if (!buddy)
break;
@@ -203,9 +203,9 @@ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order)
hyp_spin_lock(&pool->lock);
/* Look for a high-enough-order page */
- while (i < pool->max_order && list_empty(&pool->free_area[i]))
+ while (i <= pool->max_order && list_empty(&pool->free_area[i]))
i++;
- if (i >= pool->max_order) {
+ if (i > pool->max_order) {
hyp_spin_unlock(&pool->lock);
return NULL;
}
@@ -228,8 +228,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
int i;
hyp_spin_lock_init(&pool->lock);
- pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT));
- for (i = 0; i < pool->max_order; i++)
+ pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT));
+ for (i = 0; i <= pool->max_order; i++)
INIT_LIST_HEAD(&pool->free_area[i]);
pool->range_start = phys;
pool->range_end = phys + (nr_pages << PAGE_SHIFT);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index f4cb0f85ccf4..9e0db5c387e3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -535,6 +535,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
unsigned long vm_flags;
unsigned int mm_flags = FAULT_FLAG_DEFAULT;
unsigned long addr = untagged_addr(far);
+#ifdef CONFIG_PER_VMA_LOCK
+ struct vm_area_struct *vma;
+#endif
if (kprobe_page_fault(regs, esr))
return 0;
@@ -585,6 +588,36 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+#ifdef CONFIG_PER_VMA_LOCK
+ if (!(mm_flags & FAULT_FLAG_USER))
+ goto lock_mmap;
+
+ vma = lock_vma_under_rcu(mm, addr);
+ if (!vma)
+ goto lock_mmap;
+
+ if (!(vma->vm_flags & vm_flags)) {
+ vma_end_read(vma);
+ goto lock_mmap;
+ }
+ fault = handle_mm_fault(vma, addr & PAGE_MASK,
+ mm_flags | FAULT_FLAG_VMA_LOCK, regs);
+ vma_end_read(vma);
+
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+ if (!user_mode(regs))
+ goto no_context;
+ return 0;
+ }
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
/*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
@@ -628,6 +661,9 @@ retry:
}
mmap_read_unlock(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
/*
* Handle the "normal" (no error) case first.
*/
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index dba02da6fa34..c694fac43bed 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -334,7 +334,7 @@ config HIGHMEM
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
- default "11"
+ default "10"
config DRAM_BASE
hex "DRAM start addr (the same with memory-section in dts)"
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index d7e4a24e8644..0d2f41fa56ee 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -202,10 +202,10 @@ config IA64_CYCLONE
If you're unsure, answer N.
config ARCH_FORCE_MAX_ORDER
- int "MAX_ORDER (11 - 17)" if !HUGETLB_PAGE
- range 11 17 if !HUGETLB_PAGE
- default "17" if HUGETLB_PAGE
- default "11"
+ int "MAX_ORDER (10 - 16)" if !HUGETLB_PAGE
+ range 10 16 if !HUGETLB_PAGE
+ default "16" if HUGETLB_PAGE
+ default "10"
config SMP
bool "Symmetric multi-processing support"
diff --git a/arch/ia64/include/asm/sparsemem.h b/arch/ia64/include/asm/sparsemem.h
index 84e8ce387b69..a58f8b466d96 100644
--- a/arch/ia64/include/asm/sparsemem.h
+++ b/arch/ia64/include/asm/sparsemem.h
@@ -12,9 +12,9 @@
#define SECTION_SIZE_BITS (30)
#define MAX_PHYSMEM_BITS (50)
#ifdef CONFIG_ARCH_FORCE_MAX_ORDER
-#if ((CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS)
+#if (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT > SECTION_SIZE_BITS)
#undef SECTION_SIZE_BITS
-#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER - 1 + PAGE_SHIFT)
+#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT)
#endif
#endif
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 380d2f3966c9..e8dd4323fb86 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -170,7 +170,7 @@ static int __init hugetlb_setup_sz(char *str)
size = memparse(str, &str);
if (*str || !is_power_of_2(size) || !(tr_pages & size) ||
size <= PAGE_SIZE ||
- size >= (1UL << PAGE_SHIFT << MAX_ORDER)) {
+ size > (1UL << PAGE_SHIFT << MAX_ORDER)) {
printk(KERN_WARNING "Invalid huge page size specified\n");
return 1;
}
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 7fd51257e0ed..e1e3a3828962 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -420,12 +420,9 @@ config NODES_SHIFT
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
- range 14 64 if PAGE_SIZE_64KB
- default "14" if PAGE_SIZE_64KB
- range 12 64 if PAGE_SIZE_16KB
- default "12" if PAGE_SIZE_16KB
- range 11 64
- default "11"
+ default "13" if PAGE_SIZE_64KB
+ default "11" if PAGE_SIZE_16KB
+ default "10"
help
The kernel memory allocator divides physically contiguous memory
blocks into "zones", where each zone is a power of two number of
@@ -434,9 +431,6 @@ config ARCH_FORCE_MAX_ORDER
blocks of physically contiguous memory, then you may need to
increase this value.
- This config option is actually maximum order plus one. For example,
- a value of 11 means that the largest free memory block is 2^10 pages.
-
The page size is not necessarily 4KB. Keep this in mind
when choosing a value for this option.
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 9380f6e3bb66..c9df6572133f 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -400,7 +400,7 @@ config SINGLE_MEMORY_CHUNK
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order" if ADVANCED
depends on !SINGLE_MEMORY_CHUNK
- default "11"
+ default "10"
help
The kernel memory allocator divides physically contiguous memory
blocks into "zones", where each zone is a power of two number of
@@ -413,9 +413,6 @@ config ARCH_FORCE_MAX_ORDER
value also defines the minimal size of the hole that allows
freeing unused memory map.
- This config option is actually maximum order plus one. For example,
- a value of 11 means that the largest free memory block is 2^10 pages.
-
config 060_WRITETHROUGH
bool "Use write-through caching for 68060 supervisor accesses"
depends on ADVANCED && M68060
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index e2f3ca73f40d..a0f6e9d0a561 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2137,14 +2137,10 @@ endchoice
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
- range 14 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
- default "14" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
- range 13 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB
- default "13" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB
- range 12 64 if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB
- default "12" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB
- range 0 64
- default "11"
+ default "13" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_64KB
+ default "12" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_32KB
+ default "11" if MIPS_HUGE_TLB_SUPPORT && PAGE_SIZE_16KB
+ default "10"
help
The kernel memory allocator divides physically contiguous memory
blocks into "zones", where each zone is a power of two number of
@@ -2153,9 +2149,6 @@ config ARCH_FORCE_MAX_ORDER
blocks of physically contiguous memory, then you may need to
increase this value.
- This config option is actually maximum order plus one. For example,
- a value of 11 means that the largest free memory block is 2^10 pages.
-
The page size is not necessarily 4KB. Keep this in mind
when choosing a value for this option.
diff --git a/arch/mips/include/asm/fixmap.h b/arch/mips/include/asm/fixmap.h
index beea14761cef..b037718d7e8b 100644
--- a/arch/mips/include/asm/fixmap.h
+++ b/arch/mips/include/asm/fixmap.h
@@ -70,7 +70,7 @@ enum fixed_addresses {
#include <asm-generic/fixmap.h>
/*
- * Called from pgtable_init()
+ * Called from pagetable_init()
*/
extern void fixrange_init(unsigned long start, unsigned long end,
pgd_t *pgd_base);
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 791389bf3c12..574fa14ac8b2 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -469,7 +469,8 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
}
static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address,
+ pte_t *ptep)
{
}
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index a582f72104f3..89708b95978c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -46,8 +46,8 @@ source "kernel/Kconfig.hz"
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
- range 9 20
- default "11"
+ range 8 19
+ default "10"
help
The kernel memory allocator divides physically contiguous memory
blocks into "zones", where each zone is a power of two number of
@@ -56,9 +56,6 @@ config ARCH_FORCE_MAX_ORDER
blocks of physically contiguous memory, then you may need to
increase this value.
- This config option is actually maximum order plus one. For example,
- a value of 11 means that the largest free memory block is 2^10 pages.
-
endmenu
source "arch/nios2/platform/Kconfig.platform"
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a6c4407d3ec8..24d56536b269 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -266,6 +266,7 @@ config PPC
select MMU_GATHER_PAGE_SIZE
select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_MERGE_VMAS
+ select MMU_LAZY_TLB_SHOOTDOWN if PPC_BOOK3S_64
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE
select NEED_PER_CPU_EMBED_FIRST_CHUNK if PPC64
@@ -896,18 +897,18 @@ config DATA_SHIFT
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
- range 8 9 if PPC64 && PPC_64K_PAGES
- default "9" if PPC64 && PPC_64K_PAGES
- range 13 13 if PPC64 && !PPC_64K_PAGES
- default "13" if PPC64 && !PPC_64K_PAGES
- range 9 64 if PPC32 && PPC_16K_PAGES
- default "9" if PPC32 && PPC_16K_PAGES
- range 7 64 if PPC32 && PPC_64K_PAGES
- default "7" if PPC32 && PPC_64K_PAGES
- range 5 64 if PPC32 && PPC_256K_PAGES
- default "5" if PPC32 && PPC_256K_PAGES
- range 11 64
- default "11"
+ range 7 8 if PPC64 && PPC_64K_PAGES
+ default "8" if PPC64 && PPC_64K_PAGES
+ range 12 12 if PPC64 && !PPC_64K_PAGES
+ default "12" if PPC64 && !PPC_64K_PAGES
+ range 8 63 if PPC32 && PPC_16K_PAGES
+ default "8" if PPC32 && PPC_16K_PAGES
+ range 6 63 if PPC32 && PPC_64K_PAGES
+ default "6" if PPC32 && PPC_64K_PAGES
+ range 4 63 if PPC32 && PPC_256K_PAGES
+ default "4" if PPC32 && PPC_256K_PAGES
+ range 10 63
+ default "10"
help
The kernel memory allocator divides physically contiguous memory
blocks into "zones", where each zone is a power of two number of
@@ -916,9 +917,6 @@ config ARCH_FORCE_MAX_ORDER
blocks of physically contiguous memory, then you may need to
increase this value.
- This config option is actually maximum order plus one. For example,
- a value of 11 means that the largest free memory block is 2^10 pages.
-
The page size is not necessarily 4KB. For example, on 64-bit
systems, 64KB pages can be enabled via CONFIG_PPC_64K_PAGES. Keep
this in mind when choosing a value for this option.
diff --git a/arch/powerpc/configs/85xx/ge_imp3a_defconfig b/arch/powerpc/configs/85xx/ge_imp3a_defconfig
index ea719898b581..6cb7e90d52c1 100644
--- a/arch/powerpc/configs/85xx/ge_imp3a_defconfig
+++ b/arch/powerpc/configs/85xx/ge_imp3a_defconfig
@@ -30,7 +30,7 @@ CONFIG_PREEMPT=y
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
CONFIG_MATH_EMULATION=y
-CONFIG_ARCH_FORCE_MAX_ORDER=17
+CONFIG_ARCH_FORCE_MAX_ORDER=16
CONFIG_PCI=y
CONFIG_PCIEPORTBUS=y
CONFIG_PCI_MSI=y
diff --git a/arch/powerpc/configs/fsl-emb-nonhw.config b/arch/powerpc/configs/fsl-emb-nonhw.config
index ab8a8c4530d9..3009b0efaf34 100644
--- a/arch/powerpc/configs/fsl-emb-nonhw.config
+++ b/arch/powerpc/configs/fsl-emb-nonhw.config
@@ -41,7 +41,7 @@ CONFIG_FIXED_PHY=y
CONFIG_FONT_8x16=y
CONFIG_FONT_8x8=y
CONFIG_FONTS=y
-CONFIG_ARCH_FORCE_MAX_ORDER=13
+CONFIG_ARCH_FORCE_MAX_ORDER=12
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAME_WARN=1024
CONFIG_FTL=y
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 2bbc0fcce04a..ff7f0ee179e5 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -121,7 +121,8 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address,
+ pte_t *ptep)
{
/*
* Book3S 64 does not require spurious fault flushes because the PTE
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6b90f10a6c81..7db6b3faea65 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1611,7 +1611,7 @@ void start_secondary(void *unused)
if (IS_ENABLED(CONFIG_PPC32))
setup_kup();
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
smp_store_cpu_info(cpu);
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 7fcfba162e0d..81d7185e2ae8 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
}
mmap_read_lock(mm);
- chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) /
+ chunk = (1UL << (PAGE_SHIFT + MAX_ORDER)) /
sizeof(struct vm_area_struct *);
chunk = min(chunk, entries);
for (entry = 0; entry < entries; entry += chunk) {
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index e50bc5fc7ddf..ce804b7bf84e 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -797,10 +797,10 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
if (current->active_mm == mm) {
WARN_ON_ONCE(current->mm != NULL);
/* Is a kernel thread and is using mm as the lazy tlb */
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
switch_mm_irqs_off(mm, &init_mm, current);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
}
/*
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index af46aa88422b..531177a4ee08 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -474,6 +474,40 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (is_exec)
flags |= FAULT_FLAG_INSTRUCTION;
+#ifdef CONFIG_PER_VMA_LOCK
+ if (!(flags & FAULT_FLAG_USER))
+ goto lock_mmap;
+
+ vma = lock_vma_under_rcu(mm, address);
+ if (!vma)
+ goto lock_mmap;
+
+ if (unlikely(access_pkey_error(is_write, is_exec,
+ (error_code & DSISR_KEYFAULT), vma))) {
+ vma_end_read(vma);
+ goto lock_mmap;
+ }
+
+ if (unlikely(access_error(is_write, is_exec, vma))) {
+ vma_end_read(vma);
+ goto lock_mmap;
+ }
+
+ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+ vma_end_read(vma);
+
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+ if (fault_signal_pending(fault, regs))
+ return user_mode(regs) ? 0 : SIGBUS;
+
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
+
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -550,6 +584,9 @@ retry:
mmap_read_unlock(current->mm);
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f1ba8d1e8c1a..b900933507da 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -615,7 +615,7 @@ void __init gigantic_hugetlb_cma_reserve(void)
order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
if (order) {
- VM_WARN_ON(order < MAX_ORDER);
+ VM_WARN_ON(order <= MAX_ORDER);
hugetlb_cma_reserve(order);
}
}
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index ae248a161b43..70a46acc70d6 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -16,6 +16,7 @@ config PPC_POWERNV
select PPC_DOORBELL
select MMU_NOTIFIER
select FORCE_SMP
+ select ARCH_SUPPORTS_PER_VMA_LOCK
default y
config OPAL_PRD
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 4f6e20a35aa1..5a81f106068e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1740,7 +1740,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
* DMA window can be larger than available memory, which will
* cause errors later.
*/
- const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
+ const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER);
/*
* We create the default window as big as we can. The constraint is
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 21b22bf16ce6..4ebf2ef2845d 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -22,6 +22,7 @@ config PPC_PSERIES
select HOTPLUG_CPU
select FORCE_SMP
select SWIOTLB
+ select ARCH_SUPPORTS_PER_VMA_LOCK
default y
config PARAVIRT
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 9809c74e1240..548b5b587003 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -120,6 +120,7 @@ config S390
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
select ARCH_SUPPORTS_HUGETLBFS
select ARCH_SUPPORTS_NUMA_BALANCING
+ select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2c70b4d1263d..c1f6b46ec555 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1239,7 +1239,8 @@ static inline int pte_allow_rdp(pte_t old, pte_t new)
}
static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address,
+ pte_t *ptep)
{
/*
* RDP might not have propagated the PTE protection reset to all CPUs,
@@ -1247,11 +1248,12 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
* NOTE: This will also be called when a racing pagetable update on
* another thread already installed the correct PTE. Both cases cannot
* really be distinguished.
- * Therefore, only do the local TLB flush when RDP can be used, to avoid
- * unnecessary overhead.
+ * Therefore, only do the local TLB flush when RDP can be used, and the
+ * PTE does not have _PAGE_PROTECT set, to avoid unnecessary overhead.
+ * A local RDP can be used to do the flush.
*/
- if (MACHINE_HAS_RDP)
- asm volatile("ptlb" : : : "memory");
+ if (MACHINE_HAS_RDP && !(pte_val(*ptep) & _PAGE_PROTECT))
+ __ptep_rdp(address, ptep, 0, 0, 1);
}
#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index a2632fd97d00..b65144c392b0 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -407,6 +407,30 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
access = VM_WRITE;
if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
+#ifdef CONFIG_PER_VMA_LOCK
+ if (!(flags & FAULT_FLAG_USER))
+ goto lock_mmap;
+ vma = lock_vma_under_rcu(mm, address);
+ if (!vma)
+ goto lock_mmap;
+ if (!(vma->vm_flags & access)) {
+ vma_end_read(vma);
+ goto lock_mmap;
+ }
+ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+ vma_end_read(vma);
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto out;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+ fault = VM_FAULT_SIGNAL;
+ goto out;
+ }
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
mmap_read_lock(mm);
gmap = NULL;
diff --git a/arch/sh/configs/ecovec24_defconfig b/arch/sh/configs/ecovec24_defconfig
index b52e14ccb450..4d655e8d4d74 100644
--- a/arch/sh/configs/ecovec24_defconfig
+++ b/arch/sh/configs/ecovec24_defconfig
@@ -8,7 +8,7 @@ CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
CONFIG_CPU_SUBTYPE_SH7724=y
-CONFIG_ARCH_FORCE_MAX_ORDER=12
+CONFIG_ARCH_FORCE_MAX_ORDER=11
CONFIG_MEMORY_SIZE=0x10000000
CONFIG_FLATMEM_MANUAL=y
CONFIG_SH_ECOVEC=y
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index 411fdc0901f7..40271090bd7d 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -20,13 +20,13 @@ config PAGE_OFFSET
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
- range 9 64 if PAGE_SIZE_16KB
- default "9" if PAGE_SIZE_16KB
- range 7 64 if PAGE_SIZE_64KB
- default "7" if PAGE_SIZE_64KB
- range 11 64
- default "14" if !MMU
- default "11"
+ range 8 63 if PAGE_SIZE_16KB
+ default "8" if PAGE_SIZE_16KB
+ range 6 63 if PAGE_SIZE_64KB
+ default "6" if PAGE_SIZE_64KB
+ range 10 63
+ default "13" if !MMU
+ default "10"
help
The kernel memory allocator divides physically contiguous memory
blocks into "zones", where each zone is a power of two number of
@@ -35,9 +35,6 @@ config ARCH_FORCE_MAX_ORDER
blocks of physically contiguous memory, then you may need to
increase this value.
- This config option is actually maximum order plus one. For example,
- a value of 11 means that the largest free memory block is 2^10 pages.
-
The page size is not necessarily 4KB. Keep this in mind when
choosing a value for this option.
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 84437a4c6545..e3242bf5a8df 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -271,7 +271,7 @@ config ARCH_SPARSEMEM_DEFAULT
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
- default "13"
+ default "12"
help
The kernel memory allocator divides physically contiguous memory
blocks into "zones", where each zone is a power of two number of
@@ -280,9 +280,6 @@ config ARCH_FORCE_MAX_ORDER
blocks of physically contiguous memory, then you may need to
increase this value.
- This config option is actually maximum order plus one. For example,
- a value of 13 means that the largest free memory block is 2^12 pages.
-
if SPARC64 || COMPILE_TEST
source "kernel/power/Kconfig"
endif
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 384480971805..7d91ca6aa675 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -193,7 +193,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size,
size = IO_PAGE_ALIGN(size);
order = get_order(size);
- if (unlikely(order >= MAX_ORDER))
+ if (unlikely(order > MAX_ORDER))
return NULL;
npages = size >> IO_PAGE_SHIFT;
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 5b4de4a89dec..08ffd17d5ec3 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void)
/* Now allocate error trap reporting scoreboard. */
sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info));
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
if ((PAGE_SIZE << order) >= sz)
break;
}
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index 912205787161..5e2931a18409 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -402,8 +402,8 @@ void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss)
unsigned long new_rss_limit;
gfp_t gfp_flags;
- if (max_tsb_size > (PAGE_SIZE << MAX_ORDER))
- max_tsb_size = (PAGE_SIZE << MAX_ORDER);
+ if (max_tsb_size > PAGE_SIZE << MAX_ORDER)
+ max_tsb_size = PAGE_SIZE << MAX_ORDER;
new_cache_index = 0;
for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) {
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a825bf031f49..df21fba77db1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86_64
# Options that are inherently 64-bit kernel only:
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
+ select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 7425f32e5293..15ae4d6ba476 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1097,7 +1097,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
}
-#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
+#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index c1e14cee0722..857d364b9888 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -15,24 +15,18 @@
#endif
#define __HAVE_ARCH_MEMCPY 1
-#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY)
-#undef memcpy
-#define memcpy __msan_memcpy
-#else
extern void *memcpy(void *to, const void *from, size_t len);
-#endif
extern void *__memcpy(void *to, const void *from, size_t len);
#define __HAVE_ARCH_MEMSET
-#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY)
-extern void *__msan_memset(void *s, int c, size_t n);
-#undef memset
-#define memset __msan_memset
-#else
void *memset(void *s, int c, size_t n);
-#endif
void *__memset(void *s, int c, size_t n);
+/*
+ * KMSAN needs to instrument as much code as possible. Use C versions of
+ * memsetXX() from lib/string.c under KMSAN.
+ */
+#if !defined(CONFIG_KMSAN)
#define __HAVE_ARCH_MEMSET16
static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
{
@@ -68,15 +62,10 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
: "memory");
return s;
}
+#endif
#define __HAVE_ARCH_MEMMOVE
-#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY)
-#undef memmove
-void *__msan_memmove(void *dest, const void *src, size_t len);
-#define memmove __msan_memmove
-#else
void *memmove(void *dest, const void *src, size_t count);
-#endif
void *__memmove(void *dest, const void *src, size_t count);
int memcmp(const void *cs, const void *ct, size_t count);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a498ae1fbe66..e4399983c50c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -19,6 +19,7 @@
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
+#include <linux/mm.h> /* find_and_lock_vma() */
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -1333,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs,
}
#endif
+#ifdef CONFIG_PER_VMA_LOCK
+ if (!(flags & FAULT_FLAG_USER))
+ goto lock_mmap;
+
+ vma = lock_vma_under_rcu(mm, address);
+ if (!vma)
+ goto lock_mmap;
+
+ if (unlikely(access_error(error_code, vma))) {
+ vma_end_read(vma);
+ goto lock_mmap;
+ }
+ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+ vma_end_read(vma);
+
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+ if (!user_mode(regs))
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGBUS, BUS_ADRERR,
+ ARCH_DEFAULT_PKEY);
+ return;
+ }
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
+
/*
* Kernel-mode access to the user address space should only occur
* on well-defined single instructions listed in the exception
@@ -1433,6 +1466,9 @@ good_area:
}
mmap_read_unlock(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
if (likely(!(fault & VM_FAULT_ERROR)))
return;
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 46a00aa858b6..de10800cd4dd 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -1073,11 +1073,15 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
}
/*
- * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
- * with the old vma after its pfnmap page table has been removed. The new
- * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
+ * untrack_pfn_clear is called if the following situation fits:
+ *
+ * 1) while mremapping a pfnmap for a new region, with the old vma after
+ * its pfnmap page table has been removed. The new vma has a new pfnmap
+ * to the same pfn & cache type with VM_PAT set.
+ * 2) while duplicating vm area, the new vma fails to copy the pgtable from
+ * old vma.
*/
-void untrack_pfn_moved(struct vm_area_struct *vma)
+void untrack_pfn_clear(struct vm_area_struct *vma)
{
vm_flags_clear(vma, VM_PAT);
}
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index bcb0c5d2abc2..3eee334ba873 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -773,7 +773,7 @@ config HIGHMEM
config ARCH_FORCE_MAX_ORDER
int "Maximum zone order"
- default "11"
+ default "10"
help
The kernel memory allocator divides physically contiguous memory
blocks into "zones", where each zone is a power of two number of
@@ -782,9 +782,6 @@ config ARCH_FORCE_MAX_ORDER
blocks of physically contiguous memory, then you may need to
increase this value.
- This config option is actually maximum order plus one. For example,
- a value of 11 means that the largest free memory block is 2^10 pages.
-
endmenu
menu "Power management options"
diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c
index 817eda2075aa..c491fabe3617 100644
--- a/drivers/base/regmap/regmap-debugfs.c
+++ b/drivers/base/regmap/regmap-debugfs.c
@@ -226,8 +226,8 @@ static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from,
if (*ppos < 0 || !count)
return -EINVAL;
- if (count > (PAGE_SIZE << (MAX_ORDER - 1)))
- count = PAGE_SIZE << (MAX_ORDER - 1);
+ if (count > (PAGE_SIZE << MAX_ORDER))
+ count = PAGE_SIZE << MAX_ORDER;
buf = kmalloc(count, GFP_KERNEL);
if (!buf)
@@ -373,8 +373,8 @@ static ssize_t regmap_reg_ranges_read_file(struct file *file,
if (*ppos < 0 || !count)
return -EINVAL;
- if (count > (PAGE_SIZE << (MAX_ORDER - 1)))
- count = PAGE_SIZE << (MAX_ORDER - 1);
+ if (count > (PAGE_SIZE << MAX_ORDER))
+ count = PAGE_SIZE << MAX_ORDER;
buf = kmalloc(count, GFP_KERNEL);
if (!buf)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 487840e3564d..cec2c20f5e59 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3108,7 +3108,7 @@ loop:
ptr->resultcode = 0;
if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) {
- if (ptr->length <= 0 || ptr->length >= MAX_LEN)
+ if (ptr->length <= 0 || ptr->length > MAX_LEN)
return -EINVAL;
ptr->kernel_data = (char *)fd_dma_mem_alloc(ptr->length);
fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length);
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index e2f25926eb51..bf095baca244 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -886,7 +886,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
/*
* The length of the ID shouldn't be assumed by software since
* it may change in the future. The allocation size is limited
- * to 1 << (PAGE_SHIFT + MAX_ORDER - 1) by the page allocator.
+ * to 1 << (PAGE_SHIFT + MAX_ORDER) by the page allocator.
* If the allocation fails, simply return ENOMEM rather than
* warning in the kernel log.
*/
diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c
index 09586a837b1e..3df7a256e919 100644
--- a/drivers/crypto/hisilicon/sgl.c
+++ b/drivers/crypto/hisilicon/sgl.c
@@ -70,11 +70,11 @@ struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev,
HISI_ACC_SGL_ALIGN_SIZE);
/*
- * the pool may allocate a block of memory of size PAGE_SIZE * 2^(MAX_ORDER - 1),
+ * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_ORDER,
* block size may exceed 2^31 on ia64, so the max of block size is 2^31
*/
- block_size = 1 << (PAGE_SHIFT + MAX_ORDER <= 32 ?
- PAGE_SHIFT + MAX_ORDER - 1 : 31);
+ block_size = 1 << (PAGE_SHIFT + MAX_ORDER < 32 ?
+ PAGE_SHIFT + MAX_ORDER : 31);
sgl_num_per_block = block_size / sgl_size;
block_num = count / sgl_num_per_block;
remain_sgl = count % sgl_num_per_block;
diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
index e8bd10e60998..920db302a273 100644
--- a/drivers/dma-buf/heaps/system_heap.c
+++ b/drivers/dma-buf/heaps/system_heap.c
@@ -41,12 +41,11 @@ struct dma_heap_attachment {
bool mapped;
};
-#define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO | __GFP_COMP)
-#define MID_ORDER_GFP (LOW_ORDER_GFP | __GFP_NOWARN)
+#define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO)
#define HIGH_ORDER_GFP (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \
| __GFP_NORETRY) & ~__GFP_RECLAIM) \
| __GFP_COMP)
-static gfp_t order_flags[] = {HIGH_ORDER_GFP, MID_ORDER_GFP, LOW_ORDER_GFP};
+static gfp_t order_flags[] = {HIGH_ORDER_GFP, HIGH_ORDER_GFP, LOW_ORDER_GFP};
/*
* The selection of the orders used for allocation (1MB, 64K, 4K) is designed
* to match with the sizes often found in IOMMUs. Using order 4 pages instead
diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
index defece0bcb81..99f39a5feca1 100644
--- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
+++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
@@ -115,7 +115,7 @@ static int get_huge_pages(struct drm_i915_gem_object *obj)
do {
struct page *page;
- GEM_BUG_ON(order >= MAX_ORDER);
+ GEM_BUG_ON(order > MAX_ORDER);
page = alloc_pages(GFP | __GFP_ZERO, order);
if (!page)
goto err;
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index ca7744b852f5..5df3edadb808 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -254,7 +254,7 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
* encryption bits. This is because the exact location of the
* data may not be known at mmap() time and may also change
* at arbitrary times while the data is mmap'ed.
- * See vmf_insert_mixed_prot() for a discussion.
+ * See vmf_insert_pfn_prot() for a discussion.
*/
ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index aa116a7bbae3..6c8585abe08d 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644);
static atomic_long_t allocated_pages;
-static struct ttm_pool_type global_write_combined[MAX_ORDER];
-static struct ttm_pool_type global_uncached[MAX_ORDER];
+static struct ttm_pool_type global_write_combined[MAX_ORDER + 1];
+static struct ttm_pool_type global_uncached[MAX_ORDER + 1];
-static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER];
-static struct ttm_pool_type global_dma32_uncached[MAX_ORDER];
+static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1];
+static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1];
static spinlock_t shrinker_lock;
static struct list_head shrinker_list;
@@ -405,7 +405,7 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
else
gfp_flags |= GFP_HIGHUSER;
- for (order = min_t(unsigned int, MAX_ORDER - 1, __fls(num_pages));
+ for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages));
num_pages;
order = min_t(unsigned int, order, __fls(num_pages))) {
struct ttm_pool_type *pt;
@@ -542,7 +542,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
if (use_dma_alloc) {
for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
- for (j = 0; j < MAX_ORDER; ++j)
+ for (j = 0; j <= MAX_ORDER; ++j)
ttm_pool_type_init(&pool->caching[i].orders[j],
pool, i, j);
}
@@ -562,7 +562,7 @@ void ttm_pool_fini(struct ttm_pool *pool)
if (pool->use_dma_alloc) {
for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
- for (j = 0; j < MAX_ORDER; ++j)
+ for (j = 0; j <= MAX_ORDER; ++j)
ttm_pool_type_fini(&pool->caching[i].orders[j]);
}
@@ -616,7 +616,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m)
unsigned int i;
seq_puts(m, "\t ");
- for (i = 0; i < MAX_ORDER; ++i)
+ for (i = 0; i <= MAX_ORDER; ++i)
seq_printf(m, " ---%2u---", i);
seq_puts(m, "\n");
}
@@ -627,7 +627,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt,
{
unsigned int i;
- for (i = 0; i < MAX_ORDER; ++i)
+ for (i = 0; i <= MAX_ORDER; ++i)
seq_printf(m, " %8u", ttm_pool_type_count(&pt[i]));
seq_puts(m, "\n");
}
@@ -736,7 +736,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
spin_lock_init(&shrinker_lock);
INIT_LIST_HEAD(&shrinker_list);
- for (i = 0; i < MAX_ORDER; ++i) {
+ for (i = 0; i <= MAX_ORDER; ++i) {
ttm_pool_type_init(&global_write_combined[i], NULL,
ttm_write_combined, i);
ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i);
@@ -769,7 +769,7 @@ void ttm_pool_mgr_fini(void)
{
unsigned int i;
- for (i = 0; i < MAX_ORDER; ++i) {
+ for (i = 0; i <= MAX_ORDER; ++i) {
ttm_pool_type_fini(&global_write_combined[i]);
ttm_pool_type_fini(&global_uncached[i]);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 8d772ea8a583..b574c58a3487 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -182,7 +182,7 @@
#ifdef CONFIG_CMA_ALIGNMENT
#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + CONFIG_CMA_ALIGNMENT)
#else
-#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER - 1)
+#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER)
#endif
/*
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 99b2646cb5c7..7a9f0b0bddbd 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -736,7 +736,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
struct page **pages;
unsigned int i = 0, nid = dev_to_node(dev);
- order_mask &= (2U << MAX_ORDER) - 1;
+ order_mask &= GENMASK(MAX_ORDER, 0);
if (!order_mask)
return NULL;
@@ -756,7 +756,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
* than a necessity, hence using __GFP_NORETRY until
* falling back to minimum-order allocations.
*/
- for (order_mask &= (2U << __fls(count)) - 1;
+ for (order_mask &= GENMASK(__fls(count), 0);
order_mask; order_mask &= ~order_size) {
unsigned int order = __fls(order_mask);
gfp_t alloc_flags = gfp;
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 586271b8aa39..85790b870877 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2440,8 +2440,8 @@ static bool its_parse_indirect_baser(struct its_node *its,
* feature is not supported by hardware.
*/
new_order = max_t(u32, get_order(esz << ids), new_order);
- if (new_order >= MAX_ORDER) {
- new_order = MAX_ORDER - 1;
+ if (new_order > MAX_ORDER) {
+ new_order = MAX_ORDER;
ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz);
pr_warn("ITS@%pa: %s Table too large, reduce ids %llu->%u\n",
&its->phys_base, its_base_type_string[type],
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index cf077f9b30c3..733053c2eaa0 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -408,7 +408,7 @@ static void __cache_size_refresh(void)
* If the allocation may fail we use __get_free_pages. Memory fragmentation
* won't have a fatal effect here, but it just causes flushes of some other
* buffers and more I/O will be performed. Don't use __get_free_pages if it
- * always fails (i.e. order >= MAX_ORDER).
+ * always fails (i.e. order > MAX_ORDER).
*
* If the allocation shouldn't fail we use __vmalloc. This is only for the
* initial reserve allocation, so there's no risk of wasting all vmalloc
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index acffed750e3e..9e0c69958587 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
* Replacement block manager (new_bm) is created and old_bm destroyed outside of
* cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
* shrinker associated with the block manager's bufio client vs cmd root_lock).
- * - must take shrinker_rwsem without holding cmd->root_lock
+ * - must take shrinker_mutex without holding cmd->root_lock
*/
new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
CACHE_MAX_CONCURRENT_LOCKS);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index fd464fb024c3..9f5cb52c5763 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1887,7 +1887,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
* Replacement block manager (new_bm) is created and old_bm destroyed outside of
* pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
* shrinker associated with the block manager's bufio client vs pmd root_lock).
- * - must take shrinker_rwsem without holding pmd->root_lock
+ * - must take shrinker_mutex without holding pmd->root_lock
*/
new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
THIN_MAX_CONCURRENT_LOCKS);
diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c
index f778e11237a6..1c798d6b2dfb 100644
--- a/drivers/misc/genwqe/card_utils.c
+++ b/drivers/misc/genwqe/card_utils.c
@@ -210,7 +210,7 @@ u32 genwqe_crc32(u8 *buff, size_t len, u32 init)
void *__genwqe_alloc_consistent(struct genwqe_dev *cd, size_t size,
dma_addr_t *dma_handle)
{
- if (get_order(size) >= MAX_ORDER)
+ if (get_order(size) > MAX_ORDER)
return NULL;
return dma_alloc_coherent(&cd->pci_dev->dev, size, dma_handle,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 25be7f8ac7cd..3973ca6adf4c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1041,7 +1041,7 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring)
return;
order = get_order(alloc_size);
- if (order >= MAX_ORDER) {
+ if (order > MAX_ORDER) {
if (net_ratelimit())
dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n");
return;
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index b35c9b6f913b..4e18b4cefa97 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -75,7 +75,7 @@
* pool for the 4MB. Thus the 16 Rx and Tx queues require 32 * 5 = 160
* plus 16 for the TSO pools for a total of 176 LTB mappings per VNIC.
*/
-#define IBMVNIC_ONE_LTB_MAX ((u32)((1 << (MAX_ORDER - 1)) * PAGE_SIZE))
+#define IBMVNIC_ONE_LTB_MAX ((u32)((1 << MAX_ORDER) * PAGE_SIZE))
#define IBMVNIC_ONE_LTB_SIZE min((u32)(8 << 20), IBMVNIC_ONE_LTB_MAX)
#define IBMVNIC_LTB_SET_SIZE (38 << 20)
diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
index ec3f6cf05f8c..34781dec3856 100644
--- a/drivers/video/fbdev/hyperv_fb.c
+++ b/drivers/video/fbdev/hyperv_fb.c
@@ -946,7 +946,7 @@ static phys_addr_t hvfb_get_phymem(struct hv_device *hdev,
if (request_size == 0)
return -1;
- if (order < MAX_ORDER) {
+ if (order <= MAX_ORDER) {
/* Call alloc_pages if the size is less than 2^MAX_ORDER */
page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
if (!page)
@@ -977,7 +977,7 @@ static void hvfb_release_phymem(struct hv_device *hdev,
{
unsigned int order = get_order(size);
- if (order < MAX_ORDER)
+ if (order <= MAX_ORDER)
__free_pages(pfn_to_page(paddr >> PAGE_SHIFT), order);
else
dma_free_coherent(&hdev->device,
diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c
index 0374ee6b6d03..32e74e02a02f 100644
--- a/drivers/video/fbdev/vermilion/vermilion.c
+++ b/drivers/video/fbdev/vermilion/vermilion.c
@@ -197,7 +197,7 @@ static int vmlfb_alloc_vram(struct vml_info *vinfo,
va = &vinfo->vram[i];
order = 0;
- while (requested > (PAGE_SIZE << order) && order < MAX_ORDER)
+ while (requested > (PAGE_SIZE << order) && order <= MAX_ORDER)
order++;
err = vmlfb_alloc_vram_area(va, order, 0);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 3f78a3a1eb75..5b15936a5214 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -33,7 +33,7 @@
#define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \
__GFP_NOMEMALLOC)
/* The order of free page blocks to report to host */
-#define VIRTIO_BALLOON_HINT_BLOCK_ORDER (MAX_ORDER - 1)
+#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER
/* The size of a free page block in bytes */
#define VIRTIO_BALLOON_HINT_BLOCK_BYTES \
(1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT))
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 0c2892ec6817..835f6cc2fb66 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1120,13 +1120,13 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn,
*/
static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
{
- unsigned long order = MAX_ORDER - 1;
+ unsigned long order = MAX_ORDER;
unsigned long i;
/*
* We might get called for ranges that don't cover properly aligned
- * MAX_ORDER - 1 pages; however, we can only online properly aligned
- * pages with an order of MAX_ORDER - 1 at maximum.
+ * MAX_ORDER pages; however, we can only online properly aligned
+ * pages with an order of MAX_ORDER at maximum.
*/
while (!IS_ALIGNED(pfn | nr_pages, 1 << order))
order--;
@@ -1237,9 +1237,9 @@ static void virtio_mem_online_page(struct virtio_mem *vm,
bool do_online;
/*
- * We can get called with any order up to MAX_ORDER - 1. If our
- * subblock size is smaller than that and we have a mixture of plugged
- * and unplugged subblocks within such a page, we have to process in
+ * We can get called with any order up to MAX_ORDER. If our subblock
+ * size is smaller than that and we have a mixture of plugged and
+ * unplugged subblocks within such a page, we have to process in
* smaller granularity. In that case we'll adjust the order exactly once
* within the loop.
*/
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 82690d1dd49a..f92b9e62d567 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -319,16 +319,16 @@ expand:
struct folio *folio;
folio = filemap_get_folio(mapping, i);
- if (!folio) {
+ if (IS_ERR(folio)) {
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_stat_v(dvnode, n_inval);
-
- ret = -ENOMEM;
folio = __filemap_get_folio(mapping,
i, FGP_LOCK | FGP_CREAT,
mapping->gfp_mask);
- if (!folio)
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto error;
+ }
folio_attach_private(folio, (void *)1);
folio_unlock(folio);
}
@@ -524,7 +524,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
*/
folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE,
FGP_ACCESSED, 0);
- if (!folio) {
+ if (IS_ERR(folio)) {
ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
break;
}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 0ab7752d1b75..f0eddccbdd95 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -115,7 +115,7 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
mapping->gfp_mask);
- if (!folio)
+ if (IS_ERR(folio))
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
else if (folio && !folio_test_private(folio))
folio_attach_private(folio, (void *)1);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 68d6d5dc608d..719b31374879 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -569,20 +569,10 @@ static void afs_vm_close(struct vm_area_struct *vma)
static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
- struct afs_file *af = vmf->vma->vm_file->private_data;
- switch (afs_validate(vnode, af->key)) {
- case 0:
+ if (afs_pagecache_valid(vnode))
return filemap_map_pages(vmf, start_pgoff, end_pgoff);
- case -ENOMEM:
- return VM_FAULT_OOM;
- case -EINTR:
- case -ERESTARTSYS:
- return VM_FAULT_RETRY;
- case -ESTALE:
- default:
- return VM_FAULT_SIGBUS;
- }
+ return 0;
}
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0167e96e5198..b1bdffd5e888 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -668,6 +668,24 @@ bool afs_check_validity(struct afs_vnode *vnode)
}
/*
+ * Returns true if the pagecache is still valid. Does not sleep.
+ */
+bool afs_pagecache_valid(struct afs_vnode *vnode)
+{
+ if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
+ if (vnode->netfs.inode.i_nlink)
+ clear_nlink(&vnode->netfs.inode);
+ return true;
+ }
+
+ if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
+ afs_check_validity(vnode))
+ return true;
+
+ return false;
+}
+
+/*
* validate a vnode/inode
* - there are several things we need to check
* - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
@@ -684,14 +702,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
vnode->fid.vid, vnode->fid.vnode, vnode->flags,
key_serial(key));
- if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->netfs.inode.i_nlink)
- clear_nlink(&vnode->netfs.inode);
- goto valid;
- }
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
- afs_check_validity(vnode))
+ if (afs_pagecache_valid(vnode))
goto valid;
down_write(&vnode->validate_lock);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index ad8523d0d038..5c95df6621f9 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1171,6 +1171,7 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
extern bool afs_check_validity(struct afs_vnode *);
extern int afs_validate(struct afs_vnode *, struct key *);
+bool afs_pagecache_valid(struct afs_vnode *);
extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
struct kstat *, u32, unsigned int);
extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 571f3b9a417e..c822d6006033 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -232,7 +232,7 @@ static void afs_kill_pages(struct address_space *mapping,
_debug("kill %lx (to %lx)", index, last);
folio = filemap_get_folio(mapping, index);
- if (!folio) {
+ if (IS_ERR(folio)) {
next = index + 1;
continue;
}
@@ -270,7 +270,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
_debug("redirty %llx @%llx", len, start);
folio = filemap_get_folio(mapping, index);
- if (!folio) {
+ if (IS_ERR(folio)) {
next = index + 1;
continue;
}
diff --git a/fs/exec.c b/fs/exec.c
index 7c44d0c65b1b..87cf3a2f0e9a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1034,7 +1034,7 @@ static int exec_mmap(struct mm_struct *mm)
mmput(old_mm);
return 0;
}
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
return 0;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bf0b7dea4900..d7973743417b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5395,7 +5395,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
while (1) {
struct folio *folio = filemap_lock_folio(inode->i_mapping,
inode->i_size >> PAGE_SHIFT);
- if (!folio)
+ if (IS_ERR(folio))
return;
ret = __ext4_journalled_invalidate_folio(folio, offset,
folio_size(folio) - offset);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2de9829aed63..7bf6d069199c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -141,18 +141,18 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
flags = memalloc_nofs_save();
folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags,
mapping_gfp_mask(mapping[0]));
- if (!folio[0]) {
+ if (IS_ERR(folio[0])) {
memalloc_nofs_restore(flags);
- return -ENOMEM;
+ return PTR_ERR(folio[0]);
}
folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags,
mapping_gfp_mask(mapping[1]));
memalloc_nofs_restore(flags);
- if (!folio[1]) {
+ if (IS_ERR(folio[1])) {
folio_unlock(folio[0]);
folio_put(folio[0]);
- return -ENOMEM;
+ return PTR_ERR(folio[1]);
}
/*
* __filemap_get_folio() may not wait on folio's writeback if
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9062da6da567..702d79639c0d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -697,7 +697,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
struct folio *folio;
folio = filemap_lock_folio(mapping, idx);
- if (!folio)
+ if (IS_ERR(folio))
return;
start = start & ~huge_page_mask(h);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 6f4c97a6d7e9..96bb56c203f4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -468,19 +468,12 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
{
unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
- struct folio *folio;
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
- folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+ return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
- if (folio)
- return folio;
-
- if (iter->flags & IOMAP_NOWAIT)
- return ERR_PTR(-EAGAIN);
- return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(iomap_get_folio);
@@ -911,7 +904,7 @@ static int iomap_write_delalloc_scan(struct inode *inode,
/* grab locked page */
folio = filemap_lock_folio(inode->i_mapping,
start_byte >> PAGE_SHIFT);
- if (!folio) {
+ if (IS_ERR(folio)) {
start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
PAGE_SIZE;
continue;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 2e8461ce74de..961569c11159 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -691,6 +691,35 @@ void grab_metapage(struct metapage * mp)
unlock_page(mp->page);
}
+static int metapage_write_one(struct page *page)
+{
+ struct folio *folio = page_folio(page);
+ struct address_space *mapping = folio->mapping;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = folio_nr_pages(folio),
+ };
+ int ret = 0;
+
+ BUG_ON(!folio_test_locked(folio));
+
+ folio_wait_writeback(folio);
+
+ if (folio_clear_dirty_for_io(folio)) {
+ folio_get(folio);
+ ret = metapage_writepage(page, &wbc);
+ if (ret == 0)
+ folio_wait_writeback(folio);
+ folio_put(folio);
+ } else {
+ folio_unlock(folio);
+ }
+
+ if (!ret)
+ ret = filemap_check_errors(mapping);
+ return ret;
+}
+
void force_metapage(struct metapage *mp)
{
struct page *page = mp->page;
@@ -700,8 +729,8 @@ void force_metapage(struct metapage *mp)
get_page(page);
lock_page(page);
set_page_dirty(page);
- if (write_one_page(page))
- jfs_error(mp->sb, "write_one_page() failed\n");
+ if (metapage_write_one(page))
+ jfs_error(mp->sb, "metapage_write_one() failed\n");
clear_bit(META_forcewrite, &mp->flag);
put_page(page);
}
@@ -746,9 +775,9 @@ void release_metapage(struct metapage * mp)
set_page_dirty(page);
if (test_bit(META_sync, &mp->flag)) {
clear_bit(META_sync, &mp->flag);
- if (write_one_page(page))
- jfs_error(mp->sb, "write_one_page() failed\n");
- lock_page(page); /* write_one_page unlocks the page */
+ if (metapage_write_one(page))
+ jfs_error(mp->sb, "metapage_write_one() failed\n");
+ lock_page(page);
}
} else if (mp->lsn) /* discard_metapage doesn't remove it */
remove_from_logsync(mp);
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 7679a68e8193..209726a9cfdb 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -350,8 +350,8 @@ int netfs_write_begin(struct netfs_inode *ctx,
retry:
folio = __filemap_get_folio(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (ctx->ops->check_write_begin) {
/* Allow the netfs (eg. ceph) to flush conflicts. */
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 893625eacab9..1d03406e6c03 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -336,8 +336,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
start:
folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT);
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
*pagep = &folio->page;
ret = nfs_flush_incompatible(file, folio);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 41ccd43cd979..5cf30827f244 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -259,10 +259,10 @@ repeat:
NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state");
dfolio = filemap_grab_folio(dmap, folio->index);
- if (unlikely(!dfolio)) {
+ if (unlikely(IS_ERR(dfolio))) {
/* No empty page is added to the page cache */
- err = -ENOMEM;
folio_unlock(folio);
+ err = PTR_ERR(dfolio);
break;
}
if (unlikely(!folio_buffers(folio)))
@@ -311,7 +311,7 @@ repeat:
folio_lock(folio);
dfolio = filemap_lock_folio(dmap, index);
- if (dfolio) {
+ if (!IS_ERR(dfolio)) {
/* overwrite existing folio in the destination cache */
WARN_ON(folio_test_dirty(dfolio));
nilfs_copy_page(&dfolio->page, &folio->page, 0);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5a656dc683f1..564ab48d03ef 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2952,10 +2952,11 @@ retry:
*/
if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) {
if (PageDirty(page)) {
- /*
- * write_on_page will unlock the page on return
- */
- ret = write_one_page(page);
+ unlock_page(page);
+ put_page(page);
+
+ ret = filemap_write_and_wait_range(mapping,
+ offset, map_end - 1);
goto retry;
}
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 71157ee35c1a..25b44b303b35 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -24,7 +24,7 @@
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
@@ -307,10 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
*i = ALIGN(*i + descsz, 4);
}
-static ssize_t
-read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- char *buf = file->private_data;
+ loff_t *fpos = &iocb->ki_pos;
size_t phdrs_offset, notes_offset, data_offset;
size_t page_offline_frozen = 1;
size_t phdrs_len, notes_len;
@@ -318,6 +317,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
size_t tsz;
int nphdr;
unsigned long start;
+ size_t buflen = iov_iter_count(iter);
size_t orig_buflen = buflen;
int ret = 0;
@@ -356,12 +356,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
};
tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
- if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+ if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -398,15 +397,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
- if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
- tsz)) {
+ if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
+ iter) != tsz) {
kfree(phdrs);
ret = -EFAULT;
goto out;
}
kfree(phdrs);
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -448,14 +446,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
min(vmcoreinfo_size, notes_len - i));
tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
- if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+ if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
kfree(notes);
ret = -EFAULT;
goto out;
}
kfree(notes);
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -497,7 +494,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
if (!m) {
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -506,16 +503,33 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
switch (m->type) {
case KCORE_VMALLOC:
- vread(buf, (char *)start, tsz);
- /* we have to zero-fill user buffer even if no read */
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
+ {
+ const char *src = (char *)start;
+ size_t read = 0, left = tsz;
+
+ /*
+ * vmalloc uses spinlocks, so we optimistically try to
+ * read memory. If this fails, fault pages in and try
+ * again until we are done.
+ */
+ while (true) {
+ read += vread_iter(iter, src, left);
+ if (read == tsz)
+ break;
+
+ src += read;
+ left -= read;
+
+ if (fault_in_iov_iter_writeable(iter, left)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
break;
+ }
case KCORE_USER:
/* User page is handled prior to normal kernel page: */
- if (copy_to_user(buffer, (char *)start, tsz)) {
+ if (copy_to_iter((char *)start, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -531,7 +545,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
*/
if (!page || PageOffline(page) ||
is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -541,24 +555,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
case KCORE_VMEMMAP:
case KCORE_TEXT:
/*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
+ * We use _copy_to_iter() to bypass usermode hardening
+ * which would otherwise prevent this operation.
*/
- if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else {
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
- }
+ if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
}
break;
default:
pr_warn_once("Unhandled KCORE type: %d\n", m->type);
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -566,7 +573,6 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
skip:
buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
start += tsz;
tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
}
@@ -589,10 +595,6 @@ static int open_kcore(struct inode *inode, struct file *filp)
if (ret)
return ret;
- filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!filp->private_data)
- return -ENOMEM;
-
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
@@ -603,16 +605,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
return 0;
}
-static int release_kcore(struct inode *inode, struct file *file)
-{
- kfree(file->private_data);
- return 0;
-}
-
static const struct proc_ops kcore_proc_ops = {
- .proc_read = read_kcore,
+ .proc_read_iter = read_kcore_iter,
.proc_open = open_kcore,
- .proc_release = release_kcore,
.proc_lseek = default_llseek,
};
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 440960110a42..b43d0bd42762 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -6,6 +6,7 @@
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmzone.h>
+#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/percpu.h>
#include <linux/seq_file.h>
@@ -131,6 +132,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "VmallocChunk: ", 0ul);
show_val_kb(m, "Percpu: ", pcpu_nr_pages());
+#ifdef CONFIG_MEMTEST
+ if (early_memtest_done) {
+ unsigned long early_memtest_bad_size_kb;
+
+ early_memtest_bad_size_kb = early_memtest_bad_size>>10;
+ if (early_memtest_bad_size && !early_memtest_bad_size_kb)
+ early_memtest_bad_size_kb = 1;
+ /* When 0 is reported, it means there actually was a successful test */
+ seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb);
+ }
+#endif
+
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2f67516bb9bf..9fbb9b5256f7 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
/* make various checks */
order = get_order(newsize);
- if (unlikely(order >= MAX_ORDER))
+ if (unlikely(order > MAX_ORDER))
return -EFBIG;
ret = inode_newsize_ok(inode, newsize);
diff --git a/fs/super.c b/fs/super.c
index 04bc62ab7dfe..34afe411cf2b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
+ * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
* take a passive reference to the superblock to avoid this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 391efaf1d528..379d75796a5c 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -42,11 +42,10 @@ static inline int ufs_match(struct super_block *sb, int len,
return !memcmp(name, de->d_name, len);
}
-static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
+static void ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
{
struct address_space *mapping = page->mapping;
struct inode *dir = mapping->host;
- int err = 0;
inode_inc_iversion(dir);
block_write_end(NULL, mapping, pos, len, len, page, NULL);
@@ -54,10 +53,16 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
}
- if (IS_DIRSYNC(dir))
- err = write_one_page(page);
- else
- unlock_page(page);
+ unlock_page(page);
+}
+
+static int ufs_handle_dirsync(struct inode *dir)
+{
+ int err;
+
+ err = filemap_write_and_wait(dir->i_mapping);
+ if (!err)
+ err = sync_inode_metadata(dir, 1);
return err;
}
@@ -99,11 +104,12 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
ufs_set_de_type(dir->i_sb, de, inode->i_mode);
- err = ufs_commit_chunk(page, pos, len);
+ ufs_commit_chunk(page, pos, len);
ufs_put_page(page);
if (update_times)
dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
+ ufs_handle_dirsync(dir);
}
@@ -390,10 +396,11 @@ got_it:
de->d_ino = cpu_to_fs32(sb, inode->i_ino);
ufs_set_de_type(sb, de, inode->i_mode);
- err = ufs_commit_chunk(page, pos, rec_len);
+ ufs_commit_chunk(page, pos, rec_len);
dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
+ err = ufs_handle_dirsync(dir);
/* OFFSET_CACHE */
out_put:
ufs_put_page(page);
@@ -531,9 +538,10 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
if (pde)
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
- err = ufs_commit_chunk(page, pos, to - from);
+ ufs_commit_chunk(page, pos, to - from);
inode->i_ctime = inode->i_mtime = current_time(inode);
mark_inode_dirty(inode);
+ err = ufs_handle_dirsync(inode);
out:
ufs_put_page(page);
UFSD("EXIT\n");
@@ -579,7 +587,8 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
strcpy (de->d_name, "..");
kunmap(page);
- err = ufs_commit_chunk(page, 0, chunk_size);
+ ufs_commit_chunk(page, 0, chunk_size);
+ err = ufs_handle_dirsync(inode);
fail:
put_page(page);
return err;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 40f9e1a2ebdd..3b2a41c330e6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+/*
+ * Whether WP_UNPOPULATED is enabled on the uffd context. It is only
+ * meaningful when userfaultfd_wp()==true on the vma and when it's
+ * anonymous.
+ */
+bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+ struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+ if (!ctx)
+ return false;
+
+ return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
+}
+
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
vm_flags_t flags)
{
@@ -1629,7 +1644,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
- uffd_wp_range(mm, vma, start, vma_end - start, false);
+ uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
@@ -1714,6 +1729,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
struct uffdio_copy uffdio_copy;
struct uffdio_copy __user *user_uffdio_copy;
struct userfaultfd_wake_range range;
+ uffd_flags_t flags = 0;
user_uffdio_copy = (struct uffdio_copy __user *) arg;
@@ -1740,10 +1756,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
+ if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing,
- uffdio_copy.mode);
+ ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len, &ctx->mmap_changing,
+ flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1793,9 +1811,9 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len,
+ &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1875,6 +1893,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
struct uffdio_continue uffdio_continue;
struct uffdio_continue __user *user_uffdio_continue;
struct userfaultfd_wake_range range;
+ uffd_flags_t flags = 0;
user_uffdio_continue = (struct uffdio_continue __user *)arg;
@@ -1899,13 +1918,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
uffdio_continue.range.start) {
goto out;
}
- if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+ if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
+ UFFDIO_CONTINUE_MODE_WP))
goto out;
+ if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
- uffdio_continue.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
+ uffdio_continue.range.len,
+ &ctx->mmap_changing, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1973,6 +1995,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 705250f9f90a..528fc538b6b9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1388,25 +1388,10 @@ xfs_filemap_pfn_mkwrite(
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
-static vm_fault_t
-xfs_filemap_map_pages(
- struct vm_fault *vmf,
- pgoff_t start_pgoff,
- pgoff_t end_pgoff)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- vm_fault_t ret;
-
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- return ret;
-}
-
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
.huge_fault = xfs_filemap_huge_fault,
- .map_pages = xfs_filemap_map_pages,
+ .map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 977bea16cf1b..a7cf825befae 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -123,11 +123,11 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
- page = alloc_pages(gfp, 0);
+ page = alloc_page(gfp);
if (!page)
return NULL;
if (!pgtable_pmd_page_ctor(page)) {
- __free_pages(page, 0);
+ __free_page(page);
return NULL;
}
return (pmd_t *)page_address(page);
diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
index ef09b23d29e3..8ce14f9d202a 100644
--- a/include/drm/ttm/ttm_pool.h
+++ b/include/drm/ttm/ttm_pool.h
@@ -72,7 +72,7 @@ struct ttm_pool {
bool use_dma32;
struct {
- struct ttm_pool_type orders[MAX_ORDER];
+ struct ttm_pool_type orders[MAX_ORDER + 1];
} caching[TTM_NUM_CACHING_TYPES];
};
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 65a78773dcca..ed8cb537c6a7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -319,7 +319,7 @@ extern void page_frag_free(void *addr);
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)
-void page_alloc_init(void);
+void page_alloc_init_cpuhp(void);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
@@ -361,9 +361,4 @@ extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
#endif
void free_contig_range(unsigned long pfn, unsigned long nr_pages);
-#ifdef CONFIG_CMA
-/* CMA stuff */
-extern void init_cma_reserved_pageblock(struct page *page);
-#endif
-
#endif /* __LINUX_GFP_H */
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 5088637fe5c2..6583a58670c5 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -47,16 +47,14 @@ typedef unsigned int __bitwise gfp_t;
#define ___GFP_ACCOUNT 0x400000u
#define ___GFP_ZEROTAGS 0x800000u
#ifdef CONFIG_KASAN_HW_TAGS
-#define ___GFP_SKIP_ZERO 0x1000000u
-#define ___GFP_SKIP_KASAN_UNPOISON 0x2000000u
-#define ___GFP_SKIP_KASAN_POISON 0x4000000u
+#define ___GFP_SKIP_ZERO 0x1000000u
+#define ___GFP_SKIP_KASAN 0x2000000u
#else
-#define ___GFP_SKIP_ZERO 0
-#define ___GFP_SKIP_KASAN_UNPOISON 0
-#define ___GFP_SKIP_KASAN_POISON 0
+#define ___GFP_SKIP_ZERO 0
+#define ___GFP_SKIP_KASAN 0
#endif
#ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP 0x8000000u
+#define ___GFP_NOLOCKDEP 0x4000000u
#else
#define ___GFP_NOLOCKDEP 0
#endif
@@ -234,25 +232,24 @@ typedef unsigned int __bitwise gfp_t;
* memory tags at the same time as zeroing memory has minimal additional
* performace impact.
*
- * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation.
- * Only effective in HW_TAGS mode.
- *
- * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation.
- * Typically, used for userspace pages. Only effective in HW_TAGS mode.
+ * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation.
+ * Used for userspace and vmalloc pages; the latter are unpoisoned by
+ * kasan_unpoison_vmalloc instead. For userspace pages, results in
+ * poisoning being skipped as well, see should_skip_kasan_poison for
+ * details. Only effective in HW_TAGS mode.
*/
#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN)
#define __GFP_COMP ((__force gfp_t)___GFP_COMP)
#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO)
#define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS)
#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
-#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
-#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
+#define __GFP_SKIP_KASAN ((__force gfp_t)___GFP_SKIP_KASAN)
/* Disable lockdep for GFP context tracking */
#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
/* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP))
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/**
@@ -335,8 +332,7 @@ typedef unsigned int __bitwise gfp_t;
#define GFP_DMA __GFP_DMA
#define GFP_DMA32 __GFP_DMA32
#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
-#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | \
- __GFP_SKIP_KASAN_POISON | __GFP_SKIP_KASAN_UNPOISON)
+#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | __GFP_SKIP_KASAN)
#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
__GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 8fc10089e19e..9c7cdaa3de8c 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -243,12 +243,10 @@ static inline void clear_highpage(struct page *page)
static inline void clear_highpage_kasan_tagged(struct page *page)
{
- u8 tag;
+ void *kaddr = kmap_local_page(page);
- tag = page_kasan_tag(page);
- page_kasan_tag_reset(page);
- clear_highpage(page);
- page_kasan_tag_set(page, tag);
+ clear_page(kasan_reset_tag(kaddr));
+ kunmap_local(kaddr);
}
#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 70bd867eba94..20284387b841 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,47 +39,12 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr, pgprot_t newprot,
unsigned long cp_flags);
-vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
- pgprot_t pgprot, bool write);
-/**
- * vmf_insert_pfn_pmd - insert a pmd size pfn
- * @vmf: Structure describing the fault
- * @pfn: pfn to insert
- * @pgprot: page protection to use
- * @write: whether it's a write fault
- *
- * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
- *
- * Return: vm_fault_t value.
- */
-static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn,
- bool write)
-{
- return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write);
-}
-vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
- pgprot_t pgprot, bool write);
-
-/**
- * vmf_insert_pfn_pud - insert a pud size pfn
- * @vmf: Structure describing the fault
- * @pfn: pfn to insert
- * @pgprot: page protection to use
- * @write: whether it's a write fault
- *
- * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
- *
- * Return: vm_fault_t value.
- */
-static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn,
- bool write)
-{
- return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write);
-}
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
enum transparent_hugepage_flag {
- TRANSPARENT_HUGEPAGE_NEVER_DAX,
+ TRANSPARENT_HUGEPAGE_UNSUPPORTED,
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7c977d234aba..2a758bcd6719 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -158,13 +158,12 @@ unsigned long hugetlb_total_pages(void);
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
#ifdef CONFIG_USERFAULTFD
-int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy);
+int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ uffd_flags_t flags,
+ struct page **pagep);
#endif /* CONFIG_USERFAULTFD */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
@@ -393,14 +392,12 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
}
#ifdef CONFIG_USERFAULTFD
-static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pte_t *dst_pte,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy)
+static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ uffd_flags_t flags,
+ struct page **pagep)
{
BUG();
return 0;
@@ -818,7 +815,7 @@ static inline unsigned huge_page_shift(struct hstate *h)
static inline bool hstate_is_gigantic(struct hstate *h)
{
- return huge_page_order(h) >= MAX_ORDER;
+ return huge_page_order(h) > MAX_ORDER;
}
static inline unsigned int pages_per_huge_page(const struct hstate *h)
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 09d4f17c8d3b..7376c1df9c90 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -69,7 +69,10 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping,
BUG_ON(offset >= mapping->size);
phys_addr = mapping->base + offset;
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ else
+ migrate_disable();
pagefault_disable();
return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot);
}
@@ -79,7 +82,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
{
kunmap_local_indexed((void __force *)vaddr);
pagefault_enable();
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ else
+ migrate_enable();
}
static inline void __iomem *
@@ -162,7 +168,10 @@ static inline void __iomem *
io_mapping_map_atomic_wc(struct io_mapping *mapping,
unsigned long offset)
{
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ else
+ migrate_disable();
pagefault_disable();
return io_mapping_map_wc(mapping, offset, PAGE_SIZE);
}
@@ -172,7 +181,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
{
io_mapping_unmap(vaddr);
pagefault_enable();
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ else
+ migrate_enable();
}
static inline void __iomem *
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 50ad19662a32..f82ee3fac1cd 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -597,6 +597,8 @@ extern int hashdist; /* Distribute hashes across NUMA nodes? */
#endif
#ifdef CONFIG_MEMTEST
+extern phys_addr_t early_memtest_bad_size; /* Size of faulty ram found by memtest */
+extern bool early_memtest_done; /* Was early memtest done? */
extern void early_memtest(phys_addr_t start, phys_addr_t end);
#else
static inline void early_memtest(phys_addr_t start, phys_addr_t end)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6eda2ab205d..aa69ea98e2d8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -97,6 +97,7 @@ struct shrinker_info {
struct rcu_head rcu;
atomic_long_t *nr_deferred;
unsigned long *map;
+ int map_nr_max;
};
struct lruvec_stats_percpu {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f79667824eb..e249208f8fbe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -29,6 +29,7 @@
#include <linux/pgtable.h>
#include <linux/kasan.h>
#include <linux/memremap.h>
+#include <linux/slab.h>
struct mempolicy;
struct anon_vma;
@@ -38,6 +39,7 @@ struct pt_regs;
extern int sysctl_page_lock_unfairness;
+void mm_core_init(void);
void init_mm_internals(void);
#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */
@@ -256,6 +258,8 @@ void setup_initial_init_mm(void *start_code, void *end_code,
struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
+/* Use only if VMA has no other users */
+void __vm_area_free(struct vm_area_struct *vma);
#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
@@ -478,7 +482,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{ FAULT_FLAG_USER, "USER" }, \
{ FAULT_FLAG_REMOTE, "REMOTE" }, \
{ FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \
- { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }
+ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \
+ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" }
/*
* vm_fault is filled by the pagefault handler and passed to the vma's
@@ -623,6 +628,131 @@ struct vm_operations_struct {
unsigned long addr);
};
+#ifdef CONFIG_NUMA_BALANCING
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+ vma->numab_state = NULL;
+}
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
+{
+ kfree(vma->numab_state);
+}
+#else
+static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
+static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Try to read-lock a vma. The function may occasionally yield a false "locked"
+ * result to avoid performance overhead, in which case we fall back to using
+ * mmap_lock. The function must never yield a false "unlocked" result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+ /* Check before locking. A race might cause false locked result. */
+ if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ return false;
+
+ if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+ return false;
+
+ /*
+ * Overflow might produce a false "locked" result.
+ * A false "unlocked" result is impossible because we modify and check
+ * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+ * modification invalidates all existing locks.
+ */
+ if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+ up_read(&vma->vm_lock->lock);
+ return false;
+ }
+ return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+ rcu_read_lock(); /* keeps vma alive till the end of up_read */
+ up_read(&vma->vm_lock->lock);
+ rcu_read_unlock();
+}
+
+static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+
+ /*
+ * The current task is holding mmap_write_lock, so neither vma->vm_lock_seq
+ * nor mm->mm_lock_seq can be concurrently modified.
+ */
+ *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+ return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ if (__is_vma_write_locked(vma, &mm_lock_seq))
+ return;
+
+ down_write(&vma->vm_lock->lock);
+ vma->vm_lock_seq = mm_lock_seq;
+ up_write(&vma->vm_lock->lock);
+}
+
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ if (__is_vma_write_locked(vma, &mm_lock_seq))
+ return true;
+
+ if (!down_write_trylock(&vma->vm_lock->lock))
+ return false;
+
+ vma->vm_lock_seq = mm_lock_seq;
+ up_write(&vma->vm_lock->lock);
+ return true;
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
+{
+ /* When detaching, the vma should be write-locked */
+ if (detached)
+ vma_assert_write_locked(vma);
+ vma->detached = detached;
+}
+
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address);
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+ { return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+ { return true; }
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma,
+ bool detached) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
static const struct vm_operations_struct dummy_vm_ops = {};
@@ -631,6 +761,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
+ vma_mark_detached(vma, false);
+ vma_numab_state_init(vma);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
@@ -644,28 +776,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
static inline void vm_flags_reset(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
vm_flags_init(vma, flags);
}
static inline void vm_flags_reset_once(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
}
static inline void vm_flags_set(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
ACCESS_PRIVATE(vma, __vm_flags) |= flags;
}
static inline void vm_flags_clear(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
}
@@ -686,7 +818,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma,
static inline void vm_flags_mod(struct vm_area_struct *vma,
vm_flags_t set, vm_flags_t clear)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
__vm_flags_mod(vma, set, clear);
}
@@ -1554,6 +1686,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
return last_time << PAGE_ACCESS_TIME_BUCKETS;
}
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+ unsigned int pid_bit;
+
+ pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+ if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
+ __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+ }
+}
#else /* !CONFIG_NUMA_BALANCING */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
@@ -1603,6 +1745,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
return false;
}
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -2636,12 +2782,6 @@ static inline bool ptlock_init(struct page *page) { return true; }
static inline void ptlock_free(struct page *page) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */
-static inline void pgtable_init(void)
-{
- ptlock_cache_init();
- pgtable_cache_init();
-}
-
static inline bool pgtable_pte_page_ctor(struct page *page)
{
if (!ptlock_init(page))
@@ -2785,7 +2925,6 @@ extern unsigned long free_reserved_area(void *start, void *end,
int poison, const char *s);
extern void adjust_managed_page_count(struct page *page, long count);
-extern void mem_init_print_info(void);
extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
@@ -3185,8 +3324,6 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn);
-vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
@@ -3256,7 +3393,6 @@ extern int apply_to_existing_page_range(struct mm_struct *mm,
unsigned long address, unsigned long size,
pte_fn_t fn, void *data);
-extern void __init init_mem_debugging_and_hardening(void);
#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index de1e622dd366..0e1d239a882c 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -557,6 +557,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
/* The current status of the pte should be "cleared" before calling */
WARN_ON_ONCE(!pte_none(*pte));
+ /*
+ * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole
+ * thing, because when zapping either it means it's dropping the
+ * page, or in TTU where the present pte will be quickly replaced
+ * with a swap pte. There's no way of leaking the bit.
+ */
if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
return;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a57e6ae78e65..3fc9e680f174 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -471,6 +471,16 @@ struct anon_vma_name {
char name[];
};
+struct vma_lock {
+ struct rw_semaphore lock;
+};
+
+struct vma_numab_state {
+ unsigned long next_scan;
+ unsigned long next_pid_reset;
+ unsigned long access_pids[2];
+};
+
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -480,17 +490,19 @@ struct anon_vma_name {
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
- unsigned long vm_start; /* Our start address within vm_mm. */
- unsigned long vm_end; /* The first byte after our end address
- within vm_mm. */
+ union {
+ struct {
+ /* VMA covers [vm_start; vm_end) addresses within mm */
+ unsigned long vm_start;
+ unsigned long vm_end;
+ };
+#ifdef CONFIG_PER_VMA_LOCK
+ struct rcu_head vm_rcu; /* Used for deferred freeing. */
+#endif
+ };
struct mm_struct *vm_mm; /* The address space we belong to. */
-
- /*
- * Access permissions of this VMA.
- * See vmf_insert_mixed_prot() for discussion.
- */
- pgprot_t vm_page_prot;
+ pgprot_t vm_page_prot; /* Access permissions of this VMA. */
/*
* Flags, see mm.h.
@@ -501,6 +513,14 @@ struct vm_area_struct {
vm_flags_t __private __vm_flags;
};
+#ifdef CONFIG_PER_VMA_LOCK
+ int vm_lock_seq;
+ struct vma_lock *vm_lock;
+
+ /* Flag to indicate areas detached from the mm->mm_mt tree */
+ bool detached;
+#endif
+
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
@@ -547,6 +567,9 @@ struct vm_area_struct {
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ struct vma_numab_state *numab_state; /* NUMA Balancing state */
+#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
@@ -637,6 +660,9 @@ struct mm_struct {
* init_mm.mmlist, and are protected
* by mmlist_lock
*/
+#ifdef CONFIG_PER_VMA_LOCK
+ int mm_lock_seq;
+#endif
unsigned long hiwater_rss; /* High-watermark of RSS usage */
@@ -1037,6 +1063,7 @@ typedef struct {
* mapped after the fault.
* @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
* We should only access orig_pte if this flag set.
+ * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
*
* About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
* whether we would allow page faults to retry by specifying these two
@@ -1074,6 +1101,7 @@ enum fault_flag {
FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
FAULT_FLAG_UNSHARE = 1 << 10,
FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
+ FAULT_FLAG_VMA_LOCK = 1 << 12,
};
typedef unsigned int __bitwise zap_flags_t;
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 96e113e23d04..aab8f1b28d26 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -60,6 +60,29 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
#endif /* CONFIG_TRACING */
+static inline void mmap_assert_locked(struct mm_struct *mm)
+{
+ lockdep_assert_held(&mm->mmap_lock);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
+static inline void mmap_assert_write_locked(struct mm_struct *mm)
+{
+ lockdep_assert_held_write(&mm->mmap_lock);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+ mmap_assert_write_locked(mm);
+ /* No races during update due to exclusive mmap_lock being held */
+ WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_end_write_all(struct mm_struct *mm) {}
+#endif
+
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
@@ -102,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
static inline void mmap_write_unlock(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, true);
+ vma_end_write_all(mm);
up_write(&mm->mmap_lock);
}
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
__mmap_lock_trace_acquire_returned(mm, false, true);
+ vma_end_write_all(mm);
downgrade_write(&mm->mmap_lock);
}
@@ -150,18 +175,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
up_read_non_owner(&mm->mmap_lock);
}
-static inline void mmap_assert_locked(struct mm_struct *mm)
-{
- lockdep_assert_held(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
-{
- lockdep_assert_held_write(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
return rwsem_is_contended(&mm->mmap_lock);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9fb1b03b83b2..2d22e47dc1eb 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -26,11 +26,13 @@
/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
-#define MAX_ORDER 11
+#define MAX_ORDER 10
#else
#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
-#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
+#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
+
+#define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
/*
* PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
@@ -93,7 +95,7 @@ static inline bool migratetype_is_mergeable(int mt)
}
#define for_each_migratetype_order(order, type) \
- for (order = 0; order < MAX_ORDER; order++) \
+ for (order = 0; order <= MAX_ORDER; order++) \
for (type = 0; type < MIGRATE_TYPES; type++)
extern int page_group_by_mobility_disabled;
@@ -108,13 +110,6 @@ struct free_area {
unsigned long nr_free;
};
-static inline struct page *get_page_from_free_area(struct free_area *area,
- int migratetype)
-{
- return list_first_entry_or_null(&area->free_list[migratetype],
- struct page, lru);
-}
-
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
return list_empty(&area->free_list[migratetype]);
@@ -922,7 +917,7 @@ struct zone {
CACHELINE_PADDING(_pad1_);
/* free areas of different sizes */
- struct free_area free_area[MAX_ORDER];
+ struct free_area free_area[MAX_ORDER + 1];
/* zone flags, see below */
unsigned long flags;
@@ -1369,7 +1364,7 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
- struct lru_gen_mm_walk mm_walk;
+ struct lru_gen_mm_walk mm_walk;
/* lru_gen_folio list */
struct lru_gen_memcg memcg_lru;
#endif
@@ -1745,7 +1740,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
#define SECTION_BLOCKFLAGS_BITS \
((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
-#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
+#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_ORDER exceeds SECTION_SIZE
#endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a7e3a3405520..dcda20c47b8f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -136,9 +136,6 @@ enum pageflags {
PG_arch_2,
PG_arch_3,
#endif
-#ifdef CONFIG_KASAN_HW_TAGS
- PG_skip_kasan_poison,
-#endif
__NR_PAGEFLAGS,
PG_readahead = PG_reclaim,
@@ -594,12 +591,6 @@ TESTCLEARFLAG(Young, young, PF_ANY)
PAGEFLAG(Idle, idle, PF_ANY)
#endif
-#ifdef CONFIG_KASAN_HW_TAGS
-PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD)
-#else
-PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
-#endif
-
/*
* PageReported() is used to track reported free pages within the Buddy
* allocator. We can use the non-atomic version of the test and set
@@ -926,9 +917,14 @@ static inline bool is_page_hwpoison(struct page *page)
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
+static inline int page_type_has_type(unsigned int page_type)
+{
+ return (int)page_type < PAGE_MAPCOUNT_RESERVE;
+}
+
static inline int page_has_type(struct page *page)
{
- return (int)page->page_type < PAGE_MAPCOUNT_RESERVE;
+ return page_type_has_type(page->page_type);
}
#define PAGE_TYPE_OPS(uname, lname) \
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index bc2e39090a1f..67314f648aeb 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -29,8 +29,6 @@ struct page_ext_operations {
bool need_shared_flags;
};
-extern bool deferred_struct_pages;
-
#ifdef CONFIG_PAGE_EXTENSION
/*
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 5f1ae07d724b..e83c4c095041 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,14 +41,14 @@ extern unsigned int pageblock_order;
* Huge pages are a constant size, but don't exceed the maximum allocation
* granularity.
*/
-#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER - 1)
+#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER)
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
#else /* CONFIG_HUGETLB_PAGE */
/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order (MAX_ORDER-1)
+#define pageblock_order MAX_ORDER
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0acb8e1fb7af..fdcd595d2294 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -504,9 +504,9 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
#define FGP_NOFS 0x00000010
#define FGP_NOWAIT 0x00000020
#define FGP_FOR_MMAP 0x00000040
-#define FGP_ENTRY 0x00000080
-#define FGP_STABLE 0x00000100
+#define FGP_STABLE 0x00000080
+void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
int fgp_flags, gfp_t gfp);
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
@@ -520,7 +520,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
* Looks up the page cache entry at @mapping & @index. If a folio is
* present, it is returned with an increased refcount.
*
- * Otherwise, %NULL is returned.
+ * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
+ * this index. Will not return a shadow, swap or DAX entry.
*/
static inline struct folio *filemap_get_folio(struct address_space *mapping,
pgoff_t index)
@@ -537,8 +538,8 @@ static inline struct folio *filemap_get_folio(struct address_space *mapping,
* present, it is returned locked with an increased refcount.
*
* Context: May sleep.
- * Return: A folio or %NULL if there is no folio in the cache for this
- * index. Will not return a shadow, swap or DAX entry.
+ * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
+ * this index. Will not return a shadow, swap or DAX entry.
*/
static inline struct folio *filemap_lock_folio(struct address_space *mapping,
pgoff_t index)
@@ -555,8 +556,8 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping,
* a new folio is created. The folio is locked, marked as accessed, and
* returned.
*
- * Return: A found or created folio. NULL if no folio is found and failed to
- * create a folio.
+ * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found
+ * and failed to create a folio.
*/
static inline struct folio *filemap_grab_folio(struct address_space *mapping,
pgoff_t index)
@@ -1066,12 +1067,6 @@ static inline void folio_cancel_dirty(struct folio *folio)
bool folio_clear_dirty_for_io(struct folio *folio);
bool clear_page_dirty_for_io(struct page *page);
void folio_invalidate(struct folio *folio, size_t offset, size_t length);
-int __must_check folio_write_one(struct folio *folio);
-static inline int __must_check write_one_page(struct page *page)
-{
- return folio_write_one(page_folio(page));
-}
-
int __set_page_dirty_nobuffers(struct page *page);
bool noop_dirty_folio(struct address_space *mapping, struct folio *folio);
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c63cd44777ec..c5a51481bbb9 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -817,7 +817,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
#endif
#ifndef flush_tlb_fix_spurious_fault
-#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
+#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
#endif
/*
@@ -1191,9 +1191,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma,
}
/*
- * untrack_pfn_moved is called while mremapping a pfnmap for a new region.
+ * untrack_pfn_clear is called while mremapping a pfnmap for a new region,
+ * or when copying the page tables fails while duplicating a vm area.
*/
-static inline void untrack_pfn_moved(struct vm_area_struct *vma)
+static inline void untrack_pfn_clear(struct vm_area_struct *vma)
{
}
#else
@@ -1205,7 +1206,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
extern int track_pfn_copy(struct vm_area_struct *vma);
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size, bool mm_wr_locked);
-extern void untrack_pfn_moved(struct vm_area_struct *vma);
+extern void untrack_pfn_clear(struct vm_area_struct *vma);
#endif
#ifdef CONFIG_MMU
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2a243616f222..689dbe812563 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -79,6 +79,34 @@ static inline void mmdrop_sched(struct mm_struct *mm)
}
#endif
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+ mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+ mmdrop(mm);
+ } else {
+ /*
+ * mmdrop_lazy_tlb must provide a full memory barrier; see the
+ * membarrier comment in finish_task_switch(), which relies on this.
+ */
+ smp_mb();
+ }
+}
+
+static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+ mmdrop_sched(mm);
+ else
+ smp_mb(); /* see mmdrop_lazy_tlb() above */
+}
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 103d1000a5a2..3bb8d21edbb3 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -9,6 +9,7 @@
#include <linux/percpu_counter.h>
#include <linux/xattr.h>
#include <linux/fs_parser.h>
+#include <linux/userfaultfd_k.h>
/* inode in-kernel data */
@@ -45,6 +46,7 @@ struct shmem_sb_info {
kuid_t uid; /* Mount uid for root directory */
kgid_t gid; /* Mount gid for root directory */
bool full_inums; /* If i_ino should be uint or ino_t */
+ bool noswap; /* ignores VM reclaim / swap requests */
ino_t next_ino; /* The next per-sb inode number to use */
ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */
struct mempolicy *mpol; /* default memory policy for mappings */
@@ -151,15 +153,15 @@ extern void shmem_uncharge(struct inode *inode, long pages);
#ifdef CONFIG_USERFAULTFD
#ifdef CONFIG_SHMEM
-extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- bool zeropage, bool wp_copy,
+ uffd_flags_t flags,
struct page **pagep);
#else /* !CONFIG_SHMEM */
-#define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
- src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; })
+#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
+ src_addr, flags, pagep) ({ BUG(); 0; })
#endif /* CONFIG_SHMEM */
#endif /* CONFIG_USERFAULTFD */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 45af70315a94..f8b1d63c63a3 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -167,7 +167,6 @@ struct mem_cgroup;
/*
* struct kmem_cache related prototypes
*/
-void __init kmem_cache_init(void);
bool slab_is_available(void);
struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
@@ -284,7 +283,7 @@ static inline unsigned int arch_slab_minalign(void)
* (PAGE_SIZE*2). Larger requests are passed to the page allocator.
*/
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
+#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 5
#endif
@@ -292,7 +291,7 @@ static inline unsigned int arch_slab_minalign(void)
#ifdef CONFIG_SLUB
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
+#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 3
#endif
@@ -305,7 +304,7 @@ static inline unsigned int arch_slab_minalign(void)
* be allocated from the same page.
*/
#define KMALLOC_SHIFT_HIGH PAGE_SHIFT
-#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
+#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 3
#endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 209a425739a9..bfc3b06b5f8f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -620,18 +620,18 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
/* Cgroup2 doesn't have per-cgroup swappiness */
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- return vm_swappiness;
+ return READ_ONCE(vm_swappiness);
/* root ? */
if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
- return vm_swappiness;
+ return READ_ONCE(vm_swappiness);
- return memcg->swappiness;
+ return READ_ONCE(memcg->swappiness);
}
#else
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
- return vm_swappiness;
+ return READ_ONCE(vm_swappiness);
}
#endif
@@ -641,22 +641,18 @@ extern atomic_t zswap_stored_pages;
#endif
#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
-static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
+static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
if (mem_cgroup_disabled())
return;
- __cgroup_throttle_swaprate(page, gfp_mask);
+ __folio_throttle_swaprate(folio, gfp);
}
#else
-static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
-{
-}
-#endif
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
- cgroup_throttle_swaprate(&folio->page, gfp);
}
+#endif
#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 27e3fd942960..29eb18bb6feb 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -173,6 +173,8 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
{
return copy_page_to_iter(&folio->page, offset, bytes, i);
}
+size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
+ size_t bytes, struct iov_iter *i);
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3767f18114ef..a2c53e98dfd6 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -40,40 +40,55 @@ extern int sysctl_unprivileged_userfaultfd;
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
-/*
- * The mode of operation for __mcopy_atomic and its helpers.
- *
- * This is almost an implementation detail (mcopy_atomic below doesn't take this
- * as a parameter), but it's exposed here because memory-kind-specific
- * implementations (e.g. hugetlbfs) need to know the mode of operation.
- */
-enum mcopy_atomic_mode {
- /* A normal copy_from_user into the destination range. */
- MCOPY_ATOMIC_NORMAL,
- /* Don't copy; map the destination range to the zero page. */
- MCOPY_ATOMIC_ZEROPAGE,
- /* Just install pte(s) with the existing page(s) in the page cache. */
- MCOPY_ATOMIC_CONTINUE,
+/* A combined operation mode + behavior flags. */
+typedef unsigned int __bitwise uffd_flags_t;
+
+/* Mutually exclusive modes of operation. */
+enum mfill_atomic_mode {
+ MFILL_ATOMIC_COPY,
+ MFILL_ATOMIC_ZEROPAGE,
+ MFILL_ATOMIC_CONTINUE,
+ NR_MFILL_ATOMIC_MODES,
};
-extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
+#define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr))
+#define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
+#define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1))
+
+static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
+{
+ return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected);
+}
+
+static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
+{
+ flags &= ~MFILL_ATOMIC_MODE_MASK;
+ return flags | ((__force uffd_flags_t) mode);
+}
+
+/* Flags controlling behavior. These behavior changes are mode-independent. */
+#define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0)
+
+extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, struct page *page,
- bool newly_allocated, bool wp_copy);
-
-extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, __u64 mode);
-extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len,
- atomic_t *mmap_changing);
-extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long len, atomic_t *mmap_changing);
+ bool newly_allocated, uffd_flags_t flags);
+
+extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len,
+ atomic_t *mmap_changing, uffd_flags_t flags);
+extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len,
+ atomic_t *mmap_changing);
+extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
-extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
/* mm helpers */
@@ -179,6 +194,7 @@ extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
unsigned long end, struct list_head *uf);
extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
+extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
#else /* CONFIG_USERFAULTFD */
@@ -274,8 +290,30 @@ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma)
return false;
}
+static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+ return false;
+}
+
#endif /* CONFIG_USERFAULTFD */
+static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
+{
+ /* Only wr-protect mode uses pte markers */
+ if (!userfaultfd_wp(vma))
+ return false;
+
+ /* File-based uffd-wp always needs markers */
+ if (!vma_is_anonymous(vma))
+ return true;
+
+ /*
+ * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED is
+ * enabled (to apply markers on zero pages).
+ */
+ return userfaultfd_wp_unpopulated(vma);
+}
+
static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 7f5d1caf5890..8abfa1240040 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -150,6 +150,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
DIRECT_MAP_LEVEL2_SPLIT,
DIRECT_MAP_LEVEL3_SPLIT,
#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+ VMA_LOCK_SUCCESS,
+ VMA_LOCK_ABORT,
+ VMA_LOCK_RETRY,
+ VMA_LOCK_MISS,
+#endif
NR_VM_EVENT_ITEMS
};
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 69250efa03d1..c720be70c8dd 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -14,6 +14,7 @@
struct vm_area_struct; /* vma defining user mapping in mm_types.h */
struct notifier_block; /* in notifier.h */
+struct iov_iter; /* in uio.h */
/* bits in flags of vmalloc's vm_struct below */
#define VM_IOREMAP 0x00000001 /* ioremap() and friends */
@@ -131,12 +132,8 @@ extern void *vm_map_ram(struct page **pages, unsigned int count, int node);
extern void vm_unmap_aliases(void);
#ifdef CONFIG_MMU
-extern void __init vmalloc_init(void);
extern unsigned long vmalloc_nr_pages(void);
#else
-static inline void vmalloc_init(void)
-{
-}
static inline unsigned long vmalloc_nr_pages(void) { return 0; }
#endif
@@ -251,7 +248,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
#endif
/* for /proc/kcore */
-extern long vread(char *buf, char *addr, unsigned long count);
+extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
/*
* Internals. Don't use..
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 19cf5b6892ce..fed855bae6d8 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+#define count_vm_vma_lock_event(x) count_vm_event(x)
+#else
+#define count_vm_vma_lock_event(x) do {} while (0)
+#endif
+
#define __count_zid_vm_events(item, zid, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
index ef75ea606ab2..25103e67737c 100644
--- a/include/trace/events/cma.h
+++ b/include/trace/events/cma.h
@@ -8,37 +8,6 @@
#include <linux/types.h>
#include <linux/tracepoint.h>
-DECLARE_EVENT_CLASS(cma_alloc_class,
-
- TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
- unsigned long count, unsigned int align),
-
- TP_ARGS(name, pfn, page, count, align),
-
- TP_STRUCT__entry(
- __string(name, name)
- __field(unsigned long, pfn)
- __field(const struct page *, page)
- __field(unsigned long, count)
- __field(unsigned int, align)
- ),
-
- TP_fast_assign(
- __assign_str(name, name);
- __entry->pfn = pfn;
- __entry->page = page;
- __entry->count = count;
- __entry->align = align;
- ),
-
- TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u",
- __get_str(name),
- __entry->pfn,
- __entry->page,
- __entry->count,
- __entry->align)
-);
-
TRACE_EVENT(cma_release,
TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
@@ -125,12 +94,35 @@ TRACE_EVENT(cma_alloc_finish,
__entry->errorno)
);
-DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry,
+TRACE_EVENT(cma_alloc_busy_retry,
TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
unsigned long count, unsigned int align),
- TP_ARGS(name, pfn, page, count, align)
+ TP_ARGS(name, pfn, page, count, align),
+
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field(unsigned long, pfn)
+ __field(const struct page *, page)
+ __field(unsigned long, count)
+ __field(unsigned int, align)
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->pfn = pfn;
+ __entry->page = page;
+ __entry->count = count;
+ __entry->align = align;
+ ),
+
+ TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u",
+ __get_str(name),
+ __entry->pfn,
+ __entry->page,
+ __entry->count,
+ __entry->align)
);
#endif /* _TRACE_CMA_H */
diff --git a/include/trace/events/ksm.h b/include/trace/events/ksm.h
new file mode 100644
index 000000000000..b5ac35c1d0e8
--- /dev/null
+++ b/include/trace/events/ksm.h
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ksm
+
+#if !defined(_TRACE_KSM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KSM_H
+
+#include <linux/tracepoint.h>
+
+/**
+ * ksm_scan_template - called for start / stop scan
+ *
+ * @seq: sequence number of scan
+ * @rmap_entries: actual number of rmap entries
+ *
+ * Allows to trace the start / stop of a ksm scan.
+ */
+DECLARE_EVENT_CLASS(ksm_scan_template,
+
+ TP_PROTO(int seq, u32 rmap_entries),
+
+ TP_ARGS(seq, rmap_entries),
+
+ TP_STRUCT__entry(
+ __field(int, seq)
+ __field(u32, rmap_entries)
+ ),
+
+ TP_fast_assign(
+ __entry->seq = seq;
+ __entry->rmap_entries = rmap_entries;
+ ),
+
+ TP_printk("seq %d rmap size %d",
+ __entry->seq, __entry->rmap_entries)
+);
+
+/**
+ * ksm_start_scan - called after a new ksm scan is started
+ *
+ * @seq: sequence number of scan
+ * @rmap_entries: actual number of rmap entries
+ *
+ * Allows to trace the start of a ksm scan.
+ */
+DEFINE_EVENT(ksm_scan_template, ksm_start_scan,
+
+ TP_PROTO(int seq, u32 rmap_entries),
+
+ TP_ARGS(seq, rmap_entries)
+);
+
+/**
+ * ksm_stop_scan - called after a new ksm scan has completed
+ *
+ * @seq: sequence number of scan
+ * @rmap_entries: actual number of rmap entries
+ *
+ * Allows to trace the completion of a ksm scan.
+ */
+DEFINE_EVENT(ksm_scan_template, ksm_stop_scan,
+
+ TP_PROTO(int seq, u32 rmap_entries),
+
+ TP_ARGS(seq, rmap_entries)
+);
+
+/**
+ * ksm_enter_exit_template - called after a new process has been added / removed from ksm
+ *
+ * @mm: address of the mm object of the process
+ *
+ * Allows to trace when a process has been added or removed from ksm.
+ */
+DECLARE_EVENT_CLASS(ksm_enter_exit_template,
+
+ TP_PROTO(void *mm),
+
+ TP_ARGS(mm),
+
+ TP_STRUCT__entry(
+ __field(void *, mm)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ ),
+
+ TP_printk("mm %p", __entry->mm)
+);
+
+/**
+ * ksm_enter - called after a new process has been added to ksm
+ *
+ * @mm: address of the mm object of the process
+ *
+ * Allows to trace when a process has been added to ksm.
+ */
+DEFINE_EVENT(ksm_enter_exit_template, ksm_enter,
+
+ TP_PROTO(void *mm),
+
+ TP_ARGS(mm)
+);
+
+/**
+ * ksm_exit - called after a new process has been removed from ksm
+ *
+ * @mm: address of the mm object of the process
+ *
+ * Allows to trace when a process has been removed from ksm.
+ */
+DEFINE_EVENT(ksm_enter_exit_template, ksm_exit,
+
+ TP_PROTO(void *mm),
+
+ TP_ARGS(mm)
+);
+
+/**
+ * ksm_merge_one_page - called after a page has been merged
+ *
+ * @pfn: page frame number of ksm page
+ * @rmap_item: address of rmap_item object
+ * @mm: address of the process mm struct
+ * @err: success
+ *
+ * Allows to trace the ksm merging of individual pages.
+ */
+TRACE_EVENT(ksm_merge_one_page,
+
+ TP_PROTO(unsigned long pfn, void *rmap_item, void *mm, int err),
+
+ TP_ARGS(pfn, rmap_item, mm, err),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(void *, rmap_item)
+ __field(void *, mm)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = pfn;
+ __entry->rmap_item = rmap_item;
+ __entry->mm = mm;
+ __entry->err = err;
+ ),
+
+ TP_printk("ksm pfn %lu rmap_item %p mm %p error %d",
+ __entry->pfn, __entry->rmap_item, __entry->mm, __entry->err)
+);
+
+/**
+ * ksm_merge_with_ksm_page - called after a page has been merged with a ksm page
+ *
+ * @ksm_page: address of ksm page
+ * @pfn: page frame number of ksm page
+ * @rmap_item: address of rmap_item object
+ * @mm: address of the mm object of the process
+ * @err: success
+ *
+ * Allows to trace the merging of a page with a ksm page.
+ */
+TRACE_EVENT(ksm_merge_with_ksm_page,
+
+ TP_PROTO(void *ksm_page, unsigned long pfn, void *rmap_item, void *mm, int err),
+
+ TP_ARGS(ksm_page, pfn, rmap_item, mm, err),
+
+ TP_STRUCT__entry(
+ __field(void *, ksm_page)
+ __field(unsigned long, pfn)
+ __field(void *, rmap_item)
+ __field(void *, mm)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __entry->ksm_page = ksm_page;
+ __entry->pfn = pfn;
+ __entry->rmap_item = rmap_item;
+ __entry->mm = mm;
+ __entry->err = err;
+ ),
+
+ TP_printk("%spfn %lu rmap_item %p mm %p error %d",
+ (__entry->ksm_page ? "ksm " : ""),
+ __entry->pfn, __entry->rmap_item, __entry->mm, __entry->err)
+);
+
+/**
+ * ksm_remove_ksm_page - called after a ksm page has been removed
+ *
+ * @pfn: page frame number of ksm page
+ *
+ * Allows to trace the removing of stable ksm pages.
+ */
+TRACE_EVENT(ksm_remove_ksm_page,
+
+ TP_PROTO(unsigned long pfn),
+
+ TP_ARGS(pfn),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = pfn;
+ ),
+
+ TP_printk("pfn %lu", __entry->pfn)
+);
+
+/**
+ * ksm_remove_rmap_item - called after a rmap_item has been removed from the
+ * stable tree
+ *
+ * @pfn: page frame number of ksm page
+ * @rmap_item: address of rmap_item object
+ * @mm: address of the process mm struct
+ *
+ * Allows to trace the removal of pages from the stable tree list.
+ */
+TRACE_EVENT(ksm_remove_rmap_item,
+
+ TP_PROTO(unsigned long pfn, void *rmap_item, void *mm),
+
+ TP_ARGS(pfn, rmap_item, mm),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(void *, rmap_item)
+ __field(void *, mm)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = pfn;
+ __entry->rmap_item = rmap_item;
+ __entry->mm = mm;
+ ),
+
+ TP_printk("pfn %lu rmap_item %p mm %p",
+ __entry->pfn, __entry->rmap_item, __entry->mm)
+);
+
+#endif /* _TRACE_KSM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 9db52bc4ce19..b63e7c0fbbe5 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -55,8 +55,7 @@
#ifdef CONFIG_KASAN_HW_TAGS
#define __def_gfpflag_names_kasan , \
gfpflag_string(__GFP_SKIP_ZERO), \
- gfpflag_string(__GFP_SKIP_KASAN_POISON), \
- gfpflag_string(__GFP_SKIP_KASAN_UNPOISON)
+ gfpflag_string(__GFP_SKIP_KASAN)
#else
#define __def_gfpflag_names_kasan
#endif
@@ -67,77 +66,80 @@
) : "none"
#ifdef CONFIG_MMU
-#define IF_HAVE_PG_MLOCK(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_MLOCK(_name) ,{1UL << PG_##_name, __stringify(_name)}
#else
-#define IF_HAVE_PG_MLOCK(flag,string)
+#define IF_HAVE_PG_MLOCK(_name)
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-#define IF_HAVE_PG_UNCACHED(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_UNCACHED(_name) ,{1UL << PG_##_name, __stringify(_name)}
#else
-#define IF_HAVE_PG_UNCACHED(flag,string)
+#define IF_HAVE_PG_UNCACHED(_name)
#endif
#ifdef CONFIG_MEMORY_FAILURE
-#define IF_HAVE_PG_HWPOISON(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_HWPOISON(_name) ,{1UL << PG_##_name, __stringify(_name)}
#else
-#define IF_HAVE_PG_HWPOISON(flag,string)
+#define IF_HAVE_PG_HWPOISON(_name)
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
-#define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_IDLE(_name) ,{1UL << PG_##_name, __stringify(_name)}
#else
-#define IF_HAVE_PG_IDLE(flag,string)
+#define IF_HAVE_PG_IDLE(_name)
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
-#define IF_HAVE_PG_ARCH_X(flag,string) ,{1UL << flag, string}
+#define IF_HAVE_PG_ARCH_X(_name) ,{1UL << PG_##_name, __stringify(_name)}
#else
-#define IF_HAVE_PG_ARCH_X(flag,string)
+#define IF_HAVE_PG_ARCH_X(_name)
#endif
-#ifdef CONFIG_KASAN_HW_TAGS
-#define IF_HAVE_PG_SKIP_KASAN_POISON(flag,string) ,{1UL << flag, string}
-#else
-#define IF_HAVE_PG_SKIP_KASAN_POISON(flag,string)
-#endif
+#define DEF_PAGEFLAG_NAME(_name) { 1UL << PG_##_name, __stringify(_name) }
#define __def_pageflag_names \
- {1UL << PG_locked, "locked" }, \
- {1UL << PG_waiters, "waiters" }, \
- {1UL << PG_error, "error" }, \
- {1UL << PG_referenced, "referenced" }, \
- {1UL << PG_uptodate, "uptodate" }, \
- {1UL << PG_dirty, "dirty" }, \
- {1UL << PG_lru, "lru" }, \
- {1UL << PG_active, "active" }, \
- {1UL << PG_workingset, "workingset" }, \
- {1UL << PG_slab, "slab" }, \
- {1UL << PG_owner_priv_1, "owner_priv_1" }, \
- {1UL << PG_arch_1, "arch_1" }, \
- {1UL << PG_reserved, "reserved" }, \
- {1UL << PG_private, "private" }, \
- {1UL << PG_private_2, "private_2" }, \
- {1UL << PG_writeback, "writeback" }, \
- {1UL << PG_head, "head" }, \
- {1UL << PG_mappedtodisk, "mappedtodisk" }, \
- {1UL << PG_reclaim, "reclaim" }, \
- {1UL << PG_swapbacked, "swapbacked" }, \
- {1UL << PG_unevictable, "unevictable" } \
-IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \
-IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
-IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
-IF_HAVE_PG_IDLE(PG_young, "young" ) \
-IF_HAVE_PG_IDLE(PG_idle, "idle" ) \
-IF_HAVE_PG_ARCH_X(PG_arch_2, "arch_2" ) \
-IF_HAVE_PG_ARCH_X(PG_arch_3, "arch_3" ) \
-IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison")
+ DEF_PAGEFLAG_NAME(locked), \
+ DEF_PAGEFLAG_NAME(waiters), \
+ DEF_PAGEFLAG_NAME(error), \
+ DEF_PAGEFLAG_NAME(referenced), \
+ DEF_PAGEFLAG_NAME(uptodate), \
+ DEF_PAGEFLAG_NAME(dirty), \
+ DEF_PAGEFLAG_NAME(lru), \
+ DEF_PAGEFLAG_NAME(active), \
+ DEF_PAGEFLAG_NAME(workingset), \
+ DEF_PAGEFLAG_NAME(slab), \
+ DEF_PAGEFLAG_NAME(owner_priv_1), \
+ DEF_PAGEFLAG_NAME(arch_1), \
+ DEF_PAGEFLAG_NAME(reserved), \
+ DEF_PAGEFLAG_NAME(private), \
+ DEF_PAGEFLAG_NAME(private_2), \
+ DEF_PAGEFLAG_NAME(writeback), \
+ DEF_PAGEFLAG_NAME(head), \
+ DEF_PAGEFLAG_NAME(mappedtodisk), \
+ DEF_PAGEFLAG_NAME(reclaim), \
+ DEF_PAGEFLAG_NAME(swapbacked), \
+ DEF_PAGEFLAG_NAME(unevictable) \
+IF_HAVE_PG_MLOCK(mlocked) \
+IF_HAVE_PG_UNCACHED(uncached) \
+IF_HAVE_PG_HWPOISON(hwpoison) \
+IF_HAVE_PG_IDLE(idle) \
+IF_HAVE_PG_IDLE(young) \
+IF_HAVE_PG_ARCH_X(arch_2) \
+IF_HAVE_PG_ARCH_X(arch_3)
#define show_page_flags(flags) \
(flags) ? __print_flags(flags, "|", \
__def_pageflag_names \
) : "none"
+#define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) }
+
+#define __def_pagetype_names \
+ DEF_PAGETYPE_NAME(offline), \
+ DEF_PAGETYPE_NAME(guard), \
+ DEF_PAGETYPE_NAME(table), \
+ DEF_PAGETYPE_NAME(buddy)
+
#if defined(CONFIG_X86)
#define __VM_ARCH_SPECIFIC_1 {VM_PAT, "pat" }
#elif defined(CONFIG_PPC)
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 005e5e306266..66dd4cd277bd 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -38,7 +38,8 @@
UFFD_FEATURE_MINOR_HUGETLBFS | \
UFFD_FEATURE_MINOR_SHMEM | \
UFFD_FEATURE_EXACT_ADDRESS | \
- UFFD_FEATURE_WP_HUGETLBFS_SHMEM)
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \
+ UFFD_FEATURE_WP_UNPOPULATED)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -203,6 +204,12 @@ struct uffdio_api {
*
* UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd
* write-protection mode is supported on both shmem and hugetlbfs.
+ *
+ * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd
+ * write-protection mode will always apply to unpopulated pages
+ * (i.e. empty ptes). This will be the default behavior for shmem
+ * & hugetlbfs, so this flag only affects anonymous memory behavior
+ * when userfault write-protection mode is registered.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -217,6 +224,7 @@ struct uffdio_api {
#define UFFD_FEATURE_MINOR_SHMEM (1<<10)
#define UFFD_FEATURE_EXACT_ADDRESS (1<<11)
#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
+#define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
__u64 features;
__u64 ioctls;
@@ -297,6 +305,13 @@ struct uffdio_writeprotect {
struct uffdio_continue {
struct uffdio_range range;
#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
+ /*
+ * UFFDIO_CONTINUE_MODE_WP will map the page write protected on
+ * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the
+ * write protected ioctl is implemented for the range
+ * according to the uffdio_register.ioctls.
+ */
+#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1)
__u64 mode;
/*
diff --git a/init/main.c b/init/main.c
index bb87b789c543..43bc4c82dc58 100644
--- a/init/main.c
+++ b/init/main.c
@@ -62,7 +62,6 @@
#include <linux/rmap.h>
#include <linux/mempolicy.h>
#include <linux/key.h>
-#include <linux/page_ext.h>
#include <linux/debug_locks.h>
#include <linux/debugobjects.h>
#include <linux/lockdep.h>
@@ -807,69 +806,6 @@ static inline void initcall_debug_enable(void)
}
#endif
-/* Report memory auto-initialization states for this boot. */
-static void __init report_meminit(void)
-{
- const char *stack;
-
- if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
- stack = "all(pattern)";
- else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
- stack = "all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
- stack = "byref_all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
- stack = "byref(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
- stack = "__user(zero)";
- else
- stack = "off";
-
- pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
- stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
- want_init_on_free() ? "on" : "off");
- if (want_init_on_free())
- pr_info("mem auto-init: clearing system memory may take some time...\n");
-}
-
-/*
- * Set up kernel memory allocators
- */
-static void __init mm_init(void)
-{
- /*
- * page_ext requires contiguous pages,
- * bigger than MAX_ORDER unless SPARSEMEM.
- */
- page_ext_init_flatmem();
- init_mem_debugging_and_hardening();
- kfence_alloc_pool();
- report_meminit();
- kmsan_init_shadow();
- stack_depot_early_init();
- mem_init();
- mem_init_print_info();
- kmem_cache_init();
- /*
- * page_owner must be initialized after buddy is ready, and also after
- * slab is ready so that stack_depot_init() works properly
- */
- page_ext_init_flatmem_late();
- kmemleak_init();
- pgtable_init();
- debug_objects_mem_init();
- vmalloc_init();
- /* If no deferred init page_ext now, as vmap is fully initialized */
- if (!deferred_struct_pages)
- page_ext_init();
- /* Should be run before the first non-init thread is created */
- init_espfix_bsp();
- /* Should be run after espfix64 is set up. */
- pti_init();
- kmsan_init_runtime();
- mm_cache_init();
-}
-
#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET
DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,
randomize_kstack_offset);
@@ -972,9 +908,6 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
boot_cpu_hotplug_init();
- build_all_zonelists(NULL);
- page_alloc_init();
-
pr_notice("Kernel command line: %s\n", saved_command_line);
/* parameters may set static keys */
jump_label_init();
@@ -996,13 +929,13 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
/*
* These use large bootmem allocations and must precede
- * kmem_cache_init()
+ * initialization of page allocator
*/
setup_log_buf(0);
vfs_caches_init_early();
sort_main_extable();
trap_init();
- mm_init();
+ mm_core_init();
poking_init();
ftrace_init();
@@ -1631,9 +1564,6 @@ static noinline void __init kernel_init_freeable(void)
padata_init();
page_alloc_init_late();
- /* Initialize page ext after all struct pages are initialized. */
- if (deferred_struct_pages)
- page_ext_init();
do_basic_setup();
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6c0a92ca6bb5..189895288d9d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -623,7 +623,7 @@ static int finish_cpu(unsigned int cpu)
*/
if (mm != &init_mm)
idle->active_mm = &init_mm;
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
return 0;
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 755f5f08ab38..90ce1dfd591c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -474,7 +474,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vmap_area, va_start);
VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
log_buf_vmcoreinfo_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
VMCOREINFO_NUMBER(NR_FREE_PAGES);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 4d40dcce7604..1acec2e22827 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
void *addr;
int ret = -ENOMEM;
- /* Cannot allocate larger than MAX_ORDER-1 */
- order = min(get_order(pool_size), MAX_ORDER-1);
+ /* Cannot allocate larger than MAX_ORDER */
+ order = min(get_order(pool_size), MAX_ORDER);
do {
pool_size = 1 << (PAGE_SHIFT + order);
@@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void)
/*
* If coherent_pool was not used on the command line, default the pool
- * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1.
+ * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER.
*/
if (!atomic_pool_size) {
unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 273a0fe7910a..a0433f37b024 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -814,7 +814,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
- if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
+ if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER)
goto fail;
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
diff --git a/kernel/exit.c b/kernel/exit.c
index f2afdb0add7c..86902cb5ab78 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -537,7 +537,7 @@ static void exit_mm(void)
return;
sync_mm_rss(mm);
mmap_read_lock(mm);
- mmgrab(mm);
+ mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c92f224c68c..9051bc07b600 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+ if (!vma->vm_lock)
+ return false;
+
+ init_rwsem(&vma->vm_lock->lock);
+ vma->vm_lock_seq = -1;
+
+ return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+ kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (vma)
- vma_init(vma, mm);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+ if (!vma_lock_alloc(vma)) {
+ kmem_cache_free(vm_area_cachep, vma);
+ return NULL;
+ }
+
return vma;
}
@@ -465,26 +501,56 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (new) {
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- /*
- * orig->shared.rb may be modified concurrently, but the clone
- * will be reinitialized.
- */
- data_race(memcpy(new, orig, sizeof(*new)));
- INIT_LIST_HEAD(&new->anon_vma_chain);
- dup_anon_vma_name(orig, new);
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ /*
+ * orig->shared.rb may be modified concurrently, but the clone
+ * will be reinitialized.
+ */
+ data_race(memcpy(new, orig, sizeof(*new)));
+ if (!vma_lock_alloc(new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
}
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
return new;
}
-void vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
{
+ vma_numab_state_free(vma);
free_anon_vma_name(vma);
+ vma_lock_free(vma);
kmem_cache_free(vm_area_cachep, vma);
}
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
+{
+ struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+ vm_rcu);
+
+ /* The vma should not be locked while being destroyed. */
+ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+ __vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+ __vm_area_free(vma);
+#endif
+}
+
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -775,6 +841,67 @@ static void check_mm(struct mm_struct *mm)
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void cleanup_lazy_tlbs(struct mm_struct *mm)
+{
+ if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * In this case, lazy tlb mms are refcounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ return;
+ }
+
+ /*
+ * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+ * requires lazy mm users to switch to another mm when the refcount
+ * drops to zero, before the mm is freed. This requires IPIs here to
+ * switch kernel threads to init_mm.
+ *
+ * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+ * switch with the final userspace teardown TLB flush which leaves the
+ * mm lazy on this CPU but no others, reducing the need for additional
+ * IPIs here. There are cases where a final IPI is still required here,
+ * such as the final mmdrop being performed on a different CPU than the
+ * one exiting, or kernel threads using the mm when userspace exits.
+ *
+ * IPI overheads have not been found to be expensive, but they could be
+ * reduced in a number of possible ways, for example (roughly
+ * increasing order of complexity):
+ * - The last lazy reference created by exit_mm() could instead switch
+ * to init_mm, however it's probable this will run on the same CPU
+ * immediately afterwards, so this may not reduce IPIs much.
+ * - A batch of mms requiring IPIs could be gathered and freed at once.
+ * - CPUs store active_mm where it can be remotely checked without a
+ * lock, to filter out false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away from the
+ * mm could clear mm_cpumask to reduce some IPIs, perhaps together
+ * with some batching or delaying of the final IPIs.
+ * - A delayed freeing and RCU-like quiescing sequence based on mm
+ * switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
@@ -786,6 +913,10 @@ void __mmdrop(struct mm_struct *mm)
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ cleanup_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
@@ -1128,6 +1259,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+ mm->mm_lock_seq = 0;
+#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
@@ -3065,6 +3199,9 @@ void __init proc_caches_init(void)
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
mmap_init();
nsproxy_cache_init();
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7e6751b29101..470708c205e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1415,14 +1415,18 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
+ /*
+ * It is possible for mm to be the same as tsk->active_mm, but
+ * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
+ * because these references are not equivalent.
+ */
+ mmgrab(mm);
+
task_lock(tsk);
/* Hold off tlb flush IPIs while switching mm's */
local_irq_disable();
active_mm = tsk->active_mm;
- if (active_mm != mm) {
- mmgrab(mm);
- tsk->active_mm = mm;
- }
+ tsk->active_mm = mm;
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
@@ -1439,12 +1443,9 @@ void kthread_use_mm(struct mm_struct *mm)
* memory barrier after storing to tsk->mm, before accessing
* user-space memory. A full memory barrier for membarrier
* {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
- * mmdrop(), or explicitly with smp_mb().
+ * mmdrop_lazy_tlb().
*/
- if (active_mm != mm)
- mmdrop(active_mm);
- else
- smp_mb();
+ mmdrop_lazy_tlb(active_mm);
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1472,10 +1473,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
+ mmgrab_lazy_tlb(mm);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
task_unlock(tsk);
+
+ mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..143e46bd2a68 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5203,13 +5203,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
- * provided by mmdrop(),
+ * provided by mmdrop_lazy_tlb(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop_sched(mm);
+ mmdrop_lazy_tlb_sched(mm);
}
+
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -5266,9 +5267,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
/*
* kernel -> kernel lazy + transfer active
- * user -> kernel lazy + mmgrab() active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
*
- * kernel -> user switch + mmdrop() active
+ * kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
*/
if (!next->mm) { // to kernel
@@ -5276,7 +5277,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
next->active_mm = prev->active_mm;
if (prev->mm) // from user
- mmgrab(prev->active_mm);
+ mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
@@ -5293,7 +5294,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
+ /* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
@@ -9935,7 +9936,7 @@ void __init sched_init(void)
/*
* The boot idle thread does lazy MMU switching as well:
*/
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..db6fc9d978ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2928,6 +2928,24 @@ static void reset_ptenuma_scan(struct task_struct *p)
p->mm->numa_scan_offset = 0;
}
+static bool vma_is_accessed(struct vm_area_struct *vma)
+{
+ unsigned long pids;
+ /*
+ * Allow unconditional access for the first two scans, so that all the
+ * pages of the VMAs get prot_none faults introduced irrespective of
+ * accesses. This also avoids any side effect of task scanning
+ * amplifying the unfairness of disjoint sets of VMAs' accesses.
+ */
+ if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+ return true;
+
+ pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
+ return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+}
+
+#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
+
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
@@ -3027,6 +3045,45 @@ static void task_numa_work(struct callback_head *work)
if (!vma_is_accessible(vma))
continue;
+ /* Initialise new per-VMA NUMAB state. */
+ if (!vma->numab_state) {
+ vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
+ GFP_KERNEL);
+ if (!vma->numab_state)
+ continue;
+
+ vma->numab_state->next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+
+ /* The PID reset happens 4 scan delays after the scan start */
+ vma->numab_state->next_pid_reset = vma->numab_state->next_scan +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+ }
+
+ /*
+ * Scanning the VMAs of short-lived tasks adds more overhead, so
+ * delay the scan for new VMAs.
+ */
+ if (mm->numa_scan_seq && time_before(jiffies,
+ vma->numab_state->next_scan))
+ continue;
+
+ /* Do not scan the VMA if the task has not accessed it */
+ if (!vma_is_accessed(vma))
+ continue;
+
+ /*
+ * Reset access PIDs regularly for old VMAs. Reset only after checking the
+ * VMA for recent access, to avoid clearing PID info before that check.
+ */
+ if (mm->numa_scan_seq &&
+ time_after(jiffies, vma->numab_state->next_pid_reset)) {
+ vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+ vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
+ vma->numab_state->access_pids[1] = 0;
+ }
+
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 39d1d93164bd..5cd8183bb4c1 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -791,6 +791,16 @@ config DEBUG_VM
If unsure, say N.
+config DEBUG_VM_SHOOT_LAZIES
+ bool "Debug MMU_LAZY_TLB_SHOOTDOWN implementation"
+ depends on DEBUG_VM
+ depends on MMU_LAZY_TLB_SHOOTDOWN
+ help
+ Enable additional IPIs that ensure lazy tlb mm references are removed
+ before the mm is freed.
+
+ If unsure, say N.
+
config DEBUG_VM_MAPLE_TREE
bool "Debug VM maple trees"
depends on DEBUG_VM
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 274014e4eafe..34dd6bdf2fba 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -172,6 +172,18 @@ static int copyout(void __user *to, const void *from, size_t n)
return n;
}
+static int copyout_nofault(void __user *to, const void *from, size_t n)
+{
+ long res;
+
+ if (should_fail_usercopy())
+ return n;
+
+ res = copy_to_user_nofault(to, from, n);
+
+ return res < 0 ? n : res;
+}
+
static int copyin(void *to, const void __user *from, size_t n)
{
size_t res = n;
@@ -734,6 +746,42 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
}
EXPORT_SYMBOL(copy_page_to_iter);
+size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t res = 0;
+
+ if (!page_copy_sane(page, offset, bytes))
+ return 0;
+ if (WARN_ON_ONCE(i->data_source))
+ return 0;
+ if (unlikely(iov_iter_is_pipe(i)))
+ return copy_page_to_iter_pipe(page, offset, bytes, i);
+ page += offset / PAGE_SIZE; // first subpage
+ offset %= PAGE_SIZE;
+ while (1) {
+ void *kaddr = kmap_local_page(page);
+ size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
+
+ iterate_and_advance(i, n, base, len, off,
+ copyout_nofault(base, kaddr + offset + off, len),
+ memcpy(base, kaddr + offset + off, len)
+ )
+ kunmap_local(kaddr);
+ res += n;
+ bytes -= n;
+ if (!bytes || !n)
+ break;
+ offset += n;
+ if (offset == PAGE_SIZE) {
+ page++;
+ offset = 0;
+ }
+ }
+ return res;
+}
+EXPORT_SYMBOL(copy_page_to_iter_nofault);
+
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 7ff2a821a2a1..5577e6da6fe3 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -5814,6 +5814,7 @@ int mas_preallocate(struct ma_state *mas, gfp_t gfp)
mas_reset(mas);
return ret;
}
+EXPORT_SYMBOL_GPL(mas_preallocate);
/*
* mas_destroy() - destroy a maple state.
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 036da8e295d1..2f5aa851834e 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -17,6 +17,7 @@
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
+#include <linux/kmsan.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
@@ -306,6 +307,11 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
stack->handle.extra = 0;
memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
pool_offset += required_size;
+ /*
+ * Let KMSAN know the stored stack record is initialized. This shall
+ * prevent false positive reports if instrumented code accesses it.
+ */
+ kmsan_unpoison_memory(stack, required_size);
return stack;
}
@@ -465,6 +471,12 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
struct stack_record *stack;
*entries = NULL;
+ /*
+ * Let KMSAN know *entries is initialized. This shall prevent false
+ * positive reports if instrumented code accesses it.
+ */
+ kmsan_unpoison_memory(entries, sizeof(*entries));
+
if (!handle)
return 0;
diff --git a/lib/test_printf.c b/lib/test_printf.c
index 46b4e6c414a3..7677ebccf3c3 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -642,12 +642,26 @@ page_flags_test(int section, int node, int zone, int last_cpupid,
test(cmp_buf, "%pGp", &flags);
}
+static void __init page_type_test(unsigned int page_type, const char *name,
+ char *cmp_buf)
+{
+ unsigned long size;
+
+ size = scnprintf(cmp_buf, BUF_SIZE, "%#x(", page_type);
+ if (page_type_has_type(page_type))
+ size += scnprintf(cmp_buf + size, BUF_SIZE - size, "%s", name);
+
+ snprintf(cmp_buf + size, BUF_SIZE - size, ")");
+ test(cmp_buf, "%pGt", &page_type);
+}
+
static void __init
flags(void)
{
unsigned long flags;
char *cmp_buffer;
gfp_t gfp;
+ unsigned int page_type;
cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL);
if (!cmp_buffer)
@@ -687,6 +701,18 @@ flags(void)
gfp |= __GFP_HIGH;
test(cmp_buffer, "%pGg", &gfp);
+ page_type = ~0;
+ page_type_test(page_type, "", cmp_buffer);
+
+ page_type = 10;
+ page_type_test(page_type, "", cmp_buffer);
+
+ page_type = ~PG_buddy;
+ page_type_test(page_type, "buddy", cmp_buffer);
+
+ page_type = ~(PG_table | PG_buddy);
+ page_type_test(page_type, "table|buddy", cmp_buffer);
+
kfree(cmp_buffer);
}
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index be71a03c936a..fbe320b5e89f 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -2052,6 +2052,25 @@ char *format_page_flags(char *buf, char *end, unsigned long flags)
return buf;
}
+static
+char *format_page_type(char *buf, char *end, unsigned int page_type)
+{
+ buf = number(buf, end, page_type, default_flag_spec);
+
+ if (buf < end)
+ *buf = '(';
+ buf++;
+
+ if (page_type_has_type(page_type))
+ buf = format_flags(buf, end, ~page_type, pagetype_names);
+
+ if (buf < end)
+ *buf = ')';
+ buf++;
+
+ return buf;
+}
+
static noinline_for_stack
char *flags_string(char *buf, char *end, void *flags_ptr,
struct printf_spec spec, const char *fmt)
@@ -2065,6 +2084,8 @@ char *flags_string(char *buf, char *end, void *flags_ptr,
switch (fmt[1]) {
case 'p':
return format_page_flags(buf, end, *(unsigned long *)flags_ptr);
+ case 't':
+ return format_page_type(buf, end, *(unsigned int *)flags_ptr);
case 'v':
flags = *(unsigned long *)flags_ptr;
names = vmaflag_names;
diff --git a/mm/Kconfig b/mm/Kconfig
index 4751031f3f05..6ee3b48ed298 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -346,9 +346,9 @@ config SHUFFLE_PAGE_ALLOCATOR
the presence of a memory-side-cache. There are also incidental
security benefits as it reduces the predictability of page
allocations to compliment SLAB_FREELIST_RANDOM, but the
- default granularity of shuffling on the "MAX_ORDER - 1" i.e,
- 10th order of pages is selected based on cache utilization
- benefits on x86.
+ default granularity of shuffling on the MAX_ORDER i.e, 10th
+ order of pages is selected based on cache utilization benefits
+ on x86.
While the randomization improves cache utilization it may
negatively impact workloads on platforms without a cache. For
@@ -666,8 +666,8 @@ config HUGETLB_PAGE_SIZE_VARIABLE
HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
on a platform.
- Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be
- clamped down to MAX_ORDER - 1.
+ Note that the pageblock_order cannot exceed MAX_ORDER and will be
+ clamped down to MAX_ORDER.
config CONTIG_ALLOC
def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
@@ -1100,6 +1100,15 @@ comment "GUP_TEST needs to have DEBUG_FS enabled"
config GUP_GET_PXX_LOW_HIGH
bool
+config DMAPOOL_TEST
+ tristate "Enable a module to run time tests on dma_pool"
+ depends on HAS_DMA
+ help
+ Provides a test module that will allocate and free many blocks of
+ various sizes and report how long it takes. This is intended to
+ provide a consistent way to measure how changes to the
+ dma_pool_alloc/free routines affect performance.
+
config ARCH_HAS_PTE_SPECIAL
bool
@@ -1202,6 +1211,18 @@ config LRU_GEN_STATS
This option has a per-memcg and per-node memory overhead.
# }
+config ARCH_SUPPORTS_PER_VMA_LOCK
+ def_bool n
+
+config PER_VMA_LOCK
+ def_bool y
+ depends on ARCH_SUPPORTS_PER_VMA_LOCK && MMU && SMP
+ help
+ Allow per-vma locking during page fault handling.
+
+ This feature allows locking each virtual memory area separately when
+ handling page faults instead of taking mmap_lock.
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index c3547a373c9c..4965a7333a3f 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -279,3 +279,9 @@ config DEBUG_KMEMLEAK_AUTO_SCAN
If unsure, say Y.
+config PER_VMA_LOCK_STATS
+ bool "Statistics for per-vma locks"
+ depends on PER_VMA_LOCK
+ default y
+ help
+ Statistics for per-vma locks.
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e29..3a08f5d7b178 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -103,6 +103,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
+obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
diff --git a/mm/cma.c b/mm/cma.c
index a7263aa02c92..6268d6620254 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -33,6 +33,7 @@
#include <linux/kmemleak.h>
#include <trace/events/cma.h>
+#include "internal.h"
#include "cma.h"
struct cma cma_areas[MAX_CMA_AREAS];
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
index eb2f39caff59..56347d15b7e8 100644
--- a/mm/cma_sysfs.c
+++ b/mm/cma_sysfs.c
@@ -64,7 +64,7 @@ static struct attribute *cma_attrs[] = {
};
ATTRIBUTE_GROUPS(cma);
-static struct kobj_type cma_ktype = {
+static const struct kobj_type cma_ktype = {
.release = cma_kobj_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = cma_groups,
diff --git a/mm/compaction.c b/mm/compaction.c
index 5a9501e0ae01..e689d66cedf4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -583,9 +583,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (PageCompound(page)) {
const unsigned int order = compound_order(page);
- if (likely(order < MAX_ORDER)) {
+ if (likely(order <= MAX_ORDER)) {
blockpfn += (1UL << order) - 1;
cursor += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
}
goto isolate_fail;
}
@@ -893,6 +894,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
if (PageHuge(page) && cc->alloc_contig) {
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+
ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
/*
@@ -904,6 +910,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (ret == -EBUSY)
ret = 0;
low_pfn += compound_nr(page) - 1;
+ nr_scanned += compound_nr(page) - 1;
goto isolate_fail;
}
@@ -938,8 +945,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* a valid page order. Consider only values in the
* valid order range to prevent low_pfn overflow.
*/
- if (freepage_order > 0 && freepage_order < MAX_ORDER)
+ if (freepage_order > 0 && freepage_order <= MAX_ORDER) {
low_pfn += (1UL << freepage_order) - 1;
+ nr_scanned += (1UL << freepage_order) - 1;
+ }
continue;
}
@@ -954,8 +963,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (PageCompound(page) && !cc->alloc_contig) {
const unsigned int order = compound_order(page);
- if (likely(order < MAX_ORDER))
+ if (likely(order <= MAX_ORDER)) {
low_pfn += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
+ }
goto isolate_fail;
}
@@ -1077,6 +1088,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
low_pfn += compound_nr(page) - 1;
+ nr_scanned += compound_nr(page) - 1;
SetPageLRU(page);
goto isolate_fail_put;
}
@@ -2124,7 +2136,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
/* Direct compactor: Is a suitable page free? */
ret = COMPACT_NO_SUITABLE_PAGE;
- for (order = cc->order; order < MAX_ORDER; order++) {
+ for (order = cc->order; order <= MAX_ORDER; order++) {
struct free_area *area = &cc->zone->free_area[order];
bool can_steal;
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 3cdad5a7f936..50cf89dcd898 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -384,7 +384,7 @@ static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter);
-static struct kobj_type damon_sysfs_scheme_filter_ktype = {
+static const struct kobj_type damon_sysfs_scheme_filter_ktype = {
.release = damon_sysfs_scheme_filter_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_scheme_filter_groups,
@@ -503,7 +503,7 @@ static struct attribute *damon_sysfs_scheme_filters_attrs[] = {
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_filters);
-static struct kobj_type damon_sysfs_scheme_filters_ktype = {
+static const struct kobj_type damon_sysfs_scheme_filters_ktype = {
.release = damon_sysfs_scheme_filters_release,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = damon_sysfs_scheme_filters_groups,
diff --git a/mm/debug.c b/mm/debug.c
index 96d594e16292..c7b228097bd9 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -36,6 +36,11 @@ const struct trace_print_flags pageflag_names[] = {
{0, NULL}
};
+const struct trace_print_flags pagetype_names[] = {
+ __def_pagetype_names,
+ {0, NULL}
+};
+
const struct trace_print_flags gfpflag_names[] = {
__def_gfpflag_names,
{0, NULL}
@@ -115,6 +120,8 @@ static void __dump_page(struct page *page)
pr_warn("%sflags: %pGp%s\n", type, &head->flags,
page_cma ? " CMA" : "");
+ pr_warn("page_type: %pGt\n", &head->page_type);
+
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index af59cc7bd307..c54177aabebd 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -934,7 +934,7 @@ static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
- WARN_ON(!pte_huge(pte_mkhuge(pte)));
+ WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS)));
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
#else /* !CONFIG_HUGETLB_PAGE */
@@ -1048,7 +1048,7 @@ static void __init destroy_args(struct pgtable_debug_args *args)
if (args->pte_pfn != ULONG_MAX) {
page = pfn_to_page(args->pte_pfn);
- __free_pages(page, 0);
+ __free_page(page);
args->pte_pfn = ULONG_MAX;
}
@@ -1086,7 +1086,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
struct page *page = NULL;
#ifdef CONFIG_CONTIG_ALLOC
- if (order >= MAX_ORDER) {
+ if (order > MAX_ORDER) {
page = alloc_contig_pages((1 << order), GFP_KERNEL,
first_online_node, NULL);
if (page) {
@@ -1096,7 +1096,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
}
#endif
- if (order < MAX_ORDER)
+ if (order <= MAX_ORDER)
page = alloc_pages(GFP_KERNEL, order);
return page;
@@ -1290,7 +1290,7 @@ static int __init init_args(struct pgtable_debug_args *args)
}
}
- page = alloc_pages(GFP_KERNEL, 0);
+ page = alloc_page(GFP_KERNEL);
if (page)
args->pte_pfn = page_to_pfn(page);
diff --git a/mm/dmapool_test.c b/mm/dmapool_test.c
new file mode 100644
index 000000000000..370fb9e209ef
--- /dev/null
+++ b/mm/dmapool_test.c
@@ -0,0 +1,147 @@
+#include <linux/device.h>
+#include <linux/dma-map-ops.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+
+#define NR_TESTS (100)
+
+struct dma_pool_pair {
+ dma_addr_t dma;
+ void *v;
+};
+
+struct dmapool_parms {
+ size_t size;
+ size_t align;
+ size_t boundary;
+};
+
+static const struct dmapool_parms pool_parms[] = {
+ { .size = 16, .align = 16, .boundary = 0 },
+ { .size = 64, .align = 64, .boundary = 0 },
+ { .size = 256, .align = 256, .boundary = 0 },
+ { .size = 1024, .align = 1024, .boundary = 0 },
+ { .size = 4096, .align = 4096, .boundary = 0 },
+ { .size = 68, .align = 32, .boundary = 4096 },
+};
+
+static struct dma_pool *pool;
+static struct device test_dev;
+static u64 dma_mask;
+
+static inline int nr_blocks(int size)
+{
+ return clamp_t(int, (PAGE_SIZE / size) * 512, 1024, 8192);
+}
+
+static int dmapool_test_alloc(struct dma_pool_pair *p, int blocks)
+{
+ int i;
+
+ for (i = 0; i < blocks; i++) {
+ p[i].v = dma_pool_alloc(pool, GFP_KERNEL,
+ &p[i].dma);
+ if (!p[i].v)
+ goto pool_fail;
+ }
+
+ for (i = 0; i < blocks; i++)
+ dma_pool_free(pool, p[i].v, p[i].dma);
+
+ return 0;
+
+pool_fail:
+ for (--i; i >= 0; i--)
+ dma_pool_free(pool, p[i].v, p[i].dma);
+ return -ENOMEM;
+}
+
+static int dmapool_test_block(const struct dmapool_parms *parms)
+{
+ int blocks = nr_blocks(parms->size);
+ ktime_t start_time, end_time;
+ struct dma_pool_pair *p;
+ int i, ret;
+
+ p = kcalloc(blocks, sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ pool = dma_pool_create("test pool", &test_dev, parms->size,
+ parms->align, parms->boundary);
+ if (!pool) {
+ ret = -ENOMEM;
+ goto free_pairs;
+ }
+
+ start_time = ktime_get();
+ for (i = 0; i < NR_TESTS; i++) {
+ ret = dmapool_test_alloc(p, blocks);
+ if (ret)
+ goto free_pool;
+ if (need_resched())
+ cond_resched();
+ }
+ end_time = ktime_get();
+
+ printk("dmapool test: size:%-4zu align:%-4zu blocks:%-4d time:%llu\n",
+ parms->size, parms->align, blocks,
+ ktime_us_delta(end_time, start_time));
+
+free_pool:
+ dma_pool_destroy(pool);
+free_pairs:
+ kfree(p);
+ return ret;
+}
+
+static void dmapool_test_release(struct device *dev)
+{
+}
+
+static int dmapool_checks(void)
+{
+ int i, ret;
+
+ ret = dev_set_name(&test_dev, "dmapool-test");
+ if (ret)
+ return ret;
+
+ ret = device_register(&test_dev);
+ if (ret) {
+ printk("%s: register failed:%d\n", __func__, ret);
+ goto put_device;
+ }
+
+ test_dev.release = dmapool_test_release;
+ set_dma_ops(&test_dev, NULL);
+ test_dev.dma_mask = &dma_mask;
+ ret = dma_set_mask_and_coherent(&test_dev, DMA_BIT_MASK(64));
+ if (ret) {
+ printk("%s: mask failed:%d\n", __func__, ret);
+ goto del_device;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(pool_parms); i++) {
+ ret = dmapool_test_block(&pool_parms[i]);
+ if (ret)
+ break;
+ }
+
+del_device:
+ device_del(&test_dev);
+put_device:
+ put_device(&test_dev);
+ return ret;
+}
+
+static void dmapool_exit(void)
+{
+}
+
+module_init(dmapool_checks);
+module_exit(dmapool_exit);
+MODULE_LICENSE("GPL");
diff --git a/mm/filemap.c b/mm/filemap.c
index 2723104cc06a..a34abfe8c654 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1836,7 +1836,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
*/
/*
- * mapping_get_entry - Get a page cache entry.
+ * filemap_get_entry - Get a page cache entry.
* @mapping: the address_space to search
* @index: The page cache index.
*
@@ -1847,7 +1847,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
*
* Return: The folio, swap or shadow entry, %NULL if nothing is found.
*/
-static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
+void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
XA_STATE(xas, &mapping->i_pages, index);
struct folio *folio;
@@ -1891,8 +1891,6 @@ out:
*
* * %FGP_ACCESSED - The folio will be marked accessed.
* * %FGP_LOCK - The folio is returned locked.
- * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
- * instead of allocating a new folio to replace it.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
@@ -1909,7 +1907,7 @@ out:
*
* If there is a page cache page, it is returned with an increased refcount.
*
- * Return: The found folio or %NULL otherwise.
+ * Return: The found folio or an ERR_PTR() otherwise.
*/
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
int fgp_flags, gfp_t gfp)
@@ -1917,12 +1915,9 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
struct folio *folio;
repeat:
- folio = mapping_get_entry(mapping, index);
- if (xa_is_value(folio)) {
- if (fgp_flags & FGP_ENTRY)
- return folio;
+ folio = filemap_get_entry(mapping, index);
+ if (xa_is_value(folio))
folio = NULL;
- }
if (!folio)
goto no_page;
@@ -1930,7 +1925,7 @@ repeat:
if (fgp_flags & FGP_NOWAIT) {
if (!folio_trylock(folio)) {
folio_put(folio);
- return NULL;
+ return ERR_PTR(-EAGAIN);
}
} else {
folio_lock(folio);
@@ -1969,7 +1964,7 @@ no_page:
folio = filemap_alloc_folio(gfp, 0);
if (!folio)
- return NULL;
+ return ERR_PTR(-ENOMEM);
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
@@ -1994,6 +1989,8 @@ no_page:
folio_unlock(folio);
}
+ if (!folio)
+ return ERR_PTR(-ENOENT);
return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);
@@ -3263,7 +3260,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
* Do we have something in the page cache already?
*/
folio = filemap_get_folio(mapping, index);
- if (likely(folio)) {
+ if (likely(!IS_ERR(folio))) {
/*
* We found the page, so try async readahead before waiting for
* the lock.
@@ -3292,7 +3289,7 @@ retry_find:
folio = __filemap_get_folio(mapping, index,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
- if (!folio) {
+ if (IS_ERR(folio)) {
if (fpin)
goto out_retry;
filemap_invalidate_unlock_shared(mapping);
@@ -3643,7 +3640,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping,
filler = mapping->a_ops->read_folio;
repeat:
folio = filemap_get_folio(mapping, index);
- if (!folio) {
+ if (IS_ERR(folio)) {
folio = filemap_alloc_folio(gfp, 0);
if (!folio)
return ERR_PTR(-ENOMEM);
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index cabcd1de9ecb..2511c055a35f 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -97,8 +97,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
struct folio *folio;
folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);
- if (!folio || xa_is_value(folio))
- return &folio->page;
+ if (IS_ERR(folio))
+ return NULL;
return folio_file_page(folio, index);
}
EXPORT_SYMBOL(pagecache_get_page);
diff --git a/mm/gup.c b/mm/gup.c
index eab18ba045db..1f72a717232b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2193,7 +2193,7 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
* This does not guarantee that the page exists in the user mappings when
* get_user_pages_remote returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
+ * and subsequently re-faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3fae2d2496ab..03d78901a7a7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -88,7 +88,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
/*
* If the hardware/firmware marked hugepage support disabled.
*/
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
return false;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
@@ -460,18 +460,14 @@ static int __init hugepage_init(void)
struct kobject *hugepage_kobj;
if (!has_transparent_hugepage()) {
- /*
- * Hardware doesn't support hugepages, hence disable
- * DAX PMD support.
- */
- transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
+ transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
return -EINVAL;
}
/*
* hugepages can't be allocated by the buddy allocator
*/
- MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER);
/*
* we use page->mapping and page->index in second tail page
* as list_head: assuming THP order >= 2
@@ -656,19 +652,20 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
struct page *page, gfp_t gfp)
{
struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio = page_folio(page);
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret = 0;
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) {
- put_page(page);
+ if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+ folio_put(folio);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
- cgroup_throttle_swaprate(page, gfp);
+ folio_throttle_swaprate(folio, gfp);
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
@@ -678,11 +675,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* clear_huge_page writes become visible before the set_pmd_at()
* write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd))) {
@@ -697,7 +694,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
/* Deliver the page fault to userland */
if (userfaultfd_missing(vma)) {
spin_unlock(vmf->ptl);
- put_page(page);
+ folio_put(folio);
pte_free(vma->vm_mm, pgtable);
ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
@@ -706,8 +703,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- page_add_new_anon_rmap(page, vma, haddr);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ folio_add_new_anon_rmap(folio, vma, haddr);
+ folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
@@ -724,7 +721,7 @@ unlock_release:
release:
if (pgtable)
pte_free(vma->vm_mm, pgtable);
- put_page(page);
+ folio_put(folio);
return ret;
}
@@ -888,23 +885,20 @@ out_unlock:
}
/**
- * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
+ * vmf_insert_pfn_pmd - insert a pmd size pfn
* @vmf: Structure describing the fault
* @pfn: pfn to insert
- * @pgprot: page protection to use
* @write: whether it's a write fault
*
- * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
- * also consult the vmf_insert_mixed_prot() documentation when
- * @pgprot != @vmf->vma->vm_page_prot.
+ * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
*
* Return: vm_fault_t value.
*/
-vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
- pgprot_t pgprot, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
unsigned long addr = vmf->address & PMD_MASK;
struct vm_area_struct *vma = vmf->vma;
+ pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
/*
@@ -932,7 +926,7 @@ vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
@@ -943,9 +937,10 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
}
static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
+ pud_t *pud, pfn_t pfn, bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ pgprot_t prot = vma->vm_page_prot;
pud_t entry;
spinlock_t *ptl;
@@ -979,23 +974,20 @@ out_unlock:
}
/**
- * vmf_insert_pfn_pud_prot - insert a pud size pfn
+ * vmf_insert_pfn_pud - insert a pud size pfn
* @vmf: Structure describing the fault
* @pfn: pfn to insert
- * @pgprot: page protection to use
* @write: whether it's a write fault
*
- * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
- * also consult the vmf_insert_mixed_prot() documentation when
- * @pgprot != @vmf->vma->vm_page_prot.
+ * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
*
* Return: vm_fault_t value.
*/
-vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
- pgprot_t pgprot, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
+ pgprot_t pgprot = vma->vm_page_prot;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -1013,10 +1005,10 @@ vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -3101,11 +3093,10 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
mapping = candidate->f_mapping;
for (index = off_start; index < off_end; index += nr_pages) {
- struct folio *folio = __filemap_get_folio(mapping, index,
- FGP_ENTRY, 0);
+ struct folio *folio = filemap_get_folio(mapping, index);
nr_pages = 1;
- if (xa_is_value(folio) || !folio)
+ if (IS_ERR(folio))
continue;
if (!folio_test_large(folio))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 245038a9fe4e..a58b3739ed4b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2090,7 +2090,7 @@ pgoff_t hugetlb_basepage_index(struct page *page)
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
- if (compound_order(page_head) >= MAX_ORDER)
+ if (compound_order(page_head) > MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
compound_idx = page - page_head;
@@ -4497,7 +4497,7 @@ static int __init default_hugepagesz_setup(char *s)
* The number of default huge pages (for this size) could have been
* specified as the first hugetlb parameter: hugepages=X. If so,
* then default_hstate_max_huge_pages is set. If the default huge
- * page size is gigantic (>= MAX_ORDER), then the pages must be
+ * page size is gigantic (> MAX_ORDER), then the pages must be
* allocated here from bootmem allocator.
*/
if (default_hstate_max_huge_pages) {
@@ -5790,7 +5790,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
*/
new_folio = false;
folio = filemap_lock_folio(mapping, idx);
- if (!folio) {
+ if (IS_ERR(folio)) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
@@ -6081,6 +6081,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma_end_reservation(h, vma, haddr);
pagecache_folio = filemap_lock_folio(mapping, idx);
+ if (IS_ERR(pagecache_folio))
+ pagecache_folio = NULL;
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -6164,19 +6166,19 @@ out_mutex:
#ifdef CONFIG_USERFAULTFD
/*
- * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
- * modifications for huge pages.
+ * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
+ * with modifications for hugetlb pages.
*/
-int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pte_t *dst_pte,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy)
-{
- bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
+int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ uffd_flags_t flags,
+ struct page **pagep)
+{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE);
+ bool wp_enabled = (flags & MFILL_ATOMIC_WP);
struct hstate *h = hstate_vma(dst_vma);
struct address_space *mapping = dst_vma->vm_file->f_mapping;
pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -6192,7 +6194,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (is_continue) {
ret = -EFAULT;
folio = filemap_lock_folio(mapping, idx);
- if (!folio)
+ if (IS_ERR(folio))
goto out;
folio_in_pagecache = true;
} else if (!*pagep) {
@@ -6311,7 +6313,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
* with wp flag set, don't set pte write bit.
*/
- if (wp_copy || (is_continue && !vm_shared))
+ if (wp_enabled || (is_continue && !vm_shared))
writable = 0;
else
writable = dst_vma->vm_flags & VM_WRITE;
@@ -6326,7 +6328,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
_dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);
- if (wp_copy)
+ if (wp_enabled)
_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index a559037cce00..1198064f80eb 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -400,7 +400,7 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
return 0;
out:
list_for_each_entry_safe(page, next, list, lru)
- __free_pages(page, 0);
+ __free_page(page);
return -ENOMEM;
}
@@ -590,17 +590,15 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
static int __init hugetlb_vmemmap_init(void)
{
+ const struct hstate *h;
+
/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);
- if (IS_ENABLED(CONFIG_PROC_SYSCTL)) {
- const struct hstate *h;
-
- for_each_hstate(h) {
- if (hugetlb_vmemmap_optimizable(h)) {
- register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
- break;
- }
+ for_each_hstate(h) {
+ if (hugetlb_vmemmap_optimizable(h)) {
+ register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
+ break;
}
}
return 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..33269314e060 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+ .mm_lock_seq = 0,
+#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
#ifdef CONFIG_IOMMU_SVA
diff --git a/mm/internal.h b/mm/internal.h
index 7920a8b7982e..73b167b59cc5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -105,7 +105,7 @@ void folio_activate(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *start_vma, unsigned long floor,
- unsigned long ceiling);
+ unsigned long ceiling, bool mm_wr_locked);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
struct zap_details;
@@ -201,6 +201,17 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
* in mm/page_alloc.c
*/
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+extern char * const zone_names[MAX_NR_ZONES];
+
+/* perform sanity checks on struct pages being allocated or freed */
+DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
+
+static inline bool is_check_pages_enabled(void)
+{
+ return static_branch_unlikely(&check_pages_enabled);
+}
/*
* Structure for holding the mostly immutable allocation parameters passed
@@ -366,7 +377,29 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);
+
+static inline void prep_compound_head(struct page *page, unsigned int order)
+{
+ struct folio *folio = (struct folio *)page;
+
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
+ set_compound_order(page, order);
+ atomic_set(&folio->_entire_mapcount, -1);
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_pincount, 0);
+}
+
+static inline void prep_compound_tail(struct page *head, int tail_idx)
+{
+ struct page *p = head + tail_idx;
+
+ p->mapping = TAIL_MAPPING;
+ set_compound_head(p, head);
+ set_page_private(p, 0);
+}
+
extern void prep_compound_page(struct page *page, unsigned int order);
+
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
@@ -377,6 +410,7 @@ extern void free_unref_page_list(struct list_head *list);
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
+extern void zone_pcp_init(struct zone *zone);
extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr,
@@ -474,7 +508,12 @@ isolate_migratepages_range(struct compact_control *cc,
int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end);
-#endif
+
+/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
+void init_cma_reserved_pageblock(struct page *page);
+
+#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+
int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal);
@@ -658,6 +697,12 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
#endif /* !CONFIG_MMU */
/* Memory initialisation debug and verification */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+DECLARE_STATIC_KEY_TRUE(deferred_pages);
+
+bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
enum mminit_level {
MMINIT_WARNING,
MMINIT_VERIFY,
@@ -802,6 +847,7 @@ static inline void flush_tlb_batched_pending(struct mm_struct *mm)
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags pagetype_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
@@ -833,9 +879,14 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
* mm/vmalloc.c
*/
#ifdef CONFIG_MMU
+void __init vmalloc_init(void);
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift);
#else
+static inline void vmalloc_init(void)
+{
+}
+
static inline
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index d1bcb0205327..f98b9f4d9d3e 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -205,7 +205,7 @@ void kasan_init_hw_tags_cpu(void)
* Enable async or asymm modes only when explicitly requested
* through the command line.
*/
- kasan_enable_tagging();
+ kasan_enable_hw_tags();
}
/* kasan_init_hw_tags() is called once on boot CPU. */
@@ -318,7 +318,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
* Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags
* the first virtual mapping, which is created by vmalloc().
* Tagging the page_alloc memory backing that vmalloc() allocation is
- * skipped, see ___GFP_SKIP_KASAN_UNPOISON.
+ * skipped, see ___GFP_SKIP_KASAN.
*
* For non-VM_ALLOC allocations, page_alloc memory is tagged as usual.
*/
@@ -373,19 +373,19 @@ void __kasan_poison_vmalloc(const void *start, unsigned long size)
#endif
-void kasan_enable_tagging(void)
+void kasan_enable_hw_tags(void)
{
if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
- hw_enable_tagging_async();
+ hw_enable_tag_checks_async();
else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
- hw_enable_tagging_asymm();
+ hw_enable_tag_checks_asymm();
else
- hw_enable_tagging_sync();
+ hw_enable_tag_checks_sync();
}
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-EXPORT_SYMBOL_GPL(kasan_enable_tagging);
+EXPORT_SYMBOL_GPL(kasan_enable_hw_tags);
void kasan_force_async_fault(void)
{
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index a61eeee3095a..f5e4f5f2ba20 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -395,46 +395,22 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#ifdef CONFIG_KASAN_HW_TAGS
-#ifndef arch_enable_tagging_sync
-#define arch_enable_tagging_sync()
-#endif
-#ifndef arch_enable_tagging_async
-#define arch_enable_tagging_async()
-#endif
-#ifndef arch_enable_tagging_asymm
-#define arch_enable_tagging_asymm()
-#endif
-#ifndef arch_force_async_tag_fault
-#define arch_force_async_tag_fault()
-#endif
-#ifndef arch_get_random_tag
-#define arch_get_random_tag() (0xFF)
-#endif
-#ifndef arch_get_mem_tag
-#define arch_get_mem_tag(addr) (0xFF)
-#endif
-#ifndef arch_set_mem_tag_range
-#define arch_set_mem_tag_range(addr, size, tag, init) ((void *)(addr))
-#endif
-
-#define hw_enable_tagging_sync() arch_enable_tagging_sync()
-#define hw_enable_tagging_async() arch_enable_tagging_async()
-#define hw_enable_tagging_asymm() arch_enable_tagging_asymm()
+#define hw_enable_tag_checks_sync() arch_enable_tag_checks_sync()
+#define hw_enable_tag_checks_async() arch_enable_tag_checks_async()
+#define hw_enable_tag_checks_asymm() arch_enable_tag_checks_asymm()
+#define hw_suppress_tag_checks_start() arch_suppress_tag_checks_start()
+#define hw_suppress_tag_checks_stop() arch_suppress_tag_checks_stop()
#define hw_force_async_tag_fault() arch_force_async_tag_fault()
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
#define hw_set_mem_tag_range(addr, size, tag, init) \
arch_set_mem_tag_range((addr), (size), (tag), (init))
-void kasan_enable_tagging(void);
+void kasan_enable_hw_tags(void);
#else /* CONFIG_KASAN_HW_TAGS */
-#define hw_enable_tagging_sync()
-#define hw_enable_tagging_async()
-#define hw_enable_tagging_asymm()
-
-static inline void kasan_enable_tagging(void) { }
+static inline void kasan_enable_hw_tags(void) { }
#endif /* CONFIG_KASAN_HW_TAGS */
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 627eaf1ee1db..a375776f9896 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -148,7 +148,7 @@ static void kasan_test_exit(struct kunit *test)
kasan_sync_fault_possible()) { \
if (READ_ONCE(test_status.report_found) && \
!READ_ONCE(test_status.async_fault)) \
- kasan_enable_tagging(); \
+ kasan_enable_hw_tags(); \
migrate_enable(); \
} \
WRITE_ONCE(test_status.report_found, false); \
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 89078f912827..892a9dc9d4d3 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -72,10 +72,18 @@ static int __init kasan_set_multi_shot(char *str)
__setup("kasan_multi_shot", kasan_set_multi_shot);
/*
- * Used to suppress reports within kasan_disable/enable_current() critical
- * sections, which are used for marking accesses to slab metadata.
+ * This function is used to check whether KASAN reports are suppressed for
+ * software KASAN modes via kasan_disable/enable_current() critical sections.
+ *
+ * This is done to avoid:
+ * 1. False-positive reports when accessing slab metadata,
+ * 2. Deadlocking when poisoned memory is accessed by the reporting code.
+ *
+ * Hardware Tag-Based KASAN instead relies on:
+ * For #1: Resetting tags via kasan_reset_tag().
+ * For #2: Suppression of tag checks via CPU, see report_suppress_start/stop().
*/
-static bool report_suppressed(void)
+static bool report_suppressed_sw(void)
{
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
if (current->kasan_depth)
@@ -84,6 +92,30 @@ static bool report_suppressed(void)
return false;
}
+static void report_suppress_start(void)
+{
+#ifdef CONFIG_KASAN_HW_TAGS
+ /*
+ * Disable preemption for the duration of printing a KASAN report, as
+ * hw_suppress_tag_checks_start() disables checks on the current CPU.
+ */
+ preempt_disable();
+ hw_suppress_tag_checks_start();
+#else
+ kasan_disable_current();
+#endif
+}
+
+static void report_suppress_stop(void)
+{
+#ifdef CONFIG_KASAN_HW_TAGS
+ hw_suppress_tag_checks_stop();
+ preempt_enable();
+#else
+ kasan_enable_current();
+#endif
+}
+
/*
* Used to avoid reporting more than one KASAN bug unless kasan_multi_shot
* is enabled. Note that KASAN tests effectively enable kasan_multi_shot
@@ -174,7 +206,7 @@ static void start_report(unsigned long *flags, bool sync)
/* Do not allow LOCKDEP mangling KASAN reports. */
lockdep_off();
/* Make sure we don't end up in loop. */
- kasan_disable_current();
+ report_suppress_start();
spin_lock_irqsave(&report_lock, *flags);
pr_err("==================================================================\n");
}
@@ -192,7 +224,7 @@ static void end_report(unsigned long *flags, void *addr)
panic("kasan.fault=panic set ...\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
lockdep_on();
- kasan_enable_current();
+ report_suppress_stop();
}
static void print_error_description(struct kasan_report_info *info)
@@ -480,9 +512,13 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty
struct kasan_report_info info;
/*
- * Do not check report_suppressed(), as an invalid-free cannot be
- * caused by accessing slab metadata and thus should not be
- * suppressed by kasan_disable/enable_current() critical sections.
+ * Do not check report_suppressed_sw(), as an invalid-free cannot be
+ * caused by accessing poisoned memory and thus should not be suppressed
+ * by kasan_disable/enable_current() critical sections.
+ *
+ * Note that for Hardware Tag-Based KASAN, kasan_report_invalid_free()
+ * is triggered by explicit tag checks and not by the ones performed by
+ * the CPU. Thus, reporting invalid-free is not suppressed as well.
*/
if (unlikely(!report_enabled()))
return;
@@ -517,7 +553,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write,
unsigned long irq_flags;
struct kasan_report_info info;
- if (unlikely(report_suppressed()) || unlikely(!report_enabled())) {
+ if (unlikely(report_suppressed_sw()) || unlikely(!report_enabled())) {
ret = false;
goto out;
}
@@ -549,8 +585,9 @@ void kasan_report_async(void)
unsigned long flags;
/*
- * Do not check report_suppressed(), as kasan_disable/enable_current()
- * critical sections do not affect Hardware Tag-Based KASAN.
+ * Do not check report_suppressed_sw(), as
+ * kasan_disable/enable_current() critical sections do not affect
+ * Hardware Tag-Based KASAN.
*/
if (unlikely(!report_enabled()))
return;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0ec69b96b497..2c6548cd18a9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -976,12 +976,19 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
GFP_TRANSHUGE);
int node = hpage_collapse_find_target_node(cc);
+ struct folio *folio;
if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
return SCAN_ALLOC_HUGE_PAGE_FAIL;
- if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
+
+ folio = page_folio(*hpage);
+ if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
+ folio_put(folio);
+ *hpage = NULL;
return SCAN_CGROUP_CHARGE_FAIL;
+ }
count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
+
return SCAN_SUCCEED;
}
@@ -1053,6 +1060,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
if (result != SCAN_SUCCEED)
goto out_up_write;
+ vma_start_write(vma);
anon_vma_lock_write(vma->anon_vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1132,10 +1140,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
out_up_write:
mmap_write_unlock(mm);
out_nolock:
- if (hpage) {
- mem_cgroup_uncharge(page_folio(hpage));
+ if (hpage)
put_page(hpage);
- }
trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
return result;
}
@@ -1176,7 +1182,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* enabled swap entries. Please see
* comment below for pte_uffd_wp().
*/
- if (pte_swp_uffd_wp(pteval)) {
+ if (pte_swp_uffd_wp_any(pteval)) {
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
@@ -1516,6 +1522,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto drop_hpage;
}
+ /* Lock the vma before taking i_mmap and page table locks */
+ vma_start_write(vma);
+
/*
* We need to lock the mapping so that from here on, only GUP-fast and
* hardware page walks can access the parts of the page tables that
@@ -1693,6 +1702,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
result = SCAN_PTE_MAPPED_HUGEPAGE;
if ((cc->is_khugepaged || is_target) &&
mmap_write_trylock(mm)) {
+ /* trylock for the same lock inversion as above */
+ if (!vma_try_start_write(vma))
+ goto unlock_next;
+
/*
* Re-check whether we have an ->anon_vma, because
* collapse_and_free_pmd() requires that either no
@@ -2134,10 +2147,8 @@ xa_unlocked:
unlock_page(hpage);
out:
VM_BUG_ON(!list_empty(&pagelist));
- if (hpage) {
- mem_cgroup_uncharge(page_folio(hpage));
+ if (hpage)
put_page(hpage);
- }
trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
return result;
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 7fb794242fad..ffedf4dbc49d 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void)
struct metadata_page_pair {
struct page *shadow, *origin;
};
-static struct metadata_page_pair held_back[MAX_ORDER] __initdata;
+static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata;
/*
* Eager metadata allocation. When the memblock allocator is freeing pages to
@@ -211,8 +211,8 @@ static void kmsan_memblock_discard(void)
* order=N-1,
* - repeat.
*/
- collect.order = MAX_ORDER - 1;
- for (int i = MAX_ORDER - 1; i >= 0; i--) {
+ collect.order = MAX_ORDER;
+ for (int i = MAX_ORDER; i >= 0; i--) {
if (held_back[i].shadow)
smallstack_push(&collect, held_back[i].shadow);
if (held_back[i].origin)
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 088e21a48dc4..d9eb141c27aa 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -408,6 +408,37 @@ static void test_printk(struct kunit *test)
}
/*
+ * Prevent the compiler from optimizing @var away. Without this, Clang may
+ * notice that @var is uninitialized and drop memcpy() calls that use it.
+ *
+ * There is OPTIMIZER_HIDE_VAR() in linux/compiler.h that we cannot use here,
+ * because it is implemented as inline assembly receiving @var as a parameter
+ * and will enforce a KMSAN check. Same is true for e.g. barrier_data(var).
+ */
+#define DO_NOT_OPTIMIZE(var) barrier()
+
+/*
+ * Test case: ensure that memcpy() correctly copies initialized values.
+ * Also serves as a regression test to ensure DO_NOT_OPTIMIZE() does not cause
+ * extra checks.
+ */
+static void test_init_memcpy(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ volatile int src;
+ volatile int dst = 0;
+
+ DO_NOT_OPTIMIZE(src);
+ src = 1;
+ kunit_info(
+ test,
+ "memcpy()ing aligned initialized src to aligned dst (no reports)\n");
+ memcpy((void *)&dst, (void *)&src, sizeof(src));
+ kmsan_check_memory((void *)&dst, sizeof(dst));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
* Test case: ensure that memcpy() correctly copies uninitialized values between
* aligned `src` and `dst`.
*/
@@ -420,7 +451,7 @@ static void test_memcpy_aligned_to_aligned(struct kunit *test)
kunit_info(
test,
"memcpy()ing aligned uninit src to aligned dst (UMR report)\n");
- OPTIMIZER_HIDE_VAR(uninit_src);
+ DO_NOT_OPTIMIZE(uninit_src);
memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src));
kmsan_check_memory((void *)&dst, sizeof(dst));
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
@@ -443,7 +474,7 @@ static void test_memcpy_aligned_to_unaligned(struct kunit *test)
kunit_info(
test,
"memcpy()ing aligned uninit src to unaligned dst (UMR report)\n");
- OPTIMIZER_HIDE_VAR(uninit_src);
+ DO_NOT_OPTIMIZE(uninit_src);
memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src));
kmsan_check_memory((void *)dst, 4);
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
@@ -467,13 +498,33 @@ static void test_memcpy_aligned_to_unaligned2(struct kunit *test)
kunit_info(
test,
"memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n");
- OPTIMIZER_HIDE_VAR(uninit_src);
+ DO_NOT_OPTIMIZE(uninit_src);
memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src));
kmsan_check_memory((void *)&dst[4], sizeof(uninit_src));
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
-static noinline void fibonacci(int *array, int size, int start) {
+/* Generate test cases for memset16(), memset32(), memset64(). */
+#define DEFINE_TEST_MEMSETXX(size) \
+ static void test_memset##size(struct kunit *test) \
+ { \
+ EXPECTATION_NO_REPORT(expect); \
+ volatile uint##size##_t uninit; \
+ \
+ kunit_info(test, \
+ "memset" #size "() should initialize memory\n"); \
+ DO_NOT_OPTIMIZE(uninit); \
+ memset##size((uint##size##_t *)&uninit, 0, 1); \
+ kmsan_check_memory((void *)&uninit, sizeof(uninit)); \
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect)); \
+ }
+
+DEFINE_TEST_MEMSETXX(16)
+DEFINE_TEST_MEMSETXX(32)
+DEFINE_TEST_MEMSETXX(64)
+
+static noinline void fibonacci(int *array, int size, int start)
+{
if (start < 2 || (start == size))
return;
array[start] = array[start - 1] + array[start - 2];
@@ -482,8 +533,7 @@ static noinline void fibonacci(int *array, int size, int start) {
static void test_long_origin_chain(struct kunit *test)
{
- EXPECTATION_UNINIT_VALUE_FN(expect,
- "test_long_origin_chain");
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_long_origin_chain");
/* (KMSAN_MAX_ORIGIN_DEPTH * 2) recursive calls to fibonacci(). */
volatile int accum[KMSAN_MAX_ORIGIN_DEPTH * 2 + 2];
int last = ARRAY_SIZE(accum) - 1;
@@ -501,6 +551,36 @@ static void test_long_origin_chain(struct kunit *test)
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
+/*
+ * Test case: ensure that saving/restoring/printing stacks to/from stackdepot
+ * does not trigger errors.
+ *
+ * KMSAN uses stackdepot to store origin stack traces, that's why we do not
+ * instrument lib/stackdepot.c. Yet it must properly mark its outputs as
+ * initialized because other kernel features (e.g. netdev tracker) may also
+ * access stackdepot from instrumented code.
+ */
+static void test_stackdepot_roundtrip(struct kunit *test)
+{
+ unsigned long src_entries[16], *dst_entries;
+ unsigned int src_nentries, dst_nentries;
+ EXPECTATION_NO_REPORT(expect);
+ depot_stack_handle_t handle;
+
+ kunit_info(test, "testing stackdepot roundtrip (no reports)\n");
+
+ src_nentries =
+ stack_trace_save(src_entries, ARRAY_SIZE(src_entries), 1);
+ handle = stack_depot_save(src_entries, src_nentries, GFP_KERNEL);
+ stack_depot_print(handle);
+ dst_nentries = stack_depot_fetch(handle, &dst_entries);
+ KUNIT_EXPECT_TRUE(test, src_nentries == dst_nentries);
+
+ kmsan_check_memory((void *)dst_entries,
+ sizeof(*dst_entries) * dst_nentries);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_uninit_kmalloc),
KUNIT_CASE(test_init_kmalloc),
@@ -515,10 +595,15 @@ static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_uaf),
KUNIT_CASE(test_percpu_propagate),
KUNIT_CASE(test_printk),
+ KUNIT_CASE(test_init_memcpy),
KUNIT_CASE(test_memcpy_aligned_to_aligned),
KUNIT_CASE(test_memcpy_aligned_to_unaligned),
KUNIT_CASE(test_memcpy_aligned_to_unaligned2),
+ KUNIT_CASE(test_memset16),
+ KUNIT_CASE(test_memset32),
+ KUNIT_CASE(test_memset64),
KUNIT_CASE(test_long_origin_chain),
+ KUNIT_CASE(test_stackdepot_roundtrip),
{},
};
diff --git a/mm/ksm.c b/mm/ksm.c
index 2b8d30068cbb..290a3eb6d8de 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -45,6 +45,9 @@
#include "internal.h"
#include "mm_slot.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/ksm.h>
+
#ifdef CONFIG_NUMA
#define NUMA(x) (x)
#define DO_NUMA(x) do { (x); } while (0)
@@ -633,10 +636,12 @@ static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
BUG_ON(stable_node->rmap_hlist_len < 0);
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
- if (rmap_item->hlist.next)
+ if (rmap_item->hlist.next) {
ksm_pages_sharing--;
- else
+ trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm);
+ } else {
ksm_pages_shared--;
+ }
rmap_item->mm->ksm_merging_pages--;
@@ -657,6 +662,7 @@ static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
+ trace_ksm_remove_ksm_page(stable_node->kpfn);
if (stable_node->head == &migrate_nodes)
list_del(&stable_node->list);
else
@@ -1324,6 +1330,8 @@ static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
get_anon_vma(vma->anon_vma);
out:
mmap_read_unlock(mm);
+ trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page),
+ rmap_item, mm, err);
return err;
}
@@ -2142,6 +2150,9 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
if (vma) {
err = try_to_merge_one_page(vma, page,
ZERO_PAGE(rmap_item->address));
+ trace_ksm_merge_one_page(
+ page_to_pfn(ZERO_PAGE(rmap_item->address)),
+ rmap_item, mm, err);
} else {
/*
* If the vma is out of date, we do not need to
@@ -2264,6 +2275,8 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
mm_slot = ksm_scan.mm_slot;
if (mm_slot == &ksm_mm_head) {
+ trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
+
/*
* A number of pages can hang around indefinitely on per-cpu
* pagevecs, raised page count preventing write_protect_page
@@ -2414,6 +2427,7 @@ no_vmas:
if (mm_slot != &ksm_mm_head)
goto next_mm;
+ trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
ksm_scan.seqnr++;
return NULL;
}
@@ -2565,6 +2579,7 @@ int __ksm_enter(struct mm_struct *mm)
if (needs_wakeup)
wake_up_interruptible(&ksm_thread_wait);
+ trace_ksm_enter(mm);
return 0;
}
@@ -2606,6 +2621,8 @@ void __ksm_exit(struct mm_struct *mm)
mmap_write_lock(mm);
mmap_write_unlock(mm);
}
+
+ trace_ksm_exit(mm);
}
struct page *ksm_might_need_to_copy(struct page *page,
diff --git a/mm/memblock.c b/mm/memblock.c
index 25fd0626a9e7..7911224b1ed3 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2043,7 +2043,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
int order;
while (start < end) {
- order = min(MAX_ORDER - 1UL, __ffs(start));
+ order = min_t(int, MAX_ORDER, __ffs(start));
while (start + (1UL << order) > end)
order--;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5abffe6f8389..0524add35cae 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1929,7 +1929,7 @@ static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
* Please note that mem_cgroup_out_of_memory might fail to find a
* victim and then we have to bail out from the charge path.
*/
- if (memcg->oom_kill_disable) {
+ if (READ_ONCE(memcg->oom_kill_disable)) {
if (current->in_user_fault) {
css_get(&memcg->css);
current->memcg_in_oom = memcg;
@@ -1999,7 +1999,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (locked)
mem_cgroup_oom_notify(memcg);
- if (locked && !memcg->oom_kill_disable) {
+ if (locked && !READ_ONCE(memcg->oom_kill_disable)) {
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
@@ -2067,7 +2067,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
* highest-level memory cgroup with oom.group set.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- if (memcg->oom_group)
+ if (READ_ONCE(memcg->oom_group))
oom_group = memcg;
if (memcg == oom_domain)
@@ -3728,7 +3728,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
case RES_FAILCNT:
return counter->failcnt;
case RES_SOFT_LIMIT:
- return (u64)memcg->soft_limit * PAGE_SIZE;
+ return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
default:
BUG();
}
@@ -3870,7 +3870,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
ret = -EOPNOTSUPP;
} else {
- memcg->soft_limit = nr_pages;
+ WRITE_ONCE(memcg->soft_limit, nr_pages);
ret = 0;
}
break;
@@ -4179,9 +4179,9 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
return -EINVAL;
if (!mem_cgroup_is_root(memcg))
- memcg->swappiness = val;
+ WRITE_ONCE(memcg->swappiness, val);
else
- vm_swappiness = val;
+ WRITE_ONCE(vm_swappiness, val);
return 0;
}
@@ -4515,7 +4515,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
- seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+ seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
seq_printf(sf, "oom_kill %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
@@ -4531,7 +4531,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
return -EINVAL;
- memcg->oom_kill_disable = val;
+ WRITE_ONCE(memcg->oom_kill_disable, val);
if (!val)
memcg_oom_recover(memcg);
@@ -5347,14 +5347,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_CAST(memcg);
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
memcg->zswap_max = PAGE_COUNTER_MAX;
#endif
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
- memcg->swappiness = mem_cgroup_swappiness(parent);
- memcg->oom_kill_disable = parent->oom_kill_disable;
+ WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
+ WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
@@ -5502,7 +5502,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
memcg_wb_domain_size_changed(memcg);
}
@@ -5705,7 +5705,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
/* shmem/tmpfs may report page out on swap: account for that too. */
index = linear_page_index(vma, addr);
folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
- if (!folio)
+ if (IS_ERR(folio))
return NULL;
return folio_file_page(folio, index);
}
@@ -6623,7 +6623,7 @@ static int memory_oom_group_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- seq_printf(m, "%d\n", memcg->oom_group);
+ seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));
return 0;
}
@@ -6645,7 +6645,7 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
if (oom_group != 0 && oom_group != 1)
return -EINVAL;
- memcg->oom_group = oom_group;
+ WRITE_ONCE(memcg->oom_group, oom_group);
return nbytes;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fae9baf3be16..f761704d27d7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -168,7 +168,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
return true;
}
-#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
+#if IS_ENABLED(CONFIG_HWPOISON_INJECT)
u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
diff --git a/mm/memory.c b/mm/memory.c
index 01a23ad48a04..387226d6094d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -104,6 +104,20 @@ EXPORT_SYMBOL(mem_map);
#endif
static vm_fault_t do_fault(struct vm_fault *vmf);
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
+static bool vmf_pte_changed(struct vm_fault *vmf);
+
+/*
+ * Return true if vmf->orig_pte is valid (FAULT_FLAG_ORIG_PTE_VALID) and is a
+ * uffd-wp pte marker (so the pte was wr-protected). Otherwise return false.
+ */
+static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
+{
+ if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+ return false;
+
+ return pte_marker_uffd_wp(vmf->orig_pte);
+}
/*
* A number of key systems in x86 including ioremap() rely on the assumption
@@ -348,7 +362,7 @@ void free_pgd_range(struct mmu_gather *tlb,
void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *vma, unsigned long floor,
- unsigned long ceiling)
+ unsigned long ceiling, bool mm_wr_locked)
{
MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
@@ -366,6 +380,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
+ if (mm_wr_locked)
+ vma_start_write(vma);
unlink_anon_vmas(vma);
unlink_file_vma(vma);
@@ -380,6 +396,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = mas_find(&mas, ceiling - 1);
+ if (mm_wr_locked)
+ vma_start_write(vma);
unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
@@ -970,7 +988,7 @@ static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
folio_put(new_folio);
return NULL;
}
- cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL);
+ folio_throttle_swaprate(new_folio, GFP_KERNEL);
return new_folio;
}
@@ -1290,6 +1308,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
continue;
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
addr, next))) {
+ untrack_pfn_clear(dst_vma);
ret = -ENOMEM;
break;
}
@@ -1345,6 +1364,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte,
struct zap_details *details, pte_t pteval)
{
+ /* Zap on anonymous always means dropping everything */
+ if (vma_is_anonymous(vma))
+ return;
+
if (zap_drop_file_uffd_wp(details))
return;
@@ -1451,8 +1474,12 @@ again:
continue;
rss[mm_counter(page)]--;
} else if (pte_marker_entry_uffd_wp(entry)) {
- /* Only drop the uffd-wp marker if explicitly requested */
- if (!zap_drop_file_uffd_wp(details))
+ /*
+ * For anon: always drop the marker; for file: only
+ * drop the marker if explicitly requested.
+ */
+ if (!vma_is_anonymous(vma) &&
+ !zap_drop_file_uffd_wp(details))
continue;
} else if (is_hwpoison_entry(entry) ||
is_swapin_error_entry(entry)) {
@@ -2142,8 +2169,20 @@ out_unlock:
* vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
*
- * See vmf_insert_mixed_prot() for a discussion of the implication of using
- * a value of @pgprot different from that of @vma->vm_page_prot.
+ * pgprot typically only differs from @vma->vm_page_prot when drivers set
+ * caching- and encryption bits different than those of @vma->vm_page_prot,
+ * because the caching- or encryption mode may not be known at mmap() time.
+ *
+ * This is ok as long as @vma->vm_page_prot is not used by the core vm
+ * to set caching and encryption bits for those vmas (except for COW pages).
+ * This is ensured by core vm only modifying these page table entries using
+ * functions that don't touch caching- or encryption bits, using pte_modify()
+ * if needed. (See for example mprotect()).
+ *
+ * Also when new page-table entries are created, this is only done using the
+ * fault() callback, and never using the value of vma->vm_page_prot,
+ * except for page-table entries that point to anonymous pages as the result
+ * of COW.
*
* Context: Process context. May allocate using %GFP_KERNEL.
* Return: vm_fault_t value.
@@ -2218,9 +2257,9 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
}
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn, pgprot_t pgprot,
- bool mkwrite)
+ unsigned long addr, pfn_t pfn, bool mkwrite)
{
+ pgprot_t pgprot = vma->vm_page_prot;
int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
@@ -2263,43 +2302,10 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
-/**
- * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- * @pgprot: pgprot flags for the inserted page
- *
- * This is exactly like vmf_insert_mixed(), except that it allows drivers
- * to override pgprot on a per-page basis.
- *
- * Typically this function should be used by drivers to set caching- and
- * encryption bits different than those of @vma->vm_page_prot, because
- * the caching- or encryption mode may not be known at mmap() time.
- * This is ok as long as @vma->vm_page_prot is not used by the core vm
- * to set caching and encryption bits for those vmas (except for COW pages).
- * This is ensured by core vm only modifying these page table entries using
- * functions that don't touch caching- or encryption bits, using pte_modify()
- * if needed. (See for example mprotect()).
- * Also when new page-table entries are created, this is only done using the
- * fault() callback, and never using the value of vma->vm_page_prot,
- * except for page-table entries that point to anonymous pages as the result
- * of COW.
- *
- * Context: Process context. May allocate using %GFP_KERNEL.
- * Return: vm_fault_t value.
- */
-vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t pgprot)
-{
- return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
-}
-EXPORT_SYMBOL(vmf_insert_mixed_prot);
-
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
- return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
+ return __vm_insert_mixed(vma, addr, pfn, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);
@@ -2311,7 +2317,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
- return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
+ return __vm_insert_mixed(vma, addr, pfn, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
@@ -3091,7 +3097,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
goto oom_free_new;
- cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL);
+ folio_throttle_swaprate(new_folio, GFP_KERNEL);
__folio_mark_uptodate(new_folio);
@@ -3633,6 +3639,14 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
return 0;
}
+static vm_fault_t do_pte_missing(struct vm_fault *vmf)
+{
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
+ else
+ return do_fault(vmf);
+}
+
/*
* This is actually a page-missing access, but with uffd-wp special pte
* installed. It means this pte was wr-protected before being unmapped.
@@ -3643,11 +3657,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
* Just in case there're leftover special ptes even after the region
* got unregistered - we can simply clear them.
*/
- if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
+ if (unlikely(!userfaultfd_wp(vmf->vma)))
return pte_marker_clear(vmf);
- /* do_fault() can handle pte markers too like none pte */
- return do_fault(vmf);
+ return do_pte_missing(vmf);
}
static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
@@ -3698,6 +3711,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!pte_unmap_same(vmf))
goto out;
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
+
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
@@ -3852,7 +3870,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
lru_add_drain();
}
- cgroup_throttle_swaprate(page, GFP_KERNEL);
+ folio_throttle_swaprate(folio, GFP_KERNEL);
/*
* Back out if somebody else already faulted in this pte.
@@ -4012,6 +4030,7 @@ out_release:
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
+ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
struct vm_area_struct *vma = vmf->vma;
struct folio *folio;
vm_fault_t ret = 0;
@@ -4045,7 +4064,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (!pte_none(*vmf->pte)) {
+ if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
}
@@ -4069,7 +4088,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
- cgroup_throttle_swaprate(&folio->page, GFP_KERNEL);
+ folio_throttle_swaprate(folio, GFP_KERNEL);
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
@@ -4085,7 +4104,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (!pte_none(*vmf->pte)) {
+ if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
}
@@ -4105,6 +4124,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
folio_add_new_anon_rmap(folio, vma, vmf->address);
folio_add_lru_vma(folio, vma);
setpte:
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
@@ -4272,7 +4293,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
- bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
+ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool prefault = vmf->address != addr;
pte_t entry;
@@ -4386,13 +4407,13 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return ret;
}
-static unsigned long fault_around_bytes __read_mostly =
- rounddown_pow_of_two(65536);
+static unsigned long fault_around_pages __read_mostly =
+ 65536 >> PAGE_SHIFT;
#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
{
- *val = fault_around_bytes;
+ *val = fault_around_pages << PAGE_SHIFT;
return 0;
}
@@ -4404,10 +4425,13 @@ static int fault_around_bytes_set(void *data, u64 val)
{
if (val / PAGE_SIZE > PTRS_PER_PTE)
return -EINVAL;
- if (val > PAGE_SIZE)
- fault_around_bytes = rounddown_pow_of_two(val);
- else
- fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
+
+ /*
+ * rounddown_pow_of_two(0) is undefined, so clamp to one page first.
+ * One page results in no fault-around; see should_fault_around().
+ */
+ fault_around_pages = rounddown_pow_of_two(max_t(u64, val, PAGE_SIZE)) >> PAGE_SHIFT;
+
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
@@ -4430,41 +4454,34 @@ late_initcall(fault_around_debugfs);
* It uses vm_ops->map_pages() to map the pages, which skips the page if it's
* not ready to be mapped: not up-to-date, locked, etc.
*
- * This function doesn't cross the VMA boundaries, in order to call map_pages()
- * only once.
+ * This function doesn't cross VMA or page table boundaries, in order to call
+ * map_pages() and acquire a PTE lock only once.
*
- * fault_around_bytes defines how many bytes we'll try to map.
+ * fault_around_pages defines how many pages we'll try to map.
* do_fault_around() expects it to be set to a power of two less than or equal
* to PTRS_PER_PTE.
*
* The virtual address of the area that we map is naturally aligned to
- * fault_around_bytes rounded down to the machine page size
+ * fault_around_pages * PAGE_SIZE rounded down to the machine page size
* (and therefore to page order). This way it's easier to guarantee
* that we don't cross page table boundaries.
*/
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
- unsigned long address = vmf->address, nr_pages, mask;
- pgoff_t start_pgoff = vmf->pgoff;
- pgoff_t end_pgoff;
- int off;
-
- nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
- mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
+ pgoff_t nr_pages = READ_ONCE(fault_around_pages);
+ pgoff_t pte_off = pte_index(vmf->address);
+ /* The page offset of vmf->address within the VMA. */
+ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
+ pgoff_t from_pte, to_pte;
+ vm_fault_t ret;
- address = max(address & mask, vmf->vma->vm_start);
- off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
- start_pgoff -= off;
+ /* The PTE offset of the start address, clamped to the VMA. */
+ from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
+ pte_off - min(pte_off, vma_off));
- /*
- * end_pgoff is either the end of the page table, the end of
- * the vma or nr_pages from start_pgoff, depending what is nearest.
- */
- end_pgoff = start_pgoff -
- ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
- PTRS_PER_PTE - 1;
- end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
- start_pgoff + nr_pages - 1);
+ /* The PTE offset of the end address, clamped to the VMA and PTE. */
+ to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
+ pte_off + vma_pages(vmf->vma) - vma_off) - 1;
if (pmd_none(*vmf->pmd)) {
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
@@ -4472,7 +4489,13 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
return VM_FAULT_OOM;
}
- return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
+ rcu_read_lock();
+ ret = vmf->vma->vm_ops->map_pages(vmf,
+ vmf->pgoff + from_pte - pte_off,
+ vmf->pgoff + to_pte - pte_off);
+ rcu_read_unlock();
+
+ return ret;
}
/* Return true if we should do read fault-around, false otherwise */
@@ -4485,7 +4508,8 @@ static inline bool should_fault_around(struct vm_fault *vmf)
if (uffd_disable_fault_around(vmf->vma))
return false;
- return fault_around_bytes >> PAGE_SHIFT > 1;
+ /* A single page implies no faulting 'around' at all. */
+ return fault_around_pages > 1;
}
static vm_fault_t do_read_fault(struct vm_fault *vmf)
@@ -4531,7 +4555,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
- cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
+ folio_throttle_swaprate(page_folio(vmf->cow_page), GFP_KERNEL);
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -4651,6 +4675,9 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
{
get_page(page);
+ /* Record the current PID accessing VMA */
+ vma_set_access_pid_bit(vma);
+
count_vm_numa_event(NUMA_HINT_FAULTS);
if (page_nid == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
@@ -4916,12 +4943,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
}
}
- if (!vmf->pte) {
- if (vma_is_anonymous(vmf->vma))
- return do_anonymous_page(vmf);
- else
- return do_fault(vmf);
- }
+ if (!vmf->pte)
+ return do_pte_missing(vmf);
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
@@ -4957,7 +4980,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
* with threads.
*/
if (vmf->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+ vmf->pte);
}
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -5230,6 +5254,67 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Lookup and read-lock a VMA under RCU protection. The returned VMA is stable
+ * and not isolated. NULL is returned if the VMA is not found, is being
+ * modified, or is not handled here (not anonymous, no anon_vma, uffd-armed).
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ MA_STATE(mas, &mm->mm_mt, address, address);
+ struct vm_area_struct *vma;
+
+ rcu_read_lock();
+retry:
+ vma = mas_walk(&mas);
+ if (!vma)
+ goto inval;
+
+ /* Only anonymous vmas are supported for now */
+ if (!vma_is_anonymous(vma))
+ goto inval;
+
+ /* find_mergeable_anon_vma uses adjacent vmas which are not locked */
+ if (!vma->anon_vma)
+ goto inval;
+
+ if (!vma_start_read(vma))
+ goto inval;
+
+ /*
+ * Due to the possibility of userfault handler dropping mmap_lock, avoid
+ * it for now and fall back to page fault handling under mmap_lock.
+ */
+ if (userfaultfd_armed(vma)) {
+ vma_end_read(vma);
+ goto inval;
+ }
+
+ /* Check since vm_start/vm_end might change before we lock the VMA */
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+ vma_end_read(vma);
+ goto inval;
+ }
+
+ /* Check if the VMA got isolated after we found it */
+ if (vma->detached) {
+ vma_end_read(vma);
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one */
+ goto retry;
+ }
+
+ rcu_read_unlock();
+ return vma;
+inval:
+ rcu_read_unlock();
+ count_vm_vma_lock_event(VMA_LOCK_ABORT);
+ return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index db3b270254f1..c8f0a8c2d049 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -596,7 +596,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
unsigned long pfn;
/*
- * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
+ * Online the pages in MAX_ORDER aligned chunks. The callback might
* decide to not expose all pages to the buddy (e.g., expose them
* later). We account all pages as being online and belonging to this
* zone ("present").
@@ -605,7 +605,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
* this and the first chunk to online will be pageblock_nr_pages.
*/
for (pfn = start_pfn; pfn < end_pfn;) {
- int order = min(MAX_ORDER - 1UL, __ffs(pfn));
+ int order = min_t(int, MAX_ORDER, __ffs(pfn));
(*online_page_callback)(pfn_to_page(pfn), order);
pfn += (1UL << order);
diff --git a/mm/memtest.c b/mm/memtest.c
index f53ace709ccd..57149dfee438 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -4,6 +4,9 @@
#include <linux/init.h>
#include <linux/memblock.h>
+bool early_memtest_done;
+phys_addr_t early_memtest_bad_size;
+
static u64 patterns[] __initdata = {
/* The first entry has to be 0 to leave memtest with zeroed memory */
0,
@@ -30,6 +33,7 @@ static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr
pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
cpu_to_be64(pattern), &start_bad, &end_bad);
memblock_reserve(start_bad, end_bad - start_bad);
+ early_memtest_bad_size += (end_bad - start_bad);
}
static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
@@ -61,6 +65,8 @@ static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size
}
if (start_bad)
reserve_bad_mem(pattern, start_bad, last_bad + incr);
+
+ early_memtest_done = true;
}
static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
diff --git a/mm/migrate.c b/mm/migrate.c
index db3f154446af..afe21c48dc6e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -249,7 +249,6 @@ static bool remove_migration_pte(struct folio *folio,
if (folio_test_hugetlb(folio)) {
unsigned int shift = huge_page_shift(hstate_vma(vma));
- pte = pte_mkhuge(pte);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
if (folio_test_anon(folio))
hugepage_add_anon_rmap(new, vma, pvmw.address,
diff --git a/mm/mincore.c b/mm/mincore.c
index d359650b0f75..2d5be013a25a 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -61,7 +61,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
folio = filemap_get_incore_folio(mapping, index);
- if (folio) {
+ if (!IS_ERR(folio)) {
present = folio_test_uptodate(folio);
folio_put(folio);
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c1883362e71d..dd3a6ed9663f 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -14,7 +14,23 @@
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/mman.h>
+#include <linux/memblock.h>
+#include <linux/page-isolation.h>
+#include <linux/padata.h>
+#include <linux/nmi.h>
+#include <linux/buffer_head.h>
+#include <linux/kmemleak.h>
+#include <linux/kfence.h>
+#include <linux/page_ext.h>
+#include <linux/pti.h>
+#include <linux/pgtable.h>
+#include <linux/swap.h>
+#include <linux/cma.h>
#include "internal.h"
+#include "slab.h"
+#include "shuffle.h"
+
+#include <asm/setup.h>
#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;
@@ -198,3 +214,2535 @@ static int __init mm_sysfs_init(void)
return 0;
}
postcore_initcall(mm_sysfs_init);
+
+/* Lowest/highest possible PFN of each zone, indexed by zone type. */
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
+/* Per-node start PFN of ZONE_MOVABLE; 0 means the node has no ZONE_MOVABLE. */
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
+
+/*
+ * kernelcore= / movablecore= requests as stored by cmdline_parse_core():
+ * the plain values are page counts, the *_percent values are percentages.
+ */
+static unsigned long required_kernelcore __initdata;
+static unsigned long required_kernelcore_percent __initdata;
+static unsigned long required_movablecore __initdata;
+static unsigned long required_movablecore_percent __initdata;
+
+static unsigned long nr_kernel_pages __initdata;
+static unsigned long nr_all_pages __initdata;
+static unsigned long dma_reserve __initdata;
+
+/* Set by memmap_init_range() once defer_init() asks to defer a range. */
+static bool deferred_struct_pages __meminitdata;
+
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+
+/*
+ * Parse the value of a kernelcore=/movablecore= style option.
+ *
+ * A value followed by '%' is stored in @percent as-is. Anything else is
+ * parsed by memparse() as a byte count and stored in @core as a number of
+ * pages, with @percent cleared.
+ *
+ * Returns 0 on success, -EINVAL if @p is NULL.
+ */
+static int __init cmdline_parse_core(char *p, unsigned long *core,
+				     unsigned long *percent)
+{
+	unsigned long long coremem;
+	char *tail;
+
+	if (!p)
+		return -EINVAL;
+
+	/* First try the "nn%" form: a percentage of total memory */
+	coremem = simple_strtoull(p, &tail, 0);
+	if (*tail == '%') {
+		/* Paranoid check for percent values greater than 100 */
+		WARN_ON(coremem > 100);
+
+		*percent = coremem;
+		return 0;
+	}
+
+	/* Otherwise the value is a size in bytes, with optional suffix */
+	coremem = memparse(p, &p);
+	/* Paranoid check that UL is enough for the coremem value */
+	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+
+	*core = coremem >> PAGE_SHIFT;
+	*percent = 0UL;
+	return 0;
+}
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ *
+ * "kernelcore=mirror" instead sets mirrored_kernelcore. Any other value is
+ * handed to cmdline_parse_core().
+ *
+ * Returns 0 on success, -EINVAL if @p is NULL.
+ */
+static int __init cmdline_parse_kernelcore(char *p)
+{
+	/*
+	 * cmdline_parse_core() rejects a NULL value, so a NULL @p is a case
+	 * the parsers expect. Reject it here as well, before the string is
+	 * handed to parse_option_str().
+	 */
+	if (!p)
+		return -EINVAL;
+
+	/* parse kernelcore=mirror */
+	if (parse_option_str(p, "mirror")) {
+		mirrored_kernelcore = true;
+		return 0;
+	}
+
+	return cmdline_parse_core(p, &required_kernelcore,
+				  &required_kernelcore_percent);
+}
+early_param("kernelcore", cmdline_parse_kernelcore);
+
+/*
+ * movablecore=size sets the amount of memory for use for allocations that
+ * can be reclaimed or migrated.
+ *
+ * The value is parsed by cmdline_parse_core() into required_movablecore
+ * (pages) or required_movablecore_percent; its return value is passed on.
+ */
+static int __init cmdline_parse_movablecore(char *p)
+{
+ return cmdline_parse_core(p, &required_movablecore,
+ &required_movablecore_percent);
+}
+early_param("movablecore", cmdline_parse_movablecore);
+
+/*
+ * early_calculate_totalpages()
+ * Add up the pages of every memory PFN range, across all nodes, for sizing
+ * the movable zone. Each node that owns a non-empty range is flagged in
+ * N_MEMORY so that the caller can count usable nodes.
+ */
+static unsigned long __init early_calculate_totalpages(void)
+{
+	unsigned long start_pfn, end_pfn;
+	unsigned long sum = 0;
+	int i, nid;
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		unsigned long nr = end_pfn - start_pfn;
+
+		/* An empty range neither adds pages nor marks the node */
+		if (!nr)
+			continue;
+
+		sum += nr;
+		node_set_state(nid, N_MEMORY);
+	}
+
+	return sum;
+}
+
+/*
+ * Pick the zone whose range ZONE_MOVABLE pages are taken from and record it
+ * in movable_zone. Zones within a node are assumed to be ordered by
+ * monotonically increasing address, so the search runs from the highest
+ * zone index downwards and stops at the first zone, other than ZONE_MOVABLE
+ * itself, whose highest possible pfn exceeds its lowest possible pfn.
+ */
+static void __init find_usable_zone_for_movable(void)
+{
+	int zone_index = MAX_NR_ZONES - 1;
+
+	while (zone_index >= 0) {
+		if (zone_index != ZONE_MOVABLE &&
+		    arch_zone_highest_possible_pfn[zone_index] >
+		    arch_zone_lowest_possible_pfn[zone_index])
+			break;
+
+		zone_index--;
+	}
+
+	VM_BUG_ON(zone_index == -1);
+	movable_zone = zone_index;
+}
+
+/*
+ * Find the PFN the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory. When they don't, some nodes will have more kernelcore than
+ * others.
+ *
+ * The result is left in zone_movable_pfn[]. Three sources are considered in
+ * order: hotpluggable memblock regions when movable_node is enabled,
+ * non-mirrored regions when kernelcore=mirror was given, and otherwise the
+ * kernelcore=/movablecore= sizes. node_states[N_MEMORY] is borrowed while
+ * computing and restored before returning.
+ */
+static void __init find_zone_movable_pfns_for_nodes(void)
+{
+	int i, nid;
+	unsigned long usable_startpfn;
+	unsigned long kernelcore_node, kernelcore_remaining;
+	/* save the state before borrow the nodemask */
+	nodemask_t saved_node_state = node_states[N_MEMORY];
+	unsigned long totalpages = early_calculate_totalpages();
+	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+	struct memblock_region *r;
+
+	/* Need to find movable_zone earlier when movable_node is specified. */
+	find_usable_zone_for_movable();
+
+	/*
+	 * If movable_node is specified, ignore kernelcore and movablecore
+	 * options.
+	 */
+	if (movable_node_is_enabled()) {
+		for_each_mem_region(r) {
+			if (!memblock_is_hotpluggable(r))
+				continue;
+
+			nid = memblock_get_region_node(r);
+
+			/* Keep the lowest hotpluggable pfn seen for the node */
+			usable_startpfn = PFN_DOWN(r->base);
+			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+				min(usable_startpfn, zone_movable_pfn[nid]) :
+				usable_startpfn;
+		}
+
+		goto out2;
+	}
+
+	/*
+	 * If kernelcore=mirror is specified, ignore movablecore option
+	 */
+	if (mirrored_kernelcore) {
+		bool mem_below_4gb_not_mirrored = false;
+
+		for_each_mem_region(r) {
+			if (memblock_is_mirror(r))
+				continue;
+
+			nid = memblock_get_region_node(r);
+
+			usable_startpfn = memblock_region_memory_base_pfn(r);
+
+			/* Unmirrored memory below 4G stays kernel memory */
+			if (usable_startpfn < PHYS_PFN(SZ_4G)) {
+				mem_below_4gb_not_mirrored = true;
+				continue;
+			}
+
+			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+				min(usable_startpfn, zone_movable_pfn[nid]) :
+				usable_startpfn;
+		}
+
+		if (mem_below_4gb_not_mirrored)
+			pr_warn("This configuration results in unmirrored kernel memory.\n");
+
+		goto out2;
+	}
+
+	/*
+	 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
+	 * amount of necessary memory.
+	 *
+	 * totalpages * percent / 100 yields the same quotient as the former
+	 * totalpages * 100 * percent / 10000, but the intermediate product is
+	 * a hundred times smaller and so much less likely to wrap an
+	 * unsigned long on 32-bit.
+	 */
+	if (required_kernelcore_percent)
+		required_kernelcore = (totalpages * required_kernelcore_percent) /
+				       100UL;
+	if (required_movablecore_percent)
+		required_movablecore = (totalpages * required_movablecore_percent) /
+					100UL;
+
+	/*
+	 * If movablecore= was specified, calculate what size of
+	 * kernelcore that corresponds so that memory usable for
+	 * any allocation type is evenly spread. If both kernelcore
+	 * and movablecore are specified, then the value of kernelcore
+	 * will be used for required_kernelcore if it's greater than
+	 * what movablecore would have allowed.
+	 */
+	if (required_movablecore) {
+		unsigned long corepages;
+
+		/*
+		 * Round-up so that ZONE_MOVABLE is at least as large as what
+		 * was requested by the user
+		 */
+		required_movablecore =
+			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+		required_movablecore = min(totalpages, required_movablecore);
+		corepages = totalpages - required_movablecore;
+
+		required_kernelcore = max(required_kernelcore, corepages);
+	}
+
+	/*
+	 * If kernelcore was not specified or kernelcore size is larger
+	 * than totalpages, there is no ZONE_MOVABLE.
+	 */
+	if (!required_kernelcore || required_kernelcore >= totalpages)
+		goto out;
+
+	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
+
+restart:
+	/* Spread kernelcore memory as evenly as possible throughout nodes */
+	kernelcore_node = required_kernelcore / usable_nodes;
+	for_each_node_state(nid, N_MEMORY) {
+		unsigned long start_pfn, end_pfn;
+
+		/*
+		 * Recalculate kernelcore_node if the division per node
+		 * now exceeds what is necessary to satisfy the requested
+		 * amount of memory for the kernel
+		 */
+		if (required_kernelcore < kernelcore_node)
+			kernelcore_node = required_kernelcore / usable_nodes;
+
+		/*
+		 * As the map is walked, we track how much memory is usable
+		 * by the kernel using kernelcore_remaining. When it is
+		 * 0, the rest of the node is usable by ZONE_MOVABLE
+		 */
+		kernelcore_remaining = kernelcore_node;
+
+		/* Go through each range of PFNs within this node */
+		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+			unsigned long size_pages;
+
+			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
+			if (start_pfn >= end_pfn)
+				continue;
+
+			/* Account for what is only usable for kernelcore */
+			if (start_pfn < usable_startpfn) {
+				unsigned long kernel_pages;
+				kernel_pages = min(end_pfn, usable_startpfn)
+								- start_pfn;
+
+				kernelcore_remaining -= min(kernel_pages,
+							kernelcore_remaining);
+				required_kernelcore -= min(kernel_pages,
+							required_kernelcore);
+
+				/* Continue if range is now fully accounted */
+				if (end_pfn <= usable_startpfn) {
+
+					/*
+					 * Push zone_movable_pfn to the end so
+					 * that if we have to rebalance
+					 * kernelcore across nodes, we will
+					 * not double account here
+					 */
+					zone_movable_pfn[nid] = end_pfn;
+					continue;
+				}
+				start_pfn = usable_startpfn;
+			}
+
+			/*
+			 * The usable PFN range for ZONE_MOVABLE is from
+			 * start_pfn->end_pfn. Calculate size_pages as the
+			 * number of pages used as kernelcore
+			 */
+			size_pages = end_pfn - start_pfn;
+			if (size_pages > kernelcore_remaining)
+				size_pages = kernelcore_remaining;
+			zone_movable_pfn[nid] = start_pfn + size_pages;
+
+			/*
+			 * Some kernelcore has been met, update counts and
+			 * break if the kernelcore for this node has been
+			 * satisfied
+			 */
+			required_kernelcore -= min(required_kernelcore,
+								size_pages);
+			kernelcore_remaining -= size_pages;
+			if (!kernelcore_remaining)
+				break;
+		}
+	}
+
+	/*
+	 * If there is still required_kernelcore, we do another pass with one
+	 * less node in the count. This will push zone_movable_pfn[nid] further
+	 * along on the nodes that still have memory until kernelcore is
+	 * satisfied
+	 */
+	usable_nodes--;
+	if (usable_nodes && required_kernelcore > usable_nodes)
+		goto restart;
+
+out2:
+	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
+	for (nid = 0; nid < MAX_NUMNODES; nid++) {
+		unsigned long start_pfn, end_pfn;
+
+		zone_movable_pfn[nid] =
+			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+
+		/* A start at or beyond the node end means no ZONE_MOVABLE */
+		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+		if (zone_movable_pfn[nid] >= end_pfn)
+			zone_movable_pfn[nid] = 0;
+	}
+
+out:
+	/* restore the node_state */
+	node_states[N_MEMORY] = saved_node_state;
+}
+
+/*
+ * Bring one struct page into its initial state: zero it, record its
+ * zone/node/pfn links, then run the count, mapcount, cpupid and KASAN tag
+ * init/reset helpers and initialise the lru list head. With
+ * WANT_PAGE_VIRTUAL, non-highmem pages also get their virtual address set.
+ */
+static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+ unsigned long zone, int nid)
+{
+ /* Zeroing comes first; everything below writes into the cleared page */
+ mm_zero_struct_page(page);
+ set_page_links(page, zone, nid, pfn);
+ init_page_count(page);
+ page_mapcount_reset(page);
+ page_cpupid_reset_last(page);
+ page_kasan_tag_reset(page);
+
+ INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+ /* The shift won't overflow because ZONE_NORMAL is below 4G. */
+ if (!is_highmem_idx(zone))
+ set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * During memory init memblocks map pfns to nids. The search is expensive and
+ * this caches recent lookups. The implementation of __early_pfn_to_nid
+ * treats start/end as pfns.
+ *
+ * The cached range is half-open: a pfn hits when
+ * last_start <= pfn < last_end, and the answer is then last_nid.
+ */
+struct mminit_pfnnid_cache {
+ unsigned long last_start;
+ unsigned long last_end;
+ int last_nid;
+};
+
+/* The single cache instance used by early_pfn_to_nid() */
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ *
+ * @state caches the most recent successful lookup; a pfn inside the cached
+ * range is answered without searching memblock. A failed lookup returns
+ * NUMA_NO_NODE and leaves the cache as it was.
+ */
+static int __meminit __early_pfn_to_nid(unsigned long pfn,
+					struct mminit_pfnnid_cache *state)
+{
+	unsigned long range_start, range_end;
+	int node;
+
+	/* Cache hit: pfn lies in the range found by the last lookup */
+	if (pfn >= state->last_start && pfn < state->last_end)
+		return state->last_nid;
+
+	node = memblock_search_pfn_nid(pfn, &range_start, &range_end);
+	if (node == NUMA_NO_NODE)
+		return node;
+
+	/* Remember the range for subsequent lookups */
+	state->last_start = range_start;
+	state->last_end = range_end;
+	state->last_nid = node;
+
+	return node;
+}
+
+/*
+ * Locked front end of __early_pfn_to_nid(): look @pfn up through the shared
+ * early_pfnnid_cache while holding a local spinlock, and substitute
+ * first_online_node when the lookup yields a negative node id.
+ */
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+	static DEFINE_SPINLOCK(early_pfn_lock);
+	int node;
+
+	spin_lock(&early_pfn_lock);
+	node = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
+	node = (node < 0) ? first_online_node : node;
+	spin_unlock(&early_pfn_lock);
+
+	return node;
+}
+
+int hashdist = HASHDIST_DEFAULT;
+
+/*
+ * "hashdist=" handler: store the numeric value in hashdist.
+ * Returns 1 when a value was consumed, 0 when @str is NULL.
+ */
+static int __init set_hashdist(char *str)
+{
+	if (str) {
+		hashdist = simple_strtoul(str, &str, 0);
+		return 1;
+	}
+
+	return 0;
+}
+__setup("hashdist=", set_hashdist);
+
+/* Clear hashdist when exactly one node is in the N_MEMORY state. */
+static inline void fixup_hashdist(void)
+{
+	if (num_node_state(N_MEMORY) != 1)
+		return;
+
+	hashdist = 0;
+}
+#else
+/* Without CONFIG_NUMA there is nothing to fix up. */
+static inline void fixup_hashdist(void) {}
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/*
+ * Reset the node's first_deferred_pfn to ULONG_MAX, the value defer_init()
+ * treats as "no deferred range chosen yet".
+ */
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
+{
+ pgdat->first_deferred_pfn = ULONG_MAX;
+}
+
+/*
+ * Returns true if the struct page for the pfn is initialised, i.e. unless
+ * the pfn's node is online and the pfn is at or beyond that node's
+ * first_deferred_pfn.
+ */
+static inline bool __meminit early_page_initialised(unsigned long pfn)
+{
+	int node = early_pfn_to_nid(pfn);
+
+	/* NODE_DATA() is only consulted for an online node */
+	return !(node_online(node) &&
+		 pfn >= NODE_DATA(node)->first_deferred_pfn);
+}
+
+/*
+ * Returns true when the remaining initialisation should be deferred until
+ * later in the boot cycle when it can be parallelised.
+ *
+ * @nid: node being initialised
+ * @pfn: pfn about to be initialised
+ * @end_pfn: end pfn of the zone being initialised
+ *
+ * The first true result for a node also records @pfn in the node's
+ * first_deferred_pfn.
+ */
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+{
+ static unsigned long prev_end_pfn, nr_initialised;
+
+ /* Never defer when early page_ext is enabled */
+ if (early_page_ext_enabled())
+ return false;
+ /*
+ * prev_end_pfn static that contains the end of previous zone
+ * No need to protect because called very early in boot before smp_init.
+ */
+ if (prev_end_pfn != end_pfn) {
+ prev_end_pfn = end_pfn;
+ nr_initialised = 0;
+ }
+
+ /* Always populate low zones for address-constrained allocations */
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
+ return false;
+
+ /* A deferral point was already chosen for this node */
+ if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
+ return true;
+ /*
+ * We start only with one section of pages, more pages are added as
+ * needed until the rest of deferred pages are initialized.
+ */
+ nr_initialised++;
+ /* Defer from the first section-aligned pfn past one section's worth */
+ if ((nr_initialised > PAGES_PER_SECTION) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Initialise the struct page of a reserved pfn if that has not happened
+ * yet. The zone index handed to __init_single_page() is that of the first
+ * zone on the pfn's node that spans the pfn.
+ */
+static void __meminit init_reserved_page(unsigned long pfn)
+{
+	pg_data_t *pgdat;
+	int nid, zid;
+
+	if (early_page_initialised(pfn))
+		return;
+
+	nid = early_pfn_to_nid(pfn);
+	pgdat = NODE_DATA(nid);
+
+	/* Stop at the first zone containing pfn; zid keeps its index */
+	for (zid = 0; zid < MAX_NR_ZONES; zid++)
+		if (zone_spans_pfn(&pgdat->node_zones[zid], pfn))
+			break;
+
+	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
+}
+#else
+/*
+ * Stubs for !CONFIG_DEFERRED_STRUCT_PAGE_INIT: nothing is ever deferred, so
+ * every page counts as initialised and the helpers do nothing.
+ */
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
+
+static inline bool early_page_initialised(unsigned long pfn)
+{
+ return true;
+}
+
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+{
+ return false;
+}
+
+static inline void init_reserved_page(unsigned long pfn)
+{
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+/*
+ * Initialised pages do not have PageReserved set. This function is
+ * called for each range allocated by the bootmem allocator and
+ * marks the pages PageReserved. The remaining valid pages are later
+ * sent to the buddy page allocator.
+ *
+ * The range is widened to whole pages: @start is rounded down and @end is
+ * rounded up to a page frame boundary. Invalid pfns are skipped.
+ */
+void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
+{
+	unsigned long last_pfn = PFN_UP(end);
+	unsigned long pfn;
+
+	for (pfn = PFN_DOWN(start); pfn < last_pfn; pfn++) {
+		struct page *page;
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		init_reserved_page(pfn);
+
+		/* Avoid false-positive PageTail() */
+		INIT_LIST_HEAD(&page->lru);
+
+		/*
+		 * The struct page is not visible to anyone else yet, so
+		 * the non-atomic flag setter is sufficient.
+		 */
+		__SetPageReserved(page);
+	}
+}
+
+/*
+ * If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init.
+ *
+ * Only acts when mirrored_kernelcore is set and @zone is ZONE_MOVABLE. If
+ * *pfn falls inside a mirrored memblock region, *pfn is advanced to that
+ * region's end pfn and true is returned so the caller skips the region.
+ * Otherwise *pfn is left alone and false is returned.
+ */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+ /* Region found by the previous call, kept to avoid rescanning */
+ static struct memblock_region *r;
+
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ /* Rescan only when *pfn has moved past the cached region */
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_mem_region(r) {
+ if (*pfn < memblock_region_memory_end_pfn(r))
+ break;
+ }
+ }
+ /*
+ * NOTE(review): if the scan above finishes without a break, r is
+ * whatever for_each_mem_region() leaves behind - confirm that it is
+ * still safe to dereference here.
+ */
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ *pfn = memblock_region_memory_end_pfn(r);
+ return true;
+ }
+ }
+ return false;
+}
+
+/*
+ * Only struct pages that correspond to ranges defined by memblock.memory
+ * are zeroed and initialized by going through __init_single_page() during
+ * memmap_init_zone_range().
+ *
+ * But, there could be struct pages that correspond to holes in
+ * memblock.memory. This can happen because of the following reasons:
+ * - physical memory bank size is not necessarily the exact multiple of the
+ *   arbitrary section size
+ * - early reserved memory may not be listed in memblock.memory
+ * - memory layouts defined with memmap= kernel parameter may not align
+ *   nicely with memmap sections
+ *
+ * Explicitly initialize those struct pages so that:
+ * - PG_Reserved is set
+ * - zone and node links point to zone and node that span the page if the
+ *   hole is in the middle of a zone
+ * - zone and node links point to adjacent zone/node if the hole falls on
+ *   the zone boundary; the pages in such holes will be prepended to the
+ *   zone/node above the hole except for the trailing pages in the last
+ *   section that will be appended to the zone/node below.
+ *
+ * @spfn/@epfn delimit the half-open pfn range [spfn, epfn) to cover;
+ * @zone and @node are the links recorded in every initialised page.
+ */
+static void __init init_unavailable_range(unsigned long spfn,
+					  unsigned long epfn,
+					  int zone, int node)
+{
+	unsigned long pfn;
+	u64 pgcnt = 0;
+
+	for (pfn = spfn; pfn < epfn; pfn++) {
+		/* Skip a whole pageblock when its first pfn is not valid */
+		if (!pfn_valid(pageblock_start_pfn(pfn))) {
+			pfn = pageblock_end_pfn(pfn) - 1;
+			continue;
+		}
+		__init_single_page(pfn_to_page(pfn), pfn, zone, node);
+		__SetPageReserved(pfn_to_page(pfn));
+		pgcnt++;
+	}
+
+	/*
+	 * pgcnt is an unsigned 64-bit counter, hence %llu. The message is
+	 * terminated with a newline so the next log record starts cleanly.
+	 */
+	if (pgcnt)
+		pr_info("On node %d, zone %s: %llu pages in unavailable ranges\n",
+			node, zone_names[zone], pgcnt);
+}
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by memblock_free_all() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
+ *
+ * @size: number of pages starting at @start_pfn
+ * @nid, @zone: node and zone index recorded in each page
+ * @zone_end_pfn: end pfn of the zone, passed on to defer_init()
+ * @context: MEMINIT_EARLY enables the overlap and deferral checks;
+ * MEMINIT_HOTPLUG marks every page reserved
+ * @altmap: only consulted for ZONE_DEVICE
+ * @migratetype: migratetype given to each pageblock-aligned pfn
+ */
+void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
+ unsigned long start_pfn, unsigned long zone_end_pfn,
+ enum meminit_context context,
+ struct vmem_altmap *altmap, int migratetype)
+{
+ unsigned long pfn, end_pfn = start_pfn + size;
+ struct page *page;
+
+ /* Track the highest pfn that has a memmap entry */
+ if (highest_memmap_pfn < end_pfn - 1)
+ highest_memmap_pfn = end_pfn - 1;
+
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Honor reservation requested by the driver for this ZONE_DEVICE
+ * memory. We limit the total number of pages to initialize to just
+ * those that might contain the memory mapping. We will defer the
+ * ZONE_DEVICE page initialization until after we have released
+ * the hotplug lock.
+ */
+ if (zone == ZONE_DEVICE) {
+ if (!altmap)
+ return;
+
+ if (start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ }
+#endif
+
+ /* No increment clause: overlap_memmap_init() may move pfn itself */
+ for (pfn = start_pfn; pfn < end_pfn; ) {
+ /*
+ * There can be holes in boot-time mem_map[]s handed to this
+ * function. They do not exist on hotplugged memory.
+ */
+ if (context == MEMINIT_EARLY) {
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
+ if (defer_init(nid, pfn, zone_end_pfn)) {
+ deferred_struct_pages = true;
+ break;
+ }
+ }
+
+ page = pfn_to_page(pfn);
+ __init_single_page(page, pfn, zone, nid);
+ if (context == MEMINIT_HOTPLUG)
+ __SetPageReserved(page);
+
+ /*
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
+ * such that unmovable allocations won't be scattered all
+ * over the place during system boot.
+ */
+ if (pageblock_aligned(pfn)) {
+ set_pageblock_migratetype(page, migratetype);
+ cond_resched();
+ }
+ pfn++;
+ }
+}
+
+/*
+ * Initialise the part of [start_pfn, end_pfn) that lies inside @zone, then
+ * cover any gap between *hole_pfn and the start of that part with
+ * init_unavailable_range(). On return *hole_pfn is the end of the
+ * initialised part. Nothing happens if the range misses the zone.
+ */
+static void __init memmap_init_zone_range(struct zone *zone,
+					  unsigned long start_pfn,
+					  unsigned long end_pfn,
+					  unsigned long *hole_pfn)
+{
+	unsigned long zone_first = zone->zone_start_pfn;
+	unsigned long zone_limit = zone_first + zone->spanned_pages;
+	int node = zone_to_nid(zone);
+	int zid = zone_idx(zone);
+
+	/* Restrict the requested range to what the zone spans */
+	start_pfn = clamp(start_pfn, zone_first, zone_limit);
+	end_pfn = clamp(end_pfn, zone_first, zone_limit);
+	if (end_pfn <= start_pfn)
+		return;
+
+	memmap_init_range(end_pfn - start_pfn, node, zid, start_pfn,
+			  zone_limit, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+
+	/* Pages between the previous range and this one form a hole */
+	if (start_pfn > *hole_pfn)
+		init_unavailable_range(*hole_pfn, start_pfn, zid, node);
+
+	*hole_pfn = end_pfn;
+}
+
+/*
+ * Walk every memory pfn range and initialise the memmap of each populated
+ * zone of the owning node via memmap_init_zone_range(), which also covers
+ * the holes between ranges. Afterwards the hole behind the last range is
+ * handled.
+ */
+static void __init memmap_init(void)
+{
+ unsigned long start_pfn, end_pfn;
+ unsigned long hole_pfn = 0;
+ int i, j, zone_id = 0, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ struct pglist_data *node = NODE_DATA(nid);
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = node->node_zones + j;
+
+ if (!populated_zone(zone))
+ continue;
+
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
+ &hole_pfn);
+ /* Remember the last populated zone for the trailing hole */
+ zone_id = j;
+ }
+ }
+
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * Initialize the memory map for hole in the range [memory_end,
+ * section_end].
+ * Append the pages in this hole to the highest zone in the last
+ * node.
+ * The call to init_unavailable_range() is outside the ifdef to
+ * silence the compiler warning about zone_id set but not used;
+ * for FLATMEM it is a nop anyway
+ */
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
+ if (hole_pfn < end_pfn)
+#endif
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+/*
+ * Initialise one ZONE_DEVICE struct page: run the common
+ * __init_single_page() setup, mark the page reserved, attach @pgmap, set the
+ * migratetype on pageblock-aligned pfns and, for device-private and
+ * device-coherent pagemaps, drop the page count to 0.
+ */
+static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
+ unsigned long zone_idx, int nid,
+ struct dev_pagemap *pgmap)
+{
+
+ __init_single_page(page, pfn, zone_idx, nid);
+
+ /*
+ * Mark page reserved as it will need to wait for onlining
+ * phase for it to be fully associated with a zone.
+ *
+ * We can use the non-atomic __set_bit operation for setting
+ * the flag as we are still initializing the pages.
+ */
+ __SetPageReserved(page);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
+ * ever freed or placed on a driver-private list.
+ */
+ page->pgmap = pgmap;
+ page->zone_device_data = NULL;
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made.
+ *
+ * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
+ * because this is done early in section_activate()
+ */
+ if (pageblock_aligned(pfn)) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
+ }
+
+ /*
+ * ZONE_DEVICE pages are released directly to the driver page allocator
+ * which will set the page count to 1 when allocating the page.
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_COHERENT)
+ set_page_count(page, 0);
+}
+
+/*
+ * With compound page geometry and when struct pages are stored in ram most
+ * tail pages are reused. Consequently, the amount of unique struct pages to
+ * initialize is a lot smaller than the total amount of struct pages being
+ * mapped. This is a paired / mild layering violation with explicit knowledge
+ * of how the sparse_vmemmap internals handle compound pages in the lack
+ * of an altmap. See vmemmap_populate_compound_pages().
+ *
+ * Returns @nr_pages unchanged when an altmap is in use or sizeof(struct
+ * page) is not a power of two; otherwise two pages' worth of struct pages.
+ */
+static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
+					      unsigned long nr_pages)
+{
+	if (!is_power_of_2(sizeof(struct page)) || altmap)
+		return nr_pages;
+
+	return 2 * (PAGE_SIZE / sizeof(struct page));
+}
+
+/*
+ * Turn @head into the head of a compound ZONE_DEVICE page and initialise
+ * the tail pages for pfns head_pfn + 1 up to, but not including,
+ * head_pfn + nr_pages. The compound order comes from pgmap->vmemmap_shift.
+ */
+static void __ref memmap_init_compound(struct page *head,
+ unsigned long head_pfn,
+ unsigned long zone_idx, int nid,
+ struct dev_pagemap *pgmap,
+ unsigned long nr_pages)
+{
+ unsigned long pfn, end_pfn = head_pfn + nr_pages;
+ unsigned int order = pgmap->vmemmap_shift;
+
+ __SetPageHead(head);
+ for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ /* Full per-page init first, then link the page as a tail */
+ __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+ prep_compound_tail(head, pfn - head_pfn);
+ set_page_count(page, 0);
+
+ /*
+ * The first tail page stores important compound page info.
+ * Call prep_compound_head() after the first tail page has
+ * been initialized, to not have the data overwritten.
+ */
+ if (pfn == head_pfn + 1)
+ prep_compound_head(head, order);
+ }
+}
+
+/*
+ * Initialise the struct pages of a ZONE_DEVICE range.
+ *
+ * @zone: must be the ZONE_DEVICE zone
+ * @start_pfn: first pfn of the range
+ * @nr_pages: number of pages in the range
+ * @pgmap: device pagemap the pages belong to; must not be NULL
+ *
+ * Warns once and does nothing if @pgmap is NULL or @zone is not
+ * ZONE_DEVICE. When the pagemap has an altmap, initialisation starts after
+ * the pages the altmap sets aside. Pages are initialised one compound unit
+ * at a time, as given by pgmap_vmemmap_nr().
+ */
+void __ref memmap_init_zone_device(struct zone *zone,
+				   unsigned long start_pfn,
+				   unsigned long nr_pages,
+				   struct dev_pagemap *pgmap)
+{
+	unsigned long pfn, end_pfn = start_pfn + nr_pages;
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	struct vmem_altmap *altmap;
+	unsigned int pfns_per_compound;
+	unsigned long zone_idx = zone_idx(zone);
+	unsigned long start = jiffies;
+	int nid = pgdat->node_id;
+
+	if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
+		return;
+
+	/*
+	 * Query the pagemap only after the NULL check above, so the helpers
+	 * are never handed a NULL pgmap.
+	 */
+	altmap = pgmap_altmap(pgmap);
+	pfns_per_compound = pgmap_vmemmap_nr(pgmap);
+
+	/*
+	 * The call to memmap_init should have already taken care
+	 * of the pages reserved for the memmap, so we can just jump to
+	 * the end of that region and start processing the device pages.
+	 */
+	if (altmap) {
+		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+		nr_pages = end_pfn - start_pfn;
+	}
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
+		struct page *page = pfn_to_page(pfn);
+
+		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+
+		/* Order-0 pagemap: no tail pages to set up */
+		if (pfns_per_compound == 1)
+			continue;
+
+		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
+				     compound_nr_pages(altmap, pfns_per_compound));
+	}
+
+	pr_debug("%s initialised %lu pages in %ums\n", __func__,
+		nr_pages, jiffies_to_msecs(jiffies - start));
+}
+#endif
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independent of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are in order of monotonically increasing memory
+ * addresses.
+ */
+static void __init adjust_zone_range_for_zone_movable(int nid,
+					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
+					unsigned long *zone_start_pfn,
+					unsigned long *zone_end_pfn)
+{
+	unsigned long movable_start = zone_movable_pfn[nid];
+
+	/* Only adjust if ZONE_MOVABLE is on this node */
+	if (!movable_start)
+		return;
+
+	/* Size ZONE_MOVABLE */
+	if (zone_type == ZONE_MOVABLE) {
+		*zone_start_pfn = movable_start;
+		*zone_end_pfn = min(node_end_pfn,
+			arch_zone_highest_possible_pfn[movable_zone]);
+		return;
+	}
+
+	/* Adjust for ZONE_MOVABLE starting within this range */
+	if (!mirrored_kernelcore &&
+	    *zone_start_pfn < movable_start &&
+	    *zone_end_pfn > movable_start) {
+		*zone_end_pfn = movable_start;
+		return;
+	}
+
+	/* Check if this whole range is within ZONE_MOVABLE */
+	if (*zone_start_pfn >= movable_start)
+		*zone_start_pfn = *zone_end_pfn;
+}
+
+/*
+ * Return the number of page frames in holes in a range on a node: the size
+ * of the range minus every part of it that is covered by a memory pfn
+ * range. If nid is MAX_NUMNODES, then all holes in the requested range
+ * will be accounted for.
+ */
+unsigned long __init __absent_pages_in_range(int nid,
+				unsigned long range_start_pfn,
+				unsigned long range_end_pfn)
+{
+	unsigned long holes = range_end_pfn - range_start_pfn;
+	unsigned long start_pfn, end_pfn;
+	int i;
+
+	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+		/* Portion of this memory range that lies inside the window */
+		unsigned long lo = clamp(start_pfn, range_start_pfn,
+					 range_end_pfn);
+		unsigned long hi = clamp(end_pfn, range_start_pfn,
+					 range_end_pfn);
+
+		holes -= hi - lo;
+	}
+
+	return holes;
+}
+
+/**
+ * absent_pages_in_range - Return number of page frames in holes within a range
+ * @start_pfn: The start PFN to start searching for holes
+ * @end_pfn: The end PFN to stop searching for holes
+ *
+ * Holes are counted across all nodes (MAX_NUMNODES is passed as the node).
+ *
+ * Return: the number of page frames in memory holes within a range.
+ */
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+/*
+ * Return the number of page frames in holes in a zone on a node.
+ *
+ * The zone range is the node range clamped to the zone's possible pfn
+ * limits and then adjusted for ZONE_MOVABLE. With kernelcore=mirror, pages
+ * that belong to the other of ZONE_NORMAL/ZONE_MOVABLE are counted as
+ * absent too.
+ */
+static unsigned long __init zone_absent_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn)
+{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
+ unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long nr_absent;
+
+ /* When hotadd a new node from cpu_up(), the node should be empty */
+ if (!node_start_pfn && !node_end_pfn)
+ return 0;
+
+ zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
+
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ &zone_start_pfn, &zone_end_pfn);
+ nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+ /*
+ * ZONE_MOVABLE handling.
+ * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+ * and vice versa.
+ */
+ if (mirrored_kernelcore && zone_movable_pfn[nid]) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_mem_region(r) {
+ /* Part of the region that overlaps the zone range */
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ /* Mirrored memory does not count towards ZONE_MOVABLE */
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ /* Unmirrored memory does not count towards ZONE_NORMAL */
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+ }
+ }
+
+ return nr_absent;
+}
+
+/*
+ * Return the number of pages a zone spans in a node, including holes
+ * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
+ *
+ * On a non-zero return, *zone_start_pfn and *zone_end_pfn hold the zone's
+ * boundaries within the node. For an empty node (both node pfns 0) the
+ * function returns 0 without writing to them.
+ */
+static unsigned long __init zone_spanned_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn)
+{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
+ /* When hotadd a new node from cpu_up(), the node should be empty */
+ if (!node_start_pfn && !node_end_pfn)
+ return 0;
+
+ /* Get the start and end of the zone */
+ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ zone_start_pfn, zone_end_pfn);
+
+ /* Check that this node has pages within the zone's required range */
+ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
+ return 0;
+
+ /* Move the zone boundaries inside the node if necessary */
+ *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+ *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
+
+ /* Return the spanned pages */
+ return *zone_end_pfn - *zone_start_pfn;
+}
+
+/*
+ * Fill in the start pfn, spanned_pages and present_pages of every zone of
+ * @pgdat for the node range [node_start_pfn, node_end_pfn), and store the
+ * per-node sums in node_spanned_pages and node_present_pages.
+ */
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+						unsigned long node_start_pfn,
+						unsigned long node_end_pfn)
+{
+	unsigned long spanned_sum = 0, present_sum = 0;
+	enum zone_type zt;
+
+	for (zt = 0; zt < MAX_NR_ZONES; zt++) {
+		struct zone *zone = &pgdat->node_zones[zt];
+		unsigned long zone_start_pfn, zone_end_pfn;
+		unsigned long spanned, present;
+
+		spanned = zone_spanned_pages_in_node(pgdat->node_id, zt,
+						     node_start_pfn,
+						     node_end_pfn,
+						     &zone_start_pfn,
+						     &zone_end_pfn);
+		present = spanned -
+			  zone_absent_pages_in_node(pgdat->node_id, zt,
+						    node_start_pfn,
+						    node_end_pfn);
+
+		/* A zone spanning no pages gets a start pfn of 0 */
+		zone->zone_start_pfn = spanned ? zone_start_pfn : 0;
+		zone->spanned_pages = spanned;
+		zone->present_pages = present;
+#if defined(CONFIG_MEMORY_HOTPLUG)
+		zone->present_early_pages = present;
+#endif
+
+		spanned_sum += spanned;
+		present_sum += present;
+	}
+
+	pgdat->node_spanned_pages = spanned_sum;
+	pgdat->node_present_pages = present_sum;
+	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, present_sum);
+}
+
+/*
+ * Estimate how many pages the memmap (array of struct page) for a zone
+ * occupies, rounded up to whole pages.
+ *
+ * The estimate is based on @spanned_pages, unless SPARSEMEM is in use and
+ * the zone has holes, in which case @present_pages is more accurate. Each
+ * populated region may cost one or two extra memmap pages because its
+ * memmap need not be aligned to a page boundary, so the present count is
+ * only used once spanned exceeds present by more than present/16.
+ */
+static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
+					     unsigned long present_pages)
+{
+	unsigned long threshold = present_pages + (present_pages >> 4);
+	unsigned long nr = spanned_pages;
+
+	if (IS_ENABLED(CONFIG_SPARSEMEM) && spanned_pages > threshold)
+		nr = present_pages;
+
+	return PAGE_ALIGN(nr * sizeof(struct page)) >> PAGE_SHIFT;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* Initialise the lock, list head and length of @pgdat's deferred split queue. */
+static void pgdat_init_split_queue(struct pglist_data *pgdat)
+{
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+ spin_lock_init(&ds_queue->split_queue_lock);
+ INIT_LIST_HEAD(&ds_queue->split_queue);
+ ds_queue->split_queue_len = 0;
+}
+#else
+/* Without CONFIG_TRANSPARENT_HUGEPAGE this is a no-op stub. */
+static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
+#endif
+
+#ifdef CONFIG_COMPACTION
+/* Initialise the wait queue head @pgdat->kcompactd_wait. */
+static void pgdat_init_kcompactd(struct pglist_data *pgdat)
+{
+ init_waitqueue_head(&pgdat->kcompactd_wait);
+}
+#else
+/* Without CONFIG_COMPACTION this is a no-op stub. */
+static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
+#endif
+
+/*
+ * Initialise the locks, wait queues and per-node helper state embedded in
+ * @pgdat. Called from both free_area_init_core() and
+ * free_area_init_core_hotplug().
+ */
+static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
+{
+ int i;
+
+ pgdat_resize_init(pgdat);
+ pgdat_kswapd_lock_init(pgdat);
+
+ pgdat_init_split_queue(pgdat);
+ pgdat_init_kcompactd(pgdat);
+
+ init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
+ /* One wait queue per vmscan throttle reason */
+ for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
+ init_waitqueue_head(&pgdat->reclaim_wait[i]);
+
+ pgdat_page_ext_init(pgdat);
+ lruvec_init(&pgdat->__lruvec);
+}
+
+/*
+ * Initialise the basic fields of @zone: managed page count
+ * (@remaining_pages), owning node (@nid), name (looked up in zone_names[]
+ * by @idx), back-pointer to the node's pgdat, locks and per-cpu page state.
+ */
+static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+ unsigned long remaining_pages)
+{
+ atomic_long_set(&zone->managed_pages, remaining_pages);
+ zone_set_nid(zone, nid);
+ zone->name = zone_names[idx];
+ zone->zone_pgdat = NODE_DATA(nid);
+ spin_lock_init(&zone->lock);
+ zone_seqlock_init(zone);
+ zone_pcp_init(zone);
+}
+
+/*
+ * Empty every free list of @zone: for each (order, migratetype) pair,
+ * initialise the list head and zero the free count of that order.
+ */
+static void __meminit zone_init_free_lists(struct zone *zone)
+{
+	unsigned int order, mt;
+
+	for_each_migratetype_order(order, mt) {
+		struct free_area *area = &zone->free_area[order];
+
+		INIT_LIST_HEAD(&area->free_list[mt]);
+		area->nr_free = 0;
+	}
+}
+
+/*
+ * Bring an empty @zone into use: make sure the node's zone count covers
+ * this zone, record @zone_start_pfn, reset the free lists and mark the zone
+ * initialised. @size is only used for the trace message.
+ */
+void __meminit init_currently_empty_zone(struct zone *zone,
+					unsigned long zone_start_pfn,
+					unsigned long size)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int nr_zones_needed = zone_idx(zone) + 1;
+
+	/* nr_zones must be at least (index of this zone + 1) */
+	if (pgdat->nr_zones < nr_zones_needed)
+		pgdat->nr_zones = nr_zones_needed;
+
+	zone->zone_start_pfn = zone_start_pfn;
+
+	mminit_dprintk(MMINIT_TRACE, "memmap_init",
+			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
+			pgdat->node_id,
+			(unsigned long)zone_idx(zone),
+			zone_start_pfn, (zone_start_pfn + size));
+
+	zone_init_free_lists(zone);
+	zone->initialized = 1;
+}
+
+#ifndef CONFIG_SPARSEMEM
+/*
+ * Calculate the size, in bytes, of the pageblock flags bitmap for a zone.
+ *
+ * The zone size is first extended by the offset of @zone_start_pfn within
+ * its pageblock and rounded up to a multiple of pageblock_nr_pages. Each
+ * pageblock then takes NR_PAGEBLOCK_BITS bits; the bit count is rounded up
+ * to a whole number of unsigned longs and returned in bytes.
+ */
+static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
+{
+	unsigned long nr_pageblocks, nr_bits;
+
+	/* Account for a zone start that is not pageblock aligned */
+	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
+
+	nr_pageblocks = roundup(zonesize, pageblock_nr_pages) >> pageblock_order;
+	nr_bits = nr_pageblocks * NR_PAGEBLOCK_BITS;
+	nr_bits = roundup(nr_bits, 8 * sizeof(unsigned long));
+
+	return nr_bits / 8;
+}
+
+/*
+ * Allocate the pageblock flags bitmap for @zone from memblock, on the
+ * zone's node. The size comes from usemap_size(); a zero size leaves
+ * zone->pageblock_flags NULL. Allocation failure panics.
+ */
+static void __ref setup_usemap(struct zone *zone)
+{
+	unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
+					       zone->spanned_pages);
+	zone->pageblock_flags = NULL;
+	if (usemapsize) {
+		zone->pageblock_flags =
+			memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
+					    zone_to_nid(zone));
+		if (!zone->pageblock_flags)
+			/* usemapsize is unsigned long, so print it with %lu */
+			panic("Failed to allocate %lu bytes for zone %s pageblock flags on node %d\n",
+			      usemapsize, zone->name, zone_to_nid(zone));
+	}
+}
+#else
+static inline void setup_usemap(struct zone *zone) {}
+#endif /* CONFIG_SPARSEMEM */
+
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+
+/*
+ * Initialise pageblock_order, i.e. the number of pages represented by
+ * NR_PAGEBLOCK_BITS. Does nothing if pageblock_order is already non-zero.
+ */
+void __init set_pageblock_order(void)
+{
+	unsigned int order;
+
+	/* Already set up on an earlier call */
+	if (pageblock_order)
+		return;
+
+	/*
+	 * Assume the largest contiguous order of interest is a huge page,
+	 * but don't let pageblocks exceed the maximum allocation
+	 * granularity. The huge page order may be variable depending on
+	 * boot parameters on IA64 and powerpc.
+	 */
+	order = MAX_ORDER;
+	if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
+		order = HUGETLB_PAGE_ORDER;
+
+	pageblock_order = order;
+}
+#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/*
+ * Without CONFIG_HUGETLB_PAGE_SIZE_VARIABLE, pageblock_order is fixed at
+ * compile time, so this function has nothing to do. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based
+ * on the kernel config.
+ */
+void __init set_pageblock_order(void)
+{
+}
+
+#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/*
+ * Set up the zone data structures
+ * - init pgdat internals
+ * - init all zones belonging to this node
+ *
+ * NOTE: this function is only called during memory hotplug
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
+{
+ int nid = pgdat->node_id;
+ enum zone_type z;
+ int cpu;
+
+ pgdat_init_internals(pgdat);
+
+ /*
+ * free_area_init_core() points per_cpu_nodestats at boot_nodestats;
+ * replace that with a per-cpu allocation here.
+ * NOTE(review): the alloc_percpu() result is not checked in this
+ * function - confirm whether failure is possible/handled elsewhere.
+ */
+ if (pgdat->per_cpu_nodestats == &boot_nodestats)
+ pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
+
+ /*
+ * Reset the nr_zones, order and highest_zoneidx before reuse.
+ * Note that kswapd will init kswapd_highest_zoneidx properly
+ * when it starts in the near future.
+ */
+ pgdat->nr_zones = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_highest_zoneidx = 0;
+ pgdat->node_start_pfn = 0;
+ /* Zero the node statistics of every online CPU */
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
+
+ /* Every zone starts out with zero managed pages */
+ for (z = 0; z < MAX_NR_ZONES; z++)
+ zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+}
+#endif
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ *
+ * For each zone this also estimates the number of free pages (present pages
+ * minus memmap and dma_reserve), adds it to the global nr_kernel_pages and
+ * nr_all_pages counters, and uses it as the zone's initial managed page
+ * count.
+ *
+ * NOTE: pgdat should get zeroed by caller.
+ * NOTE: this function is only called during early init.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat)
+{
+ enum zone_type j;
+ int nid = pgdat->node_id;
+
+ pgdat_init_internals(pgdat);
+ pgdat->per_cpu_nodestats = &boot_nodestats;
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = pgdat->node_zones + j;
+ unsigned long size, freesize, memmap_pages;
+
+ size = zone->spanned_pages;
+ freesize = zone->present_pages;
+
+ /*
+ * Adjust freesize so that it accounts for how much memory
+ * is used by this zone for memmap. This affects the watermark
+ * and per-cpu initialisations
+ */
+ memmap_pages = calc_memmap_size(size, freesize);
+ if (!is_highmem_idx(j)) {
+ if (freesize >= memmap_pages) {
+ freesize -= memmap_pages;
+ if (memmap_pages)
+ pr_debug(" %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
+ } else
+ pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
+ zone_names[j], memmap_pages, freesize);
+ }
+
+ /* Account for reserved pages */
+ /* dma_reserve is only charged against zone index 0 */
+ if (j == 0 && freesize > dma_reserve) {
+ freesize -= dma_reserve;
+ pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
+ }
+
+ if (!is_highmem_idx(j))
+ nr_kernel_pages += freesize;
+ /* Charge for highmem memmap if there are enough kernel pages */
+ else if (nr_kernel_pages > memmap_pages * 2)
+ nr_kernel_pages -= memmap_pages;
+ nr_all_pages += freesize;
+
+ /*
+ * Set an approximate value for lowmem here, it will be adjusted
+ * when the bootmem allocator frees pages into the buddy system.
+ * And all highmem pages will be managed by the buddy system.
+ */
+ zone_init_internals(zone, j, nid, freesize);
+
+ /* Zones that span no pages get no usemap or free lists */
+ if (!size)
+ continue;
+
+ set_pageblock_order();
+ setup_usemap(zone);
+ init_currently_empty_zone(zone, zone->zone_start_pfn, size);
+ }
+}
+
+/*
+ * Allocate @size bytes for a memory map from memblock, aligned to @align,
+ * at or above @min_addr. With @exact_nid the allocation goes through
+ * memblock_alloc_exact_nid_raw(), otherwise through
+ * memblock_alloc_try_nid_raw(), both for node @nid.
+ *
+ * A successful, non-empty allocation is passed to page_init_poison().
+ * Returns the allocated pointer, or NULL on failure.
+ */
+void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
+			  phys_addr_t min_addr, int nid, bool exact_nid)
+{
+	void *map;
+
+	if (!exact_nid)
+		map = memblock_alloc_try_nid_raw(size, align, min_addr,
+						 MEMBLOCK_ALLOC_ACCESSIBLE,
+						 nid);
+	else
+		map = memblock_alloc_exact_nid_raw(size, align, min_addr,
+						   MEMBLOCK_ALLOC_ACCESSIBLE,
+						   nid);
+
+	/* Nothing to poison on failure or for an empty request */
+	if (!map || !size)
+		return map;
+
+	page_init_poison(map, size);
+	return map;
+}
+
+#ifdef CONFIG_FLATMEM
+/*
+ * Allocate the node's memory map (pgdat->node_mem_map) unless one is
+ * already set, and, on !CONFIG_NUMA builds, publish node 0's map as the
+ * global mem_map. Empty nodes are skipped. Allocation failure panics.
+ */
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+	unsigned long __maybe_unused start = 0;
+	unsigned long __maybe_unused offset = 0;
+
+	/* Skip empty nodes */
+	if (!pgdat->node_spanned_pages)
+		return;
+
+	/* Round the node start down to a MAX_ORDER_NR_PAGES boundary */
+	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+	offset = pgdat->node_start_pfn - start;
+	/* ia64 gets its own node_mem_map, before this, without bootmem */
+	if (!pgdat->node_mem_map) {
+		unsigned long size, end;
+		struct page *map;
+
+		/*
+		 * The zone's endpoints aren't required to be MAX_ORDER
+		 * aligned but the node_mem_map endpoints must be in order
+		 * for the buddy allocator to function correctly.
+		 */
+		end = pgdat_end_pfn(pgdat);
+		end = ALIGN(end, MAX_ORDER_NR_PAGES);
+		size = (end - start) * sizeof(struct page);
+		map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
+				   pgdat->node_id, false);
+		if (!map)
+			/* size is unsigned long, so print it with %lu */
+			panic("Failed to allocate %lu bytes for node %d memory map\n",
+			      size, pgdat->node_id);
+		/* node_mem_map points at the entry for node_start_pfn */
+		pgdat->node_mem_map = map + offset;
+	}
+	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
+				__func__, pgdat->node_id, (unsigned long)pgdat,
+				(unsigned long)pgdat->node_mem_map);
+#ifndef CONFIG_NUMA
+	/*
+	 * With no DISCONTIG, the global mem_map is just set as node 0's
+	 */
+	if (pgdat == NODE_DATA(0)) {
+		mem_map = NODE_DATA(0)->node_mem_map;
+		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+			mem_map -= offset;
+	}
+#endif
+}
+#else
+/* Without CONFIG_FLATMEM there is no per-node memory map to allocate. */
+static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
+#endif /* CONFIG_FLATMEM */
+
+/**
+ * get_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
+ *
+ * It returns the start and end page frame of a node based on information
+ * provided by memblock_set_node(): the lowest start and the highest end
+ * over all memory ranges of the node. If called for a node with no
+ * memory ranges, the start and end PFNs will both be 0.
+ */
+void __init get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	unsigned long range_start, range_end;
+	unsigned long lowest = -1UL, highest = 0;
+	int idx;
+
+	for_each_mem_pfn_range(idx, nid, &range_start, &range_end, NULL) {
+		lowest = min(lowest, range_start);
+		highest = max(highest, range_end);
+	}
+
+	/* -1UL still in place means no range was seen */
+	*start_pfn = (lowest == -1UL) ? 0 : lowest;
+	*end_pfn = highest;
+}
+
+/*
+ * Initialise the pg_data_t of node @nid: determine the node's pfn range,
+ * compute per-zone page counts, allocate the node memory map (where
+ * applicable) and initialise the node's zones.
+ */
+static void __init free_area_init_node(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ unsigned long start_pfn = 0;
+ unsigned long end_pfn = 0;
+
+ /* pg_data_t should be reset to zero when it's allocated */
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
+
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = start_pfn;
+ pgdat->per_cpu_nodestats = NULL;
+
+ /* An empty pfn range means the node has no memory */
+ if (start_pfn != end_pfn) {
+ pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+ } else {
+ pr_info("Initmem setup node %d as memoryless\n", nid);
+ }
+
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
+
+ alloc_node_mem_map(pgdat);
+ pgdat_set_deferred_range(pgdat);
+
+ free_area_init_core(pgdat);
+ lru_gen_init_pgdat(pgdat);
+}
+
+/*
+ * Any regular or high memory on that node ?
+ *
+ * Looks at the zones below ZONE_MOVABLE in ascending order and stops at the
+ * first populated one. For that zone, N_HIGH_MEMORY is set when
+ * CONFIG_HIGHMEM is enabled, and N_NORMAL_MEMORY is set when the zone is
+ * ZONE_NORMAL or lower.
+ */
+static void check_for_memory(pg_data_t *pgdat, int nid)
+{
+	enum zone_type zid;
+
+	for (zid = 0; zid < ZONE_MOVABLE; zid++) {
+		if (!populated_zone(&pgdat->node_zones[zid]))
+			continue;
+
+		if (IS_ENABLED(CONFIG_HIGHMEM))
+			node_set_state(nid, N_HIGH_MEMORY);
+		if (zid <= ZONE_NORMAL)
+			node_set_state(nid, N_NORMAL_MEMORY);
+		break;
+	}
+}
+
+#if MAX_NUMNODES > 1
+/*
+ * Figure out the number of possible node ids: one more than the index of
+ * the highest bit set in node_possible_map.
+ */
+void __init setup_nr_node_ids(void)
+{
+	unsigned int last_possible = find_last_bit(node_possible_map.bits,
+						   MAX_NUMNODES);
+
+	nr_node_ids = last_possible + 1;
+}
+#endif
+
+/* Initialise node @nid via free_area_init_node(); used for offline nodes. */
+static void __init free_area_init_memoryless_node(int nid)
+{
+ free_area_init_node(nid);
+}
+
+/*
+ * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * such cases we allow max_zone_pfn sorted in the descending order
+ *
+ * This is the weak default, which returns false; free_area_init() then
+ * walks the zones in ascending index order.
+ */
+bool __weak arch_has_descending_max_zone_pfns(void)
+{
+ return false;
+}
+
+/**
+ * free_area_init - Initialise all pg_data_t and zone data
+ * @max_zone_pfn: an array of max PFNs for each zone
+ *
+ * This will call free_area_init_node() for each active node in the system.
+ * Using the page ranges provided by memblock_set_node(), the size of each
+ * zone in each node and their holes is calculated. If the maximum PFN
+ * between two adjacent zones match, it is assumed that the zone is empty.
+ * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
+ * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
+ * starts where the previous one ended. For example, ZONE_DMA32 starts
+ * at arch_max_dma_pfn.
+ */
+void __init free_area_init(unsigned long *max_zone_pfn)
+{
+ unsigned long start_pfn, end_pfn;
+ int i, nid, zone;
+ bool descending;
+
+ /* Record where the zone boundaries are */
+ memset(arch_zone_lowest_possible_pfn, 0,
+ sizeof(arch_zone_lowest_possible_pfn));
+ memset(arch_zone_highest_possible_pfn, 0,
+ sizeof(arch_zone_highest_possible_pfn));
+
+ start_pfn = PHYS_PFN(memblock_start_of_DRAM());
+ descending = arch_has_descending_max_zone_pfns();
+
+ /*
+ * Chain the zones together: each zone starts where the previously
+ * visited one ended. ZONE_MOVABLE is skipped here and handled below.
+ */
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (descending)
+ zone = MAX_NR_ZONES - i - 1;
+ else
+ zone = i;
+
+ if (zone == ZONE_MOVABLE)
+ continue;
+
+ /* Never let a zone end before it starts */
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
+
+ start_pfn = end_pfn;
+ }
+
+ /* Find the PFNs that ZONE_MOVABLE begins at in each node */
+ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+ find_zone_movable_pfns_for_nodes();
+
+ /* Print out the zone ranges */
+ pr_info("Zone ranges:\n");
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
+ pr_info(" %-8s ", zone_names[i]);
+ if (arch_zone_lowest_possible_pfn[i] ==
+ arch_zone_highest_possible_pfn[i])
+ pr_cont("empty\n");
+ else
+ pr_cont("[mem %#018Lx-%#018Lx]\n",
+ (u64)arch_zone_lowest_possible_pfn[i]
+ << PAGE_SHIFT,
+ ((u64)arch_zone_highest_possible_pfn[i]
+ << PAGE_SHIFT) - 1);
+ }
+
+ /* Print out the PFNs ZONE_MOVABLE begins at in each node */
+ pr_info("Movable zone start for each node\n");
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (zone_movable_pfn[i])
+ pr_info(" Node %d: %#018Lx\n", i,
+ (u64)zone_movable_pfn[i] << PAGE_SHIFT);
+ }
+
+ /*
+ * Print out the early node map, and initialize the
+ * subsection-map relative to active online memory ranges to
+ * enable future "sub-section" extensions of the memory map.
+ */
+ pr_info("Early memory node ranges\n");
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ ((u64)end_pfn << PAGE_SHIFT) - 1);
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
+ }
+
+ /* Initialise every node */
+ mminit_verify_pageflags_layout();
+ setup_nr_node_ids();
+ for_each_node(nid) {
+ pg_data_t *pgdat;
+
+ /* Offline nodes get a freshly allocated pgdat, set up as memoryless */
+ if (!node_online(nid)) {
+ pr_info("Initializing node %d as memoryless\n", nid);
+
+ /* Allocator not initialized yet */
+ pgdat = arch_alloc_nodedata(nid);
+ if (!pgdat)
+ panic("Cannot allocate %zuB for node %d.\n",
+ sizeof(*pgdat), nid);
+ arch_refresh_nodedata(nid, pgdat);
+ free_area_init_memoryless_node(nid);
+
+ /*
+ * We do not want to confuse userspace by sysfs
+ * files/directories for node without any memory
+ * attached to it, so this node is not marked as
+ * N_MEMORY and not marked online so that no sysfs
+ * hierarchy will be created via register_one_node for
+ * it. The pgdat will get fully initialized by
+ * hotadd_init_pgdat() when memory is hotplugged into
+ * this node.
+ */
+ continue;
+ }
+
+ pgdat = NODE_DATA(nid);
+ free_area_init_node(nid);
+
+ /* Any memory on that node */
+ if (pgdat->node_present_pages)
+ node_set_state(nid, N_MEMORY);
+ check_for_memory(pgdat, nid);
+ }
+
+ memmap_init();
+
+ /* disable hash distribution for systems with a single node */
+ fixup_hashdist();
+}
+
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
+ * nodes are shifted by 256MiB, the return value would indicate 256MiB
+ * alignment. Note that if only the last node is shifted, 1GiB is enough
+ * and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+	unsigned long accl_mask = 0, last_end = 0;
+	unsigned long start, end, mask;
+	int last_nid = NUMA_NO_NODE;
+	int i, nid;
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
+		/*
+		 * Only a transition between two different nodes constrains
+		 * the alignment; a range starting at pfn 0, the first range
+		 * and ranges continuing the same node just update the
+		 * bookkeeping.
+		 */
+		if (!start || last_nid < 0 || last_nid == nid) {
+			last_nid = nid;
+			last_end = end;
+			continue;
+		}
+
+		/*
+		 * Start with a mask granular enough to pin-point to the
+		 * start pfn and tick off bits one-by-one until it becomes
+		 * too coarse to separate the current node from the last.
+		 *
+		 * The shift is done on an unsigned long: __ffs(start) can be
+		 * 31 or more for an unsigned long pfn, which would overflow
+		 * a plain int constant.
+		 */
+		mask = ~((1UL << __ffs(start)) - 1);
+		while (mask && last_end <= (start & (mask << 1)))
+			mask <<= 1;
+
+		/* accumulate all internode masks */
+		accl_mask |= mask;
+	}
+
+	/* convert mask to number of pages */
+	return ~accl_mask + 1;
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/*
+ * Release @nr_pages pages starting at @pfn to the page allocator, marking
+ * the pageblocks involved MIGRATE_MOVABLE. A full, naturally aligned
+ * MAX_ORDER chunk is freed in one go; anything else is freed page by page.
+ */
+static void __init deferred_free_range(unsigned long pfn,
+				       unsigned long nr_pages)
+{
+	struct page *first;
+	unsigned long off;
+
+	if (!nr_pages)
+		return;
+
+	first = pfn_to_page(pfn);
+
+	if (nr_pages != MAX_ORDER_NR_PAGES || !IS_MAX_ORDER_ALIGNED(pfn)) {
+		/* Partial or unaligned range: free one page at a time */
+		for (off = 0; off < nr_pages; off++) {
+			if (pageblock_aligned(pfn + off))
+				set_pageblock_migratetype(first + off,
+							  MIGRATE_MOVABLE);
+			__free_pages_core(first + off, 0);
+		}
+		return;
+	}
+
+	/* Free a large naturally-aligned chunk */
+	for (off = 0; off < nr_pages; off += pageblock_nr_pages)
+		set_pageblock_migratetype(first + off, MIGRATE_MOVABLE);
+	__free_pages_core(first, MAX_ORDER);
+}
+
+/* Completion tracking for deferred_init_memmap() threads */
+/* Number of threads still running; set up in page_alloc_init_late() */
+static atomic_t pgdat_init_n_undone __initdata;
+/* Completed once pgdat_init_n_undone drops to zero */
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+/*
+ * Report that one deferred_init_memmap() thread has finished; the thread
+ * that brings the counter to zero signals the completion.
+ */
+static inline void __init pgdat_init_report_one_done(void)
+{
+ if (atomic_dec_and_test(&pgdat_init_n_undone))
+ complete(&pgdat_init_all_done_comp);
+}
+
+/*
+ * Returns true if page needs to be initialized or freed to buddy allocator.
+ *
+ * Validity is only checked for the head pfn of a MAX_ORDER block: a pfn
+ * that is not MAX_ORDER aligned is always reported as valid, an aligned one
+ * is valid only if pfn_valid() says so.
+ */
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
+{
+	return !IS_MAX_ORDER_ALIGNED(pfn) || pfn_valid(pfn);
+}
+
+/*
+ * Free the pages in [@pfn, @end_pfn) to the buddy allocator. Pages are
+ * batched into runs that never cross a MAX_ORDER boundary or an invalid
+ * pfn, so that aligned runs can be freed in MAX_ORDER_NR_PAGES sizes.
+ */
+static void __init deferred_free_pages(unsigned long pfn,
+				       unsigned long end_pfn)
+{
+	unsigned long pending = 0;
+
+	while (pfn < end_pfn) {
+		bool valid = deferred_pfn_valid(pfn);
+
+		if (valid && !IS_MAX_ORDER_ALIGNED(pfn)) {
+			/* Extend the current run */
+			pending++;
+		} else {
+			/*
+			 * Flush the run collected so far. A valid, aligned
+			 * pfn starts the next run; an invalid one does not.
+			 */
+			deferred_free_range(pfn - pending, pending);
+			pending = valid ? 1 : 0;
+		}
+		pfn++;
+	}
+
+	/* Free the last block of pages to allocator */
+	deferred_free_range(pfn - pending, pending);
+}
+
+/*
+ * Initialize the struct pages for [@pfn, @end_pfn) in @zone. The pfn ->
+ * page lookup is done only at the start of a run and on every MAX_ORDER
+ * boundary; in between, the page pointer is simply advanced.
+ * Return number of pages initialized.
+ */
+static unsigned long __init deferred_init_pages(struct zone *zone,
+						unsigned long pfn,
+						unsigned long end_pfn)
+{
+	int nid = zone_to_nid(zone);
+	int zid = zone_idx(zone);
+	unsigned long initialised = 0;
+	struct page *cur = NULL;
+
+	for (; pfn < end_pfn; pfn++) {
+		if (!deferred_pfn_valid(pfn)) {
+			/* Force a fresh lookup after the hole */
+			cur = NULL;
+			continue;
+		}
+
+		if (cur && !IS_MAX_ORDER_ALIGNED(pfn))
+			cur++;
+		else
+			cur = pfn_to_page(pfn);
+
+		__init_single_page(cur, pfn, zid, nid);
+		initialised++;
+	}
+
+	return initialised;
+}
+
+/*
+ * Pre-load the iterator for the zone init: walk the free memory ranges of
+ * @zone until one extends beyond @first_init_pfn. On success, @spfn/@epfn
+ * describe that range (with the start clamped up to @first_init_pfn), @i
+ * holds the iterator position, and true is returned. If no such range
+ * exists, false is returned and @i is left untouched.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+				    unsigned long *spfn, unsigned long *epfn,
+				    unsigned long first_init_pfn)
+{
+	u64 cursor;
+
+	/*
+	 * Ranges that end at or before first_init_pfn have already been
+	 * initialized; step over them.
+	 */
+	for_each_free_mem_pfn_range_in_zone(cursor, zone, spfn, epfn) {
+		if (*epfn > first_init_pfn) {
+			if (*spfn < first_init_pfn)
+				*spfn = first_init_pfn;
+			*i = cursor;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ *
+ * @i, @start_pfn and @end_pfn are in/out: on return they describe where the
+ * next call should continue. Returns the number of pages initialized.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ /* First MAX_ORDER boundary strictly above *start_pfn */
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+ /* Private copies of the starting range for the second (freeing) pass */
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ unsigned long nr_pages = 0;
+ u64 j = *i;
+
+ /* First we loop through and initialize the page values */
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+ unsigned long t;
+
+ if (mo_pfn <= *start_pfn)
+ break;
+
+ t = min(mo_pfn, *end_pfn);
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+ /* Range continues past the boundary: resume from there next time */
+ if (mo_pfn < *end_pfn) {
+ *start_pfn = mo_pfn;
+ break;
+ }
+ }
+
+ /* Reset values and now loop through freeing pages as needed */
+ /* After the swap, *i holds the advanced position and j the original one */
+ swap(j, *i);
+
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+ unsigned long t;
+
+ if (mo_pfn <= spfn)
+ break;
+
+ t = min(mo_pfn, epfn);
+ deferred_free_pages(spfn, t);
+
+ if (mo_pfn <= epfn)
+ break;
+ }
+
+ return nr_pages;
+}
+
+/*
+ * Initialize and free the deferred pages of one chunk, from @start_pfn up
+ * to @end_pfn. @arg is the struct zone being initialized. Used as the
+ * thread function of the padata job set up in deferred_init_memmap().
+ */
+static void __init
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+			   void *arg)
+{
+	struct zone *zone = arg;
+	unsigned long range_start, range_end;
+	u64 iter;
+
+	/* Position the iterator on the first range reaching past start_pfn */
+	deferred_init_mem_pfn_range_in_zone(&iter, zone, &range_start,
+					    &range_end, start_pfn);
+
+	/*
+	 * Initialize and free pages in MAX_ORDER sized increments so that we
+	 * can avoid introducing any issues with the buddy allocator.
+	 */
+	while (range_start < end_pfn) {
+		deferred_init_maxorder(&iter, zone, &range_start, &range_end);
+		cond_resched();
+	}
+}
+
+/*
+ * An arch may override for more concurrency.
+ *
+ * Weak default: a single thread, regardless of @node_cpumask.
+ */
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+ return 1;
+}
+
+/*
+ * Initialise remaining memory on a node
+ *
+ * Thread function started by page_alloc_init_late(); @data is the node's
+ * pg_data_t. Always returns 0 and reports completion through
+ * pgdat_init_report_one_done().
+ */
+static int __init deferred_init_memmap(void *data)
+{
+ pg_data_t *pgdat = data;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ unsigned long spfn = 0, epfn = 0;
+ unsigned long first_init_pfn, flags;
+ unsigned long start = jiffies;
+ struct zone *zone;
+ int zid, max_threads;
+ u64 i;
+
+ /* Bind memory initialisation thread to a local node if possible */
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(current, cpumask);
+
+ pgdat_resize_lock(pgdat, &flags);
+ first_init_pfn = pgdat->first_deferred_pfn;
+ /* ULONG_MAX means nothing is left to initialise on this node */
+ if (first_init_pfn == ULONG_MAX) {
+ pgdat_resize_unlock(pgdat, &flags);
+ pgdat_init_report_one_done();
+ return 0;
+ }
+
+ /* Sanity check boundaries */
+ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
+ BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+ pgdat->first_deferred_pfn = ULONG_MAX;
+
+ /*
+ * Once we unlock here, the zone cannot be grown anymore, thus if an
+ * interrupt thread must allocate this early in boot, zone must be
+ * pre-grown prior to start of deferred page initialization.
+ */
+ pgdat_resize_unlock(pgdat, &flags);
+
+ /* Only the highest zone is deferred so find it */
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ zone = pgdat->node_zones + zid;
+ if (first_init_pfn < zone_end_pfn(zone))
+ break;
+ }
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_init_pfn))
+ goto zone_empty;
+
+ max_threads = deferred_page_init_max_threads(cpumask);
+
+ /* Hand each free range, rounded up to a section, to a padata job */
+ while (spfn < epfn) {
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
+ struct padata_mt_job job = {
+ .thread_fn = deferred_init_memmap_chunk,
+ .fn_arg = zone,
+ .start = spfn,
+ .size = epfn_align - spfn,
+ .align = PAGES_PER_SECTION,
+ .min_chunk = PAGES_PER_SECTION,
+ .max_threads = max_threads,
+ };
+
+ padata_do_multithreaded(&job);
+ /* Move on to the next range past what was just handled */
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ epfn_align);
+ }
+zone_empty:
+ /* Sanity check that the next zone really is unpopulated */
+ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+
+ pr_info("node %d deferred pages initialised in %ums\n",
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
+
+ pgdat_init_report_one_done();
+ return 0;
+}
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * NOTE(review): this comment previously said the function is noinline
+ * because it is called from a __ref function, _deferred_grow_zone. The
+ * definition below is __init and carries no noinline attribute, so that
+ * rationale looks stale - confirm against the caller.
+ */
+bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+ unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+ pg_data_t *pgdat = zone->zone_pgdat;
+ /* Snapshot taken before the lock; re-checked once the lock is held */
+ unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+ unsigned long spfn, epfn, flags;
+ unsigned long nr_pages = 0;
+ u64 i;
+
+ /* Only the last zone may have deferred pages */
+ if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+ return false;
+
+ pgdat_resize_lock(pgdat, &flags);
+
+ /*
+ * If someone grew this zone while we were waiting for spinlock, return
+ * true, as there might be enough pages already.
+ */
+ if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+ pgdat_resize_unlock(pgdat, &flags);
+ return true;
+ }
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_deferred_pfn)) {
+ pgdat->first_deferred_pfn = ULONG_MAX;
+ pgdat_resize_unlock(pgdat, &flags);
+ /* Retry only once. */
+ return first_deferred_pfn != ULONG_MAX;
+ }
+
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
+ */
+ while (spfn < epfn) {
+ /* update our first deferred PFN for this section */
+ first_deferred_pfn = spfn;
+
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ touch_nmi_watchdog();
+
+ /* We should only stop along section boundaries */
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+ continue;
+
+ /* If our quota has been met we can stop here */
+ if (nr_pages >= nr_pages_needed)
+ break;
+ }
+
+ /* Record where the next grow or the deferred init thread continues */
+ pgdat->first_deferred_pfn = spfn;
+ pgdat_resize_unlock(pgdat, &flags);
+
+ return nr_pages > 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+#ifdef CONFIG_CMA
+/*
+ * Hand one pageblock starting at @page to the page allocator as
+ * MIGRATE_CMA memory, and account its pages as managed and as CMA pages of
+ * the zone.
+ */
+void __init init_cma_reserved_pageblock(struct page *page)
+{
+ unsigned i = pageblock_nr_pages;
+ struct page *p = page;
+
+ /* Clear the reserved flag and zero the refcount of every page in the block */
+ do {
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ } while (++p, --i);
+
+ set_pageblock_migratetype(page, MIGRATE_CMA);
+ /* Take a reference on the first page, then free the whole block */
+ set_page_refcounted(page);
+ __free_pages(page, pageblock_order);
+
+ adjust_managed_page_count(page, pageblock_nr_pages);
+ page_zone(page)->cma_pages += pageblock_nr_pages;
+}
+#endif
+
+/*
+ * Late page allocator initialisation: finish deferred struct page
+ * initialisation (when configured), then run the remaining one-off setup
+ * steps below.
+ */
+void __init page_alloc_init_late(void)
+{
+ struct zone *zone;
+ int nid;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+ /* There will be num_node_state(N_MEMORY) threads */
+ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
+ /*
+ * NOTE(review): the kthread_run() result is not checked; if a thread
+ * could fail to start, the wait below would presumably never
+ * complete - confirm.
+ */
+ for_each_node_state(nid, N_MEMORY) {
+ kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+ }
+
+ /* Block until all are initialised */
+ wait_for_completion(&pgdat_init_all_done_comp);
+
+ /*
+ * We initialized the rest of the deferred pages. Permanently disable
+ * on-demand struct page initialization.
+ */
+ static_branch_disable(&deferred_pages);
+
+ /* Reinit limits that are based on free pages after the kernel is up */
+ files_maxfiles_init();
+#endif
+
+ buffer_init();
+
+ /* Discard memblock private memory */
+ memblock_discard();
+
+ for_each_node_state(nid, N_MEMORY)
+ shuffle_free_memory(NODE_DATA(nid));
+
+ for_each_populated_zone(zone)
+ set_zone_contiguous(zone);
+
+ /* Initialize page ext after all struct pages are initialized. */
+ if (deferred_struct_pages)
+ page_ext_init();
+}
+
+#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+/*
+ * Returns the number of pages that arch has reserved but
+ * is not known to alloc_large_system_hash().
+ *
+ * Default used when __HAVE_ARCH_RESERVED_KERNEL_PAGES is not defined:
+ * no such pages, so 0.
+ */
+static unsigned long __init arch_reserved_kernel_pages(void)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have large physical memory, where this scaling
+ * makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE (64ul << 30)
+#define ADAPT_SCALE_SHIFT 2
+#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ * quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ *
+ * @tablename: name used in the log message and in the panic message
+ * @bucketsize: size of one bucket in bytes
+ * @numentries: requested number of entries; 0 means "derive it from
+ * nr_kernel_pages and @scale"
+ * @scale: when auto-sizing, aim for one bucket per 2^scale bytes; on
+ * builds with __BITS_PER_LONG > 32 it is increased adaptively
+ * when @high_limit is 0
+ * @flags: HASH_EARLY selects the memblock allocator, HASH_ZERO asks for
+ * zeroed memory, HASH_SMALL replaces the "at least one page"
+ * minimum with a minimum of 2^(*@_hash_shift) entries
+ * @_hash_shift: if non-NULL, receives log2 of the final entry count.
+ * NOTE(review): the HASH_SMALL path dereferences it without a
+ * NULL check, so callers passing HASH_SMALL must supply it.
+ * @_hash_mask: if non-NULL, receives (final entry count - 1)
+ * @low_limit: lower bound on the number of entries
+ * @high_limit: upper bound on the number of entries; 0 selects the default
+ * of 1/16 of total memory
+ *
+ * The entry count is additionally capped at 0x80000000 in all cases. If an
+ * allocation attempt fails it is retried with half as many entries, giving
+ * up once a failed attempt was no larger than one page (or log2 of the count
+ * reaches zero). Returns the table; panics rather than returning NULL.
+ */
+void *__init alloc_large_system_hash(const char *tablename,
+ unsigned long bucketsize,
+ unsigned long numentries,
+ int scale,
+ int flags,
+ unsigned int *_hash_shift,
+ unsigned int *_hash_mask,
+ unsigned long low_limit,
+ unsigned long high_limit)
+{
+ unsigned long long max = high_limit;
+ unsigned long log2qty, size;
+ void *table;
+ gfp_t gfp_flags;
+ bool virt;
+ bool huge; /* assigned and read only on the vmalloc path */
+
+ /* allow the kernel cmdline to have a say: auto-size only when no entry count was passed in */
+ if (!numentries) {
+ /* round applicable memory size up to nearest megabyte */
+ numentries = nr_kernel_pages;
+ numentries -= arch_reserved_kernel_pages();
+
+ /* It isn't necessary when PAGE_SIZE >= 1MB */
+ if (PAGE_SIZE < SZ_1M)
+ numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
+
+#if __BITS_PER_LONG > 32
+ if (!high_limit) {
+ unsigned long adapt;
+
+ for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+ adapt <<= ADAPT_SCALE_SHIFT)
+ scale++;
+ }
+#endif
+
+ /* limit to 1 bucket per 2^scale bytes of low memory */
+ if (scale > PAGE_SHIFT)
+ numentries >>= (scale - PAGE_SHIFT);
+ else
+ numentries <<= (PAGE_SHIFT - scale);
+
+ /* Make sure we've got at least a 0-order allocation.. */
+ if (unlikely(flags & HASH_SMALL)) {
+ /* Makes no sense without HASH_EARLY */
+ WARN_ON(!(flags & HASH_EARLY));
+ if (!(numentries >> *_hash_shift)) {
+ numentries = 1UL << *_hash_shift;
+ BUG_ON(!numentries);
+ }
+ } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ numentries = PAGE_SIZE / bucketsize;
+ }
+ numentries = roundup_pow_of_two(numentries);
+
+ /* limit allocation size to 1/16 total memory by default */
+ if (max == 0) {
+ max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
+ do_div(max, bucketsize);
+ }
+ max = min(max, 0x80000000ULL); /* hard cap of 2^31 entries */
+
+ if (numentries < low_limit)
+ numentries = low_limit;
+ if (numentries > max)
+ numentries = max;
+
+ log2qty = ilog2(numentries);
+
+ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
+ do {
+ virt = false;
+ size = bucketsize << log2qty;
+ if (flags & HASH_EARLY) {
+ if (flags & HASH_ZERO)
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
+ else
+ table = memblock_alloc_raw(size,
+ SMP_CACHE_BYTES);
+ } else if (get_order(size) > MAX_ORDER || hashdist) {
+ table = vmalloc_huge(size, gfp_flags);
+ virt = true;
+ if (table)
+ huge = is_vm_area_hugepages(table);
+ } else {
+ /*
+ * If bucketsize is not a power-of-two, we may free
+ * some pages at the end of hash table which
+ * alloc_pages_exact() automatically does
+ */
+ table = alloc_pages_exact(size, gfp_flags);
+ kmemleak_alloc(table, size, 1, gfp_flags);
+ }
+ } while (!table && size > PAGE_SIZE && --log2qty); /* failed: halve the entry count and retry */
+
+ if (!table)
+ panic("Failed to allocate %s hash table\n", tablename);
+
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+ virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
+
+ if (_hash_shift)
+ *_hash_shift = log2qty;
+ if (_hash_mask)
+ *_hash_mask = (1 << log2qty) - 1;
+
+ return table;
+}
+
+/**
+ * set_dma_reserve - set the specified number of pages reserved in the first zone
+ * @new_dma_reserve: The number of pages to mark reserved
+ *
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages.
+ * In the DMA zone, a significant percentage may be consumed by kernel image
+ * and other unfreeable allocations which can skew the watermarks badly. This
+ * function may optionally be used to account for unfreeable pages in the
+ * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
+ * smaller per-cpu batchsize.
+ *
+ * This function only records the value in dma_reserve, replacing any earlier
+ * value; the accounting itself is done by whatever code reads dma_reserve.
+ */
+void __init set_dma_reserve(unsigned long new_dma_reserve)
+{
+ dma_reserve = new_dma_reserve;
+}
+
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
+ unsigned int order) /* forwards @page/@order to __free_pages_core() unless a check below declines */
+{
+ if (!early_page_initialised(pfn)) /* nothing is freed here for a pfn that fails this check */
+ return;
+ if (!kmsan_memblock_free_pages(page, order)) {
+ /* KMSAN will take care of these pages. */
+ return;
+ }
+ __free_pages_core(page, order);
+}
+
+static bool _init_on_alloc_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); /* build-time default; the "init_on_alloc" parameter may override it */
+static int __init early_init_on_alloc(char *buf)
+{
+
+ return kstrtobool(buf, &_init_on_alloc_enabled_early); /* parse @buf as a boolean; kstrtobool()'s result is returned as-is */
+}
+early_param("init_on_alloc", early_init_on_alloc); /* register the handler for the "init_on_alloc" early parameter */
+
+static bool _init_on_free_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); /* build-time default; the "init_on_free" parameter may override it */
+static int __init early_init_on_free(char *buf)
+{
+ return kstrtobool(buf, &_init_on_free_enabled_early); /* parse @buf as a boolean; kstrtobool()'s result is returned as-is */
+}
+early_param("init_on_free", early_init_on_free); /* register the handler for the "init_on_free" early parameter */
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); /* enabled at the end of mem_debugging_and_hardening_init() when needed */
+
+/*
+ * Enable static keys related to various memory debugging and hardening options.
+ * Some override others, and depend on early params that are evaluated in the
+ * order of appearance. So we need to first gather the full picture of what was
+ * enabled, and then make decisions.
+ *
+ * Steps, in order:
+ * 1. With CONFIG_PAGE_POISONING, enable page poisoning if it was requested,
+ * or if debug_pagealloc is enabled on a build without
+ * CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC.
+ * 2. If poisoning was enabled in step 1, it takes precedence over
+ * init_on_alloc/init_on_free: both early flags are cleared and a message
+ * is logged.
+ * 3. Set the init_on_alloc and init_on_free static keys from the early flags.
+ * 4. With CONFIG_KMSAN, log a hint if either init option is still enabled.
+ * 5. With CONFIG_DEBUG_PAGEALLOC and debug_pagealloc enabled, enable its key
+ * and, if debug_guardpage_minorder() is non-zero, the guardpage key.
+ * 6. If any step above set want_check_pages and CONFIG_DEBUG_VM is off,
+ * enable check_pages_enabled.
+ */
+static void __init mem_debugging_and_hardening_init(void)
+{
+ bool page_poisoning_requested = false;
+ bool want_check_pages = false;
+
+#ifdef CONFIG_PAGE_POISONING
+ /*
+ * Page poisoning is debug page alloc for some arches. If
+ * either of those options are enabled, enable poisoning.
+ */
+ if (page_poisoning_enabled() ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled())) {
+ static_branch_enable(&_page_poisoning_enabled);
+ page_poisoning_requested = true;
+ want_check_pages = true;
+ }
+#endif
+
+ if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
+ page_poisoning_requested) {
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
+ "will take precedence over init_on_alloc and init_on_free\n");
+ _init_on_alloc_enabled_early = false;
+ _init_on_free_enabled_early = false;
+ }
+
+ if (_init_on_alloc_enabled_early) {
+ want_check_pages = true;
+ static_branch_enable(&init_on_alloc);
+ } else {
+ static_branch_disable(&init_on_alloc);
+ }
+
+ if (_init_on_free_enabled_early) {
+ want_check_pages = true;
+ static_branch_enable(&init_on_free);
+ } else {
+ static_branch_disable(&init_on_free);
+ }
+
+ if (IS_ENABLED(CONFIG_KMSAN) &&
+ (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
+ pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (debug_pagealloc_enabled()) {
+ want_check_pages = true;
+ static_branch_enable(&_debug_pagealloc_enabled);
+
+ if (debug_guardpage_minorder())
+ static_branch_enable(&_debug_guardpage_enabled);
+ }
+#endif
+
+ /*
+ * Any page debugging or hardening option also enables sanity checking
+ * of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
+ * enabled already.
+ */
+ if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
+ static_branch_enable(&check_pages_enabled);
+}
+
+/*
+ * Report memory auto-initialization states for this boot.
+ *
+ * Logs one line with the stack initialization mode and whether heap
+ * init-on-alloc and init-on-free are in effect. The stack mode is picked from
+ * the CONFIG_INIT_STACK_ALL_PATTERN, CONFIG_INIT_STACK_ALL_ZERO and
+ * CONFIG_GCC_PLUGIN_STRUCTLEAK options in the order tested below (first
+ * match wins), falling back to "off". When init-on-free is on, a second line
+ * warns that clearing system memory may take some time.
+ */
+static void __init report_meminit(void)
+{
+ const char *stack;
+
+ if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
+ stack = "all(pattern)";
+ else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
+ stack = "all(zero)";
+ else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
+ stack = "byref_all(zero)";
+ else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
+ stack = "byref(zero)";
+ else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
+ stack = "__user(zero)";
+ else
+ stack = "off";
+
+ pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
+ stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
+ want_init_on_free() ? "on" : "off");
+ if (want_init_on_free())
+ pr_info("mem auto-init: clearing system memory may take some time...\n");
+}
+
+static void __init mem_init_print_info(void) /* logs the one-line "Memory: ..." summary built from the linker section symbols */
+{
+ unsigned long physpages, codesize, datasize, rosize, bss_size;
+ unsigned long init_code_size, init_data_size;
+
+ physpages = get_num_physpages();
+ codesize = _etext - _stext;
+ datasize = _edata - _sdata;
+ rosize = __end_rodata - __start_rodata;
+ bss_size = __bss_stop - __bss_start;
+ init_data_size = __init_end - __init_begin;
+ init_code_size = _einittext - _sinittext;
+
+ /*
+ * Detect special cases and adjust section sizes accordingly:
+ * 1) .init.* may be embedded into .data sections
+ * 2) .init.text.* may be out of [__init_begin, __init_end],
+ * please refer to arch/tile/kernel/vmlinux.lds.S.
+ * 3) .rodata.* may be embedded into .text or .data sections.
+ *
+ * adj_init_size(start, end, size, pos, adj) subtracts adj from size
+ * when pos lies within [start, end) and size is larger than adj, so
+ * that an embedded section is not counted twice in the summary.
+ */
+#define adj_init_size(start, end, size, pos, adj) \
+ do { \
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
+ size -= adj; \
+ } while (0)
+
+ adj_init_size(__init_begin, __init_end, init_data_size,
+ _sinittext, init_code_size);
+ adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
+ adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
+ adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
+ adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
+
+#undef adj_init_size
+
+ pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
+#ifdef CONFIG_HIGHMEM
+ ", %luK highmem"
+#endif
+ ")\n",
+ K(nr_free_pages()), K(physpages),
+ codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
+ (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
+ K(physpages - totalram_pages() - totalcma_pages),
+ K(totalcma_pages)
+#ifdef CONFIG_HIGHMEM
+ , K(totalhigh_pages())
+#endif
+ );
+}
+
+/*
+ * Set up kernel memory allocators
+ *
+ * The calls below form a fixed boot-time sequence. Several of them carry
+ * ordering requirements that are spelled out in the comments next to them
+ * (page_ext vs. buddy/slab/vmap readiness, espfix before PTI), so do not
+ * reorder them without checking those constraints.
+ */
+void __init mm_core_init(void)
+{
+ /* Initializations relying on SMP setup */
+ build_all_zonelists(NULL);
+ page_alloc_init_cpuhp();
+
+ /*
+ * page_ext requires contiguous pages,
+ * bigger than MAX_ORDER unless SPARSEMEM.
+ */
+ page_ext_init_flatmem();
+ mem_debugging_and_hardening_init();
+ kfence_alloc_pool();
+ report_meminit();
+ kmsan_init_shadow();
+ stack_depot_early_init();
+ mem_init();
+ mem_init_print_info();
+ kmem_cache_init();
+ /*
+ * page_owner must be initialized after buddy is ready, and also after
+ * slab is ready so that stack_depot_init() works properly
+ */
+ page_ext_init_flatmem_late();
+ kmemleak_init();
+ ptlock_cache_init();
+ pgtable_cache_init();
+ debug_objects_mem_init();
+ vmalloc_init();
+ /* If no deferred init page_ext now, as vmap is fully initialized */
+ if (!deferred_struct_pages)
+ page_ext_init();
+ /* Should be run before the first non-init thread is created */
+ init_espfix_bsp();
+ /* Should be run after espfix64 is set up. */
+ pti_init();
+ kmsan_init_runtime();
+ mm_cache_init();
+}
diff --git a/mm/mmap.c b/mm/mmap.c
index ff68a67a2a7c..51cd747884e3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -133,7 +133,7 @@ void unlink_file_vma(struct vm_area_struct *vma)
/*
* Close a vm structure and free it.
*/
-static void remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma, bool unreachable)
{
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
@@ -141,7 +141,10 @@ static void remove_vma(struct vm_area_struct *vma)
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
- vm_area_free(vma);
+ if (unreachable)
+ __vm_area_free(vma);
+ else
+ vm_area_free(vma);
}
static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
@@ -502,6 +505,15 @@ static inline void init_vma_prep(struct vma_prepare *vp,
*/
static inline void vma_prepare(struct vma_prepare *vp)
{
+ vma_start_write(vp->vma);
+ if (vp->adj_next)
+ vma_start_write(vp->adj_next);
+ /* vp->insert is always a newly created VMA, no need for locking */
+ if (vp->remove)
+ vma_start_write(vp->remove);
+ if (vp->remove2)
+ vma_start_write(vp->remove2);
+
if (vp->file) {
uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
@@ -590,6 +602,7 @@ static inline void vma_complete(struct vma_prepare *vp,
if (vp->remove) {
again:
+ vma_mark_detached(vp->remove, true);
if (vp->file) {
uprobe_munmap(vp->remove, vp->remove->vm_start,
vp->remove->vm_end);
@@ -605,7 +618,7 @@ again:
/*
* In mprotect's case 6 (see comments on vma_merge),
- * we must remove the one after next as well.
+ * we are removing both mid and next vmas
*/
if (vp->remove2) {
vp->remove = vp->remove2;
@@ -683,12 +696,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (vma_iter_prealloc(vmi))
goto nomem;
+ vma_prepare(&vp);
vma_adjust_trans_huge(vma, start, end, 0);
/* VMA iterator points to previous, so set to start if necessary */
if (vma_iter_addr(vmi) != start)
vma_iter_set(vmi, start);
- vma_prepare(&vp);
vma->vm_start = start;
vma->vm_end = end;
vma->vm_pgoff = pgoff;
@@ -723,8 +736,8 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
return -ENOMEM;
init_vma_prep(&vp, vma);
- vma_adjust_trans_huge(vma, start, end, 0);
vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, start, end, 0);
if (vma->vm_start < start)
vma_iter_clear(vmi, vma->vm_start, start);
@@ -742,12 +755,13 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
/*
* If the vma has a ->close operation then the driver probably needs to release
- * per-vma resources, so we don't attempt to merge those.
+ * per-vma resources, so we don't attempt to merge those if the caller indicates
+ * the current vma may be removed as part of the merge.
*/
-static inline int is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
+static inline bool is_mergeable_vma(struct vm_area_struct *vma,
+ struct file *file, unsigned long vm_flags,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ struct anon_vma_name *anon_name, bool may_remove_vma)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -758,21 +772,20 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
* extended instead.
*/
if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
- return 0;
+ return false;
if (vma->vm_file != file)
- return 0;
- if (vma->vm_ops && vma->vm_ops->close)
- return 0;
+ return false;
+ if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
+ return false;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
- return 0;
+ return false;
if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
- return 0;
- return 1;
+ return false;
+ return true;
}
-static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2,
- struct vm_area_struct *vma)
+static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
+ struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
/*
* The list_is_singular() test is to avoid merging VMA cloned from
@@ -780,7 +793,7 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
*/
if ((!anon_vma1 || !anon_vma2) && (!vma ||
list_is_singular(&vma->anon_vma_chain)))
- return 1;
+ return true;
return anon_vma1 == anon_vma2;
}
@@ -794,20 +807,21 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
* We don't check here for the merged mmap wrapping around the end of pagecache
* indices (16TB on ia32) because do_mmap() does not permit mmap's which
* wrap, nor mmaps which cover the final page at index -1UL.
+ *
+ * We assume the vma may be removed as part of the merge.
*/
-static int
+static bool
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ struct anon_vma_name *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -816,22 +830,23 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
*
* We cannot merge two vmas if they have differently assigned (non-NULL)
* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ *
+ * We assume that vma is not removed as part of the merge.
*/
-static int
+static bool
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ struct anon_vma_name *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
if (vma->vm_pgoff + vm_pglen == vm_pgoff)
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -846,42 +861,45 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* this area are about to be changed to vm_flags - and the no-change
* case has already been eliminated.
*
- * The following mprotect cases have to be considered, where AAAA is
+ * The following mprotect cases have to be considered, where **** is
* the area passed down from mprotect_fixup, never extending beyond one
- * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
+ * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
+ * at the same address as **** and is of the same or larger span, and
+ * NNNN the next vma after ****:
*
- * AAAA AAAA AAAA
- * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN
+ * **** **** ****
+ * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
* cannot merge might become might become
- * PPNNNNNNNNNN PPPPPPPPPPNN
+ * PPNNNNNNNNNN PPPPPPPPPPCC
* mmap, brk or case 4 below case 5 below
* mremap move:
- * AAAA AAAA
- * PPPP NNNN PPPPNNNNXXXX
+ * **** ****
+ * PPPP NNNN PPPPCCCCNNNN
* might become might become
* PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
- * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or
- * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8
+ * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
+ * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
*
- * It is important for case 8 that the vma NNNN overlapping the
- * region AAAA is never going to extended over XXXX. Instead XXXX must
- * be extended in region AAAA and NNNN must be removed. This way in
+ * It is important for case 8 that the vma CCCC overlapping the
+ * region **** is never going to be extended over NNNN. Instead NNNN must
+ * be extended in region **** and CCCC must be removed. This way in
* all cases where vma_merge succeeds, the moment vma_merge drops the
* rmap_locks, the properties of the merged vma will be already
* correct for the whole merged range. Some of those properties like
* vm_page_prot/vm_flags may be accessed by rmap_walks and they must
* be correct for the whole merged range immediately after the
- * rmap_locks are released. Otherwise if XXXX would be removed and
- * NNNN would be extended over the XXXX range, remove_migration_ptes
+ * rmap_locks are released. Otherwise if NNNN would be removed and
+ * CCCC would be extended over the NNNN range, remove_migration_ptes
* or other rmap walkers (if working on addresses beyond the "end"
- * parameter) may establish ptes with the wrong permissions of NNNN
- * instead of the right permissions of XXXX.
+ * parameter) may establish ptes with the wrong permissions of CCCC
+ * instead of the right permissions of NNNN.
*
* In the code below:
* PPPP is represented by *prev
- * NNNN is represented by *mid (and possibly equal to *next)
- * XXXX is represented by *next or not represented at all.
- * AAAA is not represented - it will be merged or the function will return NULL
+ * CCCC is represented by *curr or not represented at all (NULL)
+ * NNNN is represented by *next or not represented at all (NULL)
+ * **** is not represented - it will be merged and the vma containing the
+ * area is returned, or the function will return NULL
*/
struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
@@ -891,18 +909,18 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
struct anon_vma_name *anon_name)
{
- pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
- pgoff_t vma_pgoff;
- struct vm_area_struct *mid, *next, *res = NULL;
+ struct vm_area_struct *curr, *next, *res;
struct vm_area_struct *vma, *adjust, *remove, *remove2;
- int err = -1;
+ struct vma_prepare vp;
+ pgoff_t vma_pgoff;
+ int err = 0;
bool merge_prev = false;
bool merge_next = false;
bool vma_expanded = false;
- struct vma_prepare vp;
- unsigned long vma_end = end;
- long adj_next = 0;
unsigned long vma_start = addr;
+ unsigned long vma_end = end;
+ pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
+ long adj_start = 0;
validate_mm(mm);
/*
@@ -912,94 +930,105 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
- next = find_vma(mm, prev ? prev->vm_end : 0);
- mid = next;
- if (next && next->vm_end == end) /* cases 6, 7, 8 */
- next = find_vma(mm, next->vm_end);
+ /* Does the input range span an existing VMA? (cases 5 - 8) */
+ curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
- /* verify some invariant that must be enforced by the caller */
- VM_WARN_ON(prev && addr <= prev->vm_start);
- VM_WARN_ON(mid && end > mid->vm_end);
- VM_WARN_ON(addr >= end);
+ if (!curr || /* cases 1 - 4 */
+ end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
+ next = vma_lookup(mm, end);
+ else
+ next = NULL; /* case 5 */
if (prev) {
- res = prev;
- vma = prev;
vma_start = prev->vm_start;
vma_pgoff = prev->vm_pgoff;
+
/* Can we merge the predecessor? */
- if (prev->vm_end == addr && mpol_equal(vma_policy(prev), policy)
+ if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
&& can_vma_merge_after(prev, vm_flags, anon_vma, file,
- pgoff, vm_userfaultfd_ctx, anon_name)) {
+ pgoff, vm_userfaultfd_ctx, anon_name)) {
merge_prev = true;
vma_prev(vmi);
}
}
+
/* Can we merge the successor? */
- if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen,
- vm_userfaultfd_ctx, anon_name)) {
+ if (next && mpol_equal(policy, vma_policy(next)) &&
+ can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
+ vm_userfaultfd_ctx, anon_name)) {
merge_next = true;
}
+ if (!merge_prev && !merge_next)
+ return NULL; /* Not mergeable. */
+
+ res = vma = prev;
remove = remove2 = adjust = NULL;
+
+ /* Verify some invariant that must be enforced by the caller. */
+ VM_WARN_ON(prev && addr <= prev->vm_start);
+ VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
+ VM_WARN_ON(addr >= end);
+
/* Can we merge both the predecessor and the successor? */
if (merge_prev && merge_next &&
is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
- remove = mid; /* case 1 */
+ remove = next; /* case 1 */
vma_end = next->vm_end;
- err = dup_anon_vma(res, remove);
- if (mid != next) { /* case 6 */
+ err = dup_anon_vma(prev, next);
+ if (curr) { /* case 6 */
+ remove = curr;
remove2 = next;
- if (!remove->anon_vma)
- err = dup_anon_vma(res, remove2);
+ if (!next->anon_vma)
+ err = dup_anon_vma(prev, curr);
}
- } else if (merge_prev) {
- err = 0; /* case 2 */
- if (mid && end > mid->vm_start) {
- err = dup_anon_vma(res, mid);
- if (end == mid->vm_end) { /* case 7 */
- remove = mid;
+ } else if (merge_prev) { /* case 2 */
+ if (curr) {
+ err = dup_anon_vma(prev, curr);
+ if (end == curr->vm_end) { /* case 7 */
+ remove = curr;
} else { /* case 5 */
- adjust = mid;
- adj_next = (end - mid->vm_start);
+ adjust = curr;
+ adj_start = (end - curr->vm_start);
}
}
- } else if (merge_next) {
+ } else { /* merge_next */
res = next;
if (prev && addr < prev->vm_end) { /* case 4 */
vma_end = addr;
- adjust = mid;
- adj_next = -(vma->vm_end - addr);
- err = dup_anon_vma(adjust, prev);
+ adjust = next;
+ adj_start = -(prev->vm_end - addr);
+ err = dup_anon_vma(next, prev);
} else {
+ /*
+ * Note that cases 3 and 8 are the ONLY ones where prev
+ * is permitted to be (but is not necessarily) NULL.
+ */
vma = next; /* case 3 */
vma_start = addr;
vma_end = next->vm_end;
- vma_pgoff = mid->vm_pgoff;
- err = 0;
- if (mid != next) { /* case 8 */
- remove = mid;
- err = dup_anon_vma(res, remove);
+ vma_pgoff = next->vm_pgoff;
+ if (curr) { /* case 8 */
+ vma_pgoff = curr->vm_pgoff;
+ remove = curr;
+ err = dup_anon_vma(next, curr);
}
}
}
- /* Cannot merge or error in anon_vma clone */
+ /* Error in anon_vma clone. */
if (err)
return NULL;
if (vma_iter_prealloc(vmi))
return NULL;
- vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next);
init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
vp.anon_vma != adjust->anon_vma);
vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
if (vma_start < vma->vm_start || vma_end > vma->vm_end)
vma_expanded = true;
@@ -1010,10 +1039,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
if (vma_expanded)
vma_iter_store(vmi, vma);
- if (adj_next) {
- adjust->vm_start += adj_next;
- adjust->vm_pgoff += adj_next >> PAGE_SHIFT;
- if (adj_next < 0) {
+ if (adj_start) {
+ adjust->vm_start += adj_start;
+ adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
+ if (adj_start < 0) {
WARN_ON(vma_expanded);
vma_iter_store(vmi, next);
}
@@ -2119,7 +2148,7 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
vm_stat_account(mm, vma->vm_flags, -nrpages);
- remove_vma(vma);
+ remove_vma(vma, false);
}
vm_unacct_memory(nr_accounted);
validate_mm(mm);
@@ -2142,7 +2171,8 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
update_hiwater_rss(mm);
unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked);
free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : USER_PGTABLES_CEILING);
+ next ? next->vm_start : USER_PGTABLES_CEILING,
+ mm_wr_locked);
tlb_finish_mmu(&tlb);
}
@@ -2198,10 +2228,10 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
- vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
if (new_below) {
vma->vm_start = addr;
@@ -2245,10 +2275,12 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
static inline int munmap_sidetree(struct vm_area_struct *vma,
struct ma_state *mas_detach)
{
+ vma_start_write(vma);
mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
return -ENOMEM;
+ vma_mark_detached(vma, true);
if (vma->vm_flags & VM_LOCKED)
vma->vm_mm->locked_vm -= vma_pages(vma);
@@ -2904,9 +2936,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (vma_iter_prealloc(vmi))
goto unacct_fail;
- vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
init_vma_prep(&vp, vma);
vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
vma->vm_end = addr + len;
vm_flags_set(vma, VM_SOFTDIRTY);
vma_iter_store(vmi, vma);
@@ -3039,7 +3071,7 @@ void exit_mmap(struct mm_struct *mm)
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
- USER_PGTABLES_CEILING);
+ USER_PGTABLES_CEILING, true);
tlb_finish_mmu(&tlb);
/*
@@ -3050,7 +3082,7 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma);
+ remove_vma(vma, true);
count++;
cond_resched();
} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
@@ -3173,6 +3205,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
+ vma_start_write(new_vma);
if (vma_link(mm, new_vma))
goto out_vma_link;
*need_rmap_locks = false;
@@ -3467,6 +3500,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* of mm/rmap.c:
* - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
* hugetlb mapping);
+ * - all vmas marked locked
* - all i_mmap_rwsem locks;
* - all anon_vma->rwseml
*
@@ -3492,6 +3526,13 @@ int mm_take_all_locks(struct mm_struct *mm)
mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
+ vma_start_write(vma);
+ }
+
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
+ if (signal_pending(current))
+ goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
is_vm_hugetlb_page(vma))
vm_lock_mapping(mm, vma->vm_file->f_mapping);
@@ -3578,6 +3619,7 @@ void mm_drop_all_locks(struct mm_struct *mm)
if (vma->vm_file && vma->vm_file->f_mapping)
vm_unlock_mapping(vma->vm_file->f_mapping);
}
+ vma_end_write_all(mm);
mutex_unlock(&mm_all_locks_mutex);
}
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 2b93cf6ac9ae..ea9683e12936 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -32,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return false;
- batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+ batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (!batch)
return false;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 36351a00c0e8..204194155863 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -276,7 +276,15 @@ static long change_pte_range(struct mmu_gather *tlb,
} else {
/* It must be an none page, or what else?.. */
WARN_ON_ONCE(!pte_none(oldpte));
- if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
+
+ /*
+ * Nobody plays with any none ptes besides
+ * userfaultfd when applying the protections.
+ */
+ if (likely(!uffd_wp))
+ continue;
+
+ if (userfaultfd_wp_use_markers(vma)) {
/*
* For file-backed mem, we need to be able to
* wr-protect a none pte, because even if the
@@ -320,23 +328,46 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
return 0;
}
-/* Return true if we're uffd wr-protecting file-backed memory, or false */
+/*
+ * Return true if we want to split THPs into PTE mappings in change
+ * protection procedure, false otherwise.
+ */
static inline bool
-uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
+pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
{
+ /*
+ * pte markers only reside at the pte level; if we need pte markers,
+ * we need to split. We cannot wr-protect shmem thp because file
+ * thp is handled differently when split by erasing the pmd so far.
+ */
return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
}
/*
- * If wr-protecting the range for file-backed, populate pgtable for the case
- * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc()
- * failed we treat it the same way as pgtable allocation failures during
- * page faults by kicking OOM and returning error.
+ * Return true if we want to populate pgtables in change protection
+ * procedure, false otherwise
+ */
+static inline bool
+pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags)
+{
+ /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */
+ if (!(cp_flags & MM_CP_UFFD_WP))
+ return false;
+
+ /* Populate if the userfaultfd mode requires pte markers */
+ return userfaultfd_wp_use_markers(vma);
+}
+
+/*
+ * Populate the pgtable underneath for whatever reason if requested.
+ * When {pte|pmd|...}_alloc() failed we treat it the same way as pgtable
+ * allocation failures during page faults by kicking OOM and returning
+ * error.
*/
#define change_pmd_prepare(vma, pmd, cp_flags) \
({ \
long err = 0; \
- if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
+ if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \
if (pte_alloc(vma->vm_mm, pmd)) \
err = -ENOMEM; \
} \
@@ -351,7 +382,7 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
#define change_prepare(vma, high, low, addr, cp_flags) \
({ \
long err = 0; \
- if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
+ if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \
low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
if (p == NULL) \
err = -ENOMEM; \
@@ -404,7 +435,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
- uffd_wp_protect_file(vma, cp_flags)) {
+ pgtable_split_needed(vma, cp_flags)) {
__split_huge_pmd(vma, pmd, addr, false, NULL);
/*
* For file-backed, the pmd could have been
diff --git a/mm/mremap.c b/mm/mremap.c
index 411a85682b58..b11ce6c92099 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -623,6 +623,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return -ENOMEM;
}
+ vma_start_write(vma);
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
&need_rmap_locks);
@@ -683,7 +684,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
/* Tell pfnmap has moved from this vma */
if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn_moved(vma);
+ untrack_pfn_clear(vma);
if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
/* We always clear VM_LOCKED[ONFAULT] on the old vma */
@@ -1040,23 +1041,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
* vma (expand operation itself) and possibly also with
* the next vma if it becomes adjacent to the expanded
* vma and otherwise compatible.
- *
- * However, vma_merge() can currently fail due to
- * is_mergeable_vma() check for vm_ops->close (see the
- * comment there). Yet this should not prevent vma
- * expanding, so perform a simple expand for such vma.
- * Ideally the check for close op should be only done
- * when a vma would be actually removed due to a merge.
*/
- if (!vma->vm_ops || !vma->vm_ops->close) {
- vma = vma_merge(&vmi, mm, vma, extension_start,
- extension_end, vma->vm_flags, vma->anon_vma,
- vma->vm_file, extension_pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
- } else if (vma_expand(&vmi, vma, vma->vm_start,
- addr + new_len, vma->vm_pgoff, NULL)) {
- vma = NULL;
- }
+ vma = vma_merge(&vmi, mm, vma, extension_start,
+ extension_end, vma->vm_flags, vma->anon_vma,
+ vma->vm_file, extension_pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (!vma) {
vm_unacct_memory(pages);
ret = -ENOMEM;
diff --git a/mm/nommu.c b/mm/nommu.c
index 57ba243c6a37..f670d9979a26 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -36,6 +36,7 @@
#include <linux/printk.h>
#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -198,14 +199,13 @@ unsigned long vmalloc_to_pfn(const void *addr)
}
EXPORT_SYMBOL(vmalloc_to_pfn);
-long vread(char *buf, char *addr, unsigned long count)
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
/* Don't allow overflow */
- if ((unsigned long) buf + count < count)
- count = -(unsigned long) buf;
+ if ((unsigned long) addr + count < count)
+ count = -(unsigned long) addr;
- memcpy(buf, addr, count);
- return count;
+ return copy_to_iter(addr, count, iter);
}
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 516b1aa247e8..db7943999007 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2583,46 +2583,6 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
return ret;
}
-/**
- * folio_write_one - write out a single folio and wait on I/O.
- * @folio: The folio to write.
- *
- * The folio must be locked by the caller and will be unlocked upon return.
- *
- * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
- * function returns.
- *
- * Return: %0 on success, negative error code otherwise
- */
-int folio_write_one(struct folio *folio)
-{
- struct address_space *mapping = folio->mapping;
- int ret = 0;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = folio_nr_pages(folio),
- };
-
- BUG_ON(!folio_test_locked(folio));
-
- folio_wait_writeback(folio);
-
- if (folio_clear_dirty_for_io(folio)) {
- folio_get(folio);
- ret = mapping->a_ops->writepage(&folio->page, &wbc);
- if (ret == 0)
- folio_wait_writeback(folio);
- folio_put(folio);
- } else {
- folio_unlock(folio);
- }
-
- if (!ret)
- ret = filemap_check_errors(mapping);
- return ret;
-}
-EXPORT_SYMBOL(folio_write_one);
-
/*
* For address_spaces which do not use buffers nor write back.
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7136c36c5d01..d0eb280ec7e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,9 +72,7 @@
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
-#include <linux/padata.h>
#include <linux/khugepaged.h>
-#include <linux/buffer_head.h>
#include <linux/delayacct.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -112,17 +110,6 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
-/*
- * Don't poison memory with KASAN (only for the tag-based modes).
- * During boot, all non-reserved memblock memory is exposed to page_alloc.
- * Poisoning all that memory lengthens boot time, especially on systems with
- * large amount of RAM. This flag is used to skip that poisoning.
- * This is only done for the tag-based KASAN modes, as those are able to
- * detect memory corruptions with the memory tags assigned by default.
- * All memory allocated normally after boot gets poisoned as usual.
- */
-#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
-
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -253,23 +240,6 @@ EXPORT_SYMBOL(init_on_alloc);
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);
-static bool _init_on_alloc_enabled_early __read_mostly
- = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
-static int __init early_init_on_alloc(char *buf)
-{
-
- return kstrtobool(buf, &_init_on_alloc_enabled_early);
-}
-early_param("init_on_alloc", early_init_on_alloc);
-
-static bool _init_on_free_enabled_early __read_mostly
- = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
-static int __init early_init_on_free(char *buf)
-{
- return kstrtobool(buf, &_init_on_free_enabled_early);
-}
-early_param("init_on_free", early_init_on_free);
-
/*
* A cached value of the page's pageblock's migratetype, used when the page is
* put on a pcplist. Used to avoid the pageblock migratetype lookup when
@@ -358,7 +328,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
[ZONE_MOVABLE] = 0,
};
-static char * const zone_names[MAX_NR_ZONES] = {
+char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
"DMA",
#endif
@@ -404,17 +374,6 @@ int user_min_free_kbytes = -1;
int watermark_boost_factor __read_mostly = 15000;
int watermark_scale_factor = 10;
-static unsigned long nr_kernel_pages __initdata;
-static unsigned long nr_all_pages __initdata;
-static unsigned long dma_reserve __initdata;
-
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
-static unsigned long required_kernelcore __initdata;
-static unsigned long required_kernelcore_percent __initdata;
-static unsigned long required_movablecore __initdata;
-static unsigned long required_movablecore_percent __initdata;
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
bool mirrored_kernelcore __initdata_memblock;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -430,86 +389,36 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
-bool deferred_struct_pages __meminitdata;
-
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* During boot we initialize deferred pages on-demand, as needed, but once
* page_alloc_init_late() has finished, the deferred pages are all initialized,
* and we can permanently disable that path.
*/
-static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+DEFINE_STATIC_KEY_TRUE(deferred_pages);
static inline bool deferred_pages_enabled(void)
{
return static_branch_unlikely(&deferred_pages);
}
-/* Returns true if the struct page for the pfn is initialised */
-static inline bool __meminit early_page_initialised(unsigned long pfn)
-{
- int nid = early_pfn_to_nid(pfn);
-
- if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
- return false;
-
- return true;
-}
-
/*
- * Returns true when the remaining initialisation should be deferred until
- * later in the boot cycle when it can be parallelised.
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call. This is why we have a __ref wrapper: it avoids the
+ * warning and ensures that the function body gets unloaded.
*/
-static bool __meminit
-defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
{
- static unsigned long prev_end_pfn, nr_initialised;
-
- if (early_page_ext_enabled())
- return false;
- /*
- * prev_end_pfn static that contains the end of previous zone
- * No need to protect because called very early in boot before smp_init.
- */
- if (prev_end_pfn != end_pfn) {
- prev_end_pfn = end_pfn;
- nr_initialised = 0;
- }
-
- /* Always populate low zones for address-constrained allocations */
- if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
- return false;
-
- if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
- return true;
- /*
- * We start only with one section of pages, more pages are added as
- * needed until the rest of deferred pages are initialized.
- */
- nr_initialised++;
- if ((nr_initialised > PAGES_PER_SECTION) &&
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- NODE_DATA(nid)->first_deferred_pfn = pfn;
- return true;
- }
- return false;
+ return deferred_grow_zone(zone, order);
}
#else
static inline bool deferred_pages_enabled(void)
{
return false;
}
-
-static inline bool early_page_initialised(unsigned long pfn)
-{
- return true;
-}
-
-static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
-{
- return false;
-}
-#endif
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(const struct page *page,
@@ -775,26 +684,6 @@ void free_compound_page(struct page *page)
free_the_page(page, compound_order(page));
}
-static void prep_compound_head(struct page *page, unsigned int order)
-{
- struct folio *folio = (struct folio *)page;
-
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
- set_compound_order(page, order);
- atomic_set(&folio->_entire_mapcount, -1);
- atomic_set(&folio->_nr_pages_mapped, 0);
- atomic_set(&folio->_pincount, 0);
-}
-
-static void prep_compound_tail(struct page *head, int tail_idx)
-{
- struct page *p = head + tail_idx;
-
- p->mapping = TAIL_MAPPING;
- set_compound_head(p, head);
- set_page_private(p, 0);
-}
-
void prep_compound_page(struct page *page, unsigned int order)
{
int i;
@@ -884,64 +773,6 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
-/*
- * Enable static keys related to various memory debugging and hardening options.
- * Some override others, and depend on early params that are evaluated in the
- * order of appearance. So we need to first gather the full picture of what was
- * enabled, and then make decisions.
- */
-void __init init_mem_debugging_and_hardening(void)
-{
- bool page_poisoning_requested = false;
-
-#ifdef CONFIG_PAGE_POISONING
- /*
- * Page poisoning is debug page alloc for some arches. If
- * either of those options are enabled, enable poisoning.
- */
- if (page_poisoning_enabled() ||
- (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
- debug_pagealloc_enabled())) {
- static_branch_enable(&_page_poisoning_enabled);
- page_poisoning_requested = true;
- }
-#endif
-
- if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
- page_poisoning_requested) {
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
- "will take precedence over init_on_alloc and init_on_free\n");
- _init_on_alloc_enabled_early = false;
- _init_on_free_enabled_early = false;
- }
-
- if (_init_on_alloc_enabled_early)
- static_branch_enable(&init_on_alloc);
- else
- static_branch_disable(&init_on_alloc);
-
- if (_init_on_free_enabled_early)
- static_branch_enable(&init_on_free);
- else
- static_branch_disable(&init_on_free);
-
- if (IS_ENABLED(CONFIG_KMSAN) &&
- (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
- pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (!debug_pagealloc_enabled())
- return;
-
- static_branch_enable(&_debug_pagealloc_enabled);
-
- if (!debug_guardpage_minorder())
- return;
-
- static_branch_enable(&_debug_guardpage_enabled);
-#endif
-}
-
static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
@@ -1044,6 +875,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
zone->free_area[order].nr_free--;
}
+static inline struct page *get_page_from_free_area(struct free_area *area,
+ int migratetype)
+{
+ return list_first_entry_or_null(&area->free_list[migratetype],
+ struct page, lru);
+}
+
/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
@@ -1059,7 +897,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
unsigned long higher_page_pfn;
struct page *higher_page;
- if (order >= MAX_ORDER - 2)
+ if (order >= MAX_ORDER - 1)
return false;
higher_page_pfn = buddy_pfn & pfn;
@@ -1114,7 +952,7 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
- while (order < MAX_ORDER - 1) {
+ while (order < MAX_ORDER) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
@@ -1355,13 +1193,19 @@ out:
/*
* Skip KASAN memory poisoning when either:
*
- * 1. Deferred memory initialization has not yet completed,
- * see the explanation below.
- * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON,
- * see the comment next to it.
- * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
- * see the comment next to it.
- * 4. The allocation is excluded from being checked due to sampling,
+ * 1. For generic KASAN: deferred memory initialization has not yet completed.
+ * Tag-based KASAN modes skip pages freed via deferred memory initialization
+ * using page tags instead (see below).
+ * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
+ * that error detection is disabled for accesses via the page address.
+ *
+ * Pages will have match-all tags in the following circumstances:
+ *
+ * 1. Pages are being initialized for the first time, including during deferred
+ * memory init; see the call to page_kasan_tag_reset in __init_single_page.
+ * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
+ * exception of pages unpoisoned by kasan_unpoison_vmalloc.
+ * 3. The allocation was excluded from being checked due to sampling,
* see the call to kasan_unpoison_pages.
*
* Poisoning pages during deferred memory init will greatly lengthen the
@@ -1377,10 +1221,10 @@ out:
*/
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
- return deferred_pages_enabled() ||
- (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
- PageSkipKASanPoison(page);
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ return deferred_pages_enabled();
+
+ return page_kasan_tag(page) == 0xff;
}
static void kernel_init_pages(struct page *page, int numpages)
@@ -1395,7 +1239,7 @@ static void kernel_init_pages(struct page *page, int numpages)
}
static __always_inline bool free_pages_prepare(struct page *page,
- unsigned int order, bool check_free, fpi_t fpi_flags)
+ unsigned int order, fpi_t fpi_flags)
{
int bad = 0;
bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
@@ -1433,9 +1277,11 @@ static __always_inline bool free_pages_prepare(struct page *page,
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
- if (unlikely(free_page_is_bad(page + i))) {
- bad++;
- continue;
+ if (is_check_pages_enabled()) {
+ if (unlikely(free_page_is_bad(page + i))) {
+ bad++;
+ continue;
+ }
}
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
@@ -1444,10 +1290,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
page->mapping = NULL;
if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
- if (check_free && free_page_is_bad(page))
- bad++;
- if (bad)
- return false;
+ if (is_check_pages_enabled()) {
+ if (free_page_is_bad(page))
+ bad++;
+ if (bad)
+ return false;
+ }
page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
@@ -1493,46 +1341,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
return true;
}
-#ifdef CONFIG_DEBUG_VM
-/*
- * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
- * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
- * moved from pcp lists to free lists.
- */
-static bool free_pcp_prepare(struct page *page, unsigned int order)
-{
- return free_pages_prepare(page, order, true, FPI_NONE);
-}
-
-/* return true if this page has an inappropriate state */
-static bool bulkfree_pcp_prepare(struct page *page)
-{
- if (debug_pagealloc_enabled_static())
- return free_page_is_bad(page);
- else
- return false;
-}
-#else
-/*
- * With DEBUG_VM disabled, order-0 pages being freed are checked only when
- * moving from pcp lists to free list in order to reduce overhead. With
- * debug_pagealloc enabled, they are checked also immediately when being freed
- * to the pcp lists.
- */
-static bool free_pcp_prepare(struct page *page, unsigned int order)
-{
- if (debug_pagealloc_enabled_static())
- return free_pages_prepare(page, order, true, FPI_NONE);
- else
- return free_pages_prepare(page, order, false, FPI_NONE);
-}
-
-static bool bulkfree_pcp_prepare(struct page *page)
-{
- return free_page_is_bad(page);
-}
-#endif /* CONFIG_DEBUG_VM */
-
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone.
@@ -1592,9 +1400,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
count -= nr_pages;
pcp->count -= nr_pages;
- if (bulkfree_pcp_prepare(page))
- continue;
-
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
@@ -1625,80 +1430,6 @@ static void free_one_page(struct zone *zone,
spin_unlock_irqrestore(&zone->lock, flags);
}
-static void __meminit __init_single_page(struct page *page, unsigned long pfn,
- unsigned long zone, int nid)
-{
- mm_zero_struct_page(page);
- set_page_links(page, zone, nid, pfn);
- init_page_count(page);
- page_mapcount_reset(page);
- page_cpupid_reset_last(page);
- page_kasan_tag_reset(page);
-
- INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
- /* The shift won't overflow because ZONE_NORMAL is below 4G. */
- if (!is_highmem_idx(zone))
- set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
-}
-
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __meminit init_reserved_page(unsigned long pfn)
-{
- pg_data_t *pgdat;
- int nid, zid;
-
- if (early_page_initialised(pfn))
- return;
-
- nid = early_pfn_to_nid(pfn);
- pgdat = NODE_DATA(nid);
-
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- struct zone *zone = &pgdat->node_zones[zid];
-
- if (zone_spans_pfn(zone, pfn))
- break;
- }
- __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
-}
-#else
-static inline void init_reserved_page(unsigned long pfn)
-{
-}
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-
-/*
- * Initialised pages do not have PageReserved set. This function is
- * called for each range allocated by the bootmem allocator and
- * marks the pages PageReserved. The remaining valid pages are later
- * sent to the buddy page allocator.
- */
-void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
-{
- unsigned long start_pfn = PFN_DOWN(start);
- unsigned long end_pfn = PFN_UP(end);
-
- for (; start_pfn < end_pfn; start_pfn++) {
- if (pfn_valid(start_pfn)) {
- struct page *page = pfn_to_page(start_pfn);
-
- init_reserved_page(start_pfn);
-
- /* Avoid false-positive PageTail() */
- INIT_LIST_HEAD(&page->lru);
-
- /*
- * no need for atomic set_bit because the struct
- * page is not visible yet so nobody should
- * access it yet.
- */
- __SetPageReserved(page);
- }
- }
-}
-
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
@@ -1707,7 +1438,7 @@ static void __free_pages_ok(struct page *page, unsigned int order,
unsigned long pfn = page_to_pfn(page);
struct zone *zone = page_zone(page);
- if (!free_pages_prepare(page, order, true, fpi_flags))
+ if (!free_pages_prepare(page, order, fpi_flags))
return;
/*
@@ -1754,71 +1485,7 @@ void __free_pages_core(struct page *page, unsigned int order)
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
*/
- __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
-}
-
-#ifdef CONFIG_NUMA
-
-/*
- * During memory init memblocks map pfns to nids. The search is expensive and
- * this caches recent lookups. The implementation of __early_pfn_to_nid
- * treats start/end as pfns.
- */
-struct mminit_pfnnid_cache {
- unsigned long last_start;
- unsigned long last_end;
- int last_nid;
-};
-
-static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
-
-/*
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- */
-static int __meminit __early_pfn_to_nid(unsigned long pfn,
- struct mminit_pfnnid_cache *state)
-{
- unsigned long start_pfn, end_pfn;
- int nid;
-
- if (state->last_start <= pfn && pfn < state->last_end)
- return state->last_nid;
-
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != NUMA_NO_NODE) {
- state->last_start = start_pfn;
- state->last_end = end_pfn;
- state->last_nid = nid;
- }
-
- return nid;
-}
-
-int __meminit early_pfn_to_nid(unsigned long pfn)
-{
- static DEFINE_SPINLOCK(early_pfn_lock);
- int nid;
-
- spin_lock(&early_pfn_lock);
- nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid < 0)
- nid = first_online_node;
- spin_unlock(&early_pfn_lock);
-
- return nid;
-}
-#endif /* CONFIG_NUMA */
-
-void __init memblock_free_pages(struct page *page, unsigned long pfn,
- unsigned int order)
-{
- if (!early_page_initialised(pfn))
- return;
- if (!kmsan_memblock_free_pages(page, order)) {
- /* KMSAN will take care of these pages. */
- return;
- }
- __free_pages_core(page, order);
+ __free_pages_ok(page, order, FPI_TO_TAIL);
}
/*
@@ -1891,445 +1558,6 @@ void clear_zone_contiguous(struct zone *zone)
zone->contiguous = false;
}
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(unsigned long pfn,
- unsigned long nr_pages)
-{
- struct page *page;
- unsigned long i;
-
- if (!nr_pages)
- return;
-
- page = pfn_to_page(pfn);
-
- /* Free a large naturally-aligned chunk if possible */
- if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_core(page, pageblock_order);
- return;
- }
-
- for (i = 0; i < nr_pages; i++, page++, pfn++) {
- if (pageblock_aligned(pfn))
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_core(page, 0);
- }
-}
-
-/* Completion tracking for deferred_init_memmap() threads */
-static atomic_t pgdat_init_n_undone __initdata;
-static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
-
-static inline void __init pgdat_init_report_one_done(void)
-{
- if (atomic_dec_and_test(&pgdat_init_n_undone))
- complete(&pgdat_init_all_done_comp);
-}
-
-/*
- * Returns true if page needs to be initialized or freed to buddy allocator.
- *
- * We check if a current large page is valid by only checking the validity
- * of the head pfn.
- */
-static inline bool __init deferred_pfn_valid(unsigned long pfn)
-{
- if (pageblock_aligned(pfn) && !pfn_valid(pfn))
- return false;
- return true;
-}
-
-/*
- * Free pages to buddy allocator. Try to free aligned pages in
- * pageblock_nr_pages sizes.
- */
-static void __init deferred_free_pages(unsigned long pfn,
- unsigned long end_pfn)
-{
- unsigned long nr_free = 0;
-
- for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(pfn)) {
- deferred_free_range(pfn - nr_free, nr_free);
- nr_free = 0;
- } else if (pageblock_aligned(pfn)) {
- deferred_free_range(pfn - nr_free, nr_free);
- nr_free = 1;
- } else {
- nr_free++;
- }
- }
- /* Free the last block of pages to allocator */
- deferred_free_range(pfn - nr_free, nr_free);
-}
-
-/*
- * Initialize struct pages. We minimize pfn page lookups and scheduler checks
- * by performing it only once every pageblock_nr_pages.
- * Return number of pages initialized.
- */
-static unsigned long __init deferred_init_pages(struct zone *zone,
- unsigned long pfn,
- unsigned long end_pfn)
-{
- int nid = zone_to_nid(zone);
- unsigned long nr_pages = 0;
- int zid = zone_idx(zone);
- struct page *page = NULL;
-
- for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(pfn)) {
- page = NULL;
- continue;
- } else if (!page || pageblock_aligned(pfn)) {
- page = pfn_to_page(pfn);
- } else {
- page++;
- }
- __init_single_page(page, pfn, zid, nid);
- nr_pages++;
- }
- return (nr_pages);
-}
-
-/*
- * This function is meant to pre-load the iterator for the zone init.
- * Specifically it walks through the ranges until we are caught up to the
- * first_init_pfn value and exits there. If we never encounter the value we
- * return false indicating there are no valid ranges left.
- */
-static bool __init
-deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
- unsigned long *spfn, unsigned long *epfn,
- unsigned long first_init_pfn)
-{
- u64 j;
-
- /*
- * Start out by walking through the ranges in this zone that have
- * already been initialized. We don't need to do anything with them
- * so we just need to flush them out of the system.
- */
- for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
- if (*epfn <= first_init_pfn)
- continue;
- if (*spfn < first_init_pfn)
- *spfn = first_init_pfn;
- *i = j;
- return true;
- }
-
- return false;
-}
-
-/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, then free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
- *
- * In order to try and keep some memory in the cache we have the loop
- * broken along max page order boundaries. This way we will not cause
- * any issues with the buddy page computation.
- */
-static unsigned long __init
-deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
- unsigned long *end_pfn)
-{
- unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
- unsigned long spfn = *start_pfn, epfn = *end_pfn;
- unsigned long nr_pages = 0;
- u64 j = *i;
-
- /* First we loop through and initialize the page values */
- for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
- unsigned long t;
-
- if (mo_pfn <= *start_pfn)
- break;
-
- t = min(mo_pfn, *end_pfn);
- nr_pages += deferred_init_pages(zone, *start_pfn, t);
-
- if (mo_pfn < *end_pfn) {
- *start_pfn = mo_pfn;
- break;
- }
- }
-
- /* Reset values and now loop through freeing pages as needed */
- swap(j, *i);
-
- for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
- unsigned long t;
-
- if (mo_pfn <= spfn)
- break;
-
- t = min(mo_pfn, epfn);
- deferred_free_pages(spfn, t);
-
- if (mo_pfn <= epfn)
- break;
- }
-
- return nr_pages;
-}
-
-static void __init
-deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
- void *arg)
-{
- unsigned long spfn, epfn;
- struct zone *zone = arg;
- u64 i;
-
- deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
-
- /*
- * Initialize and free pages in MAX_ORDER sized increments so that we
- * can avoid introducing any issues with the buddy allocator.
- */
- while (spfn < end_pfn) {
- deferred_init_maxorder(&i, zone, &spfn, &epfn);
- cond_resched();
- }
-}
-
-/* An arch may override for more concurrency. */
-__weak int __init
-deferred_page_init_max_threads(const struct cpumask *node_cpumask)
-{
- return 1;
-}
-
-/* Initialise remaining memory on a node */
-static int __init deferred_init_memmap(void *data)
-{
- pg_data_t *pgdat = data;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- unsigned long spfn = 0, epfn = 0;
- unsigned long first_init_pfn, flags;
- unsigned long start = jiffies;
- struct zone *zone;
- int zid, max_threads;
- u64 i;
-
- /* Bind memory initialisation thread to a local node if possible */
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(current, cpumask);
-
- pgdat_resize_lock(pgdat, &flags);
- first_init_pfn = pgdat->first_deferred_pfn;
- if (first_init_pfn == ULONG_MAX) {
- pgdat_resize_unlock(pgdat, &flags);
- pgdat_init_report_one_done();
- return 0;
- }
-
- /* Sanity check boundaries */
- BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
- BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
- pgdat->first_deferred_pfn = ULONG_MAX;
-
- /*
- * Once we unlock here, the zone cannot be grown anymore, thus if an
- * interrupt thread must allocate this early in boot, zone must be
- * pre-grown prior to start of deferred page initialization.
- */
- pgdat_resize_unlock(pgdat, &flags);
-
- /* Only the highest zone is deferred so find it */
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- zone = pgdat->node_zones + zid;
- if (first_init_pfn < zone_end_pfn(zone))
- break;
- }
-
- /* If the zone is empty somebody else may have cleared out the zone */
- if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- first_init_pfn))
- goto zone_empty;
-
- max_threads = deferred_page_init_max_threads(cpumask);
-
- while (spfn < epfn) {
- unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
- struct padata_mt_job job = {
- .thread_fn = deferred_init_memmap_chunk,
- .fn_arg = zone,
- .start = spfn,
- .size = epfn_align - spfn,
- .align = PAGES_PER_SECTION,
- .min_chunk = PAGES_PER_SECTION,
- .max_threads = max_threads,
- };
-
- padata_do_multithreaded(&job);
- deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- epfn_align);
- }
-zone_empty:
- /* Sanity check that the next zone really is unpopulated */
- WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
-
- pr_info("node %d deferred pages initialised in %ums\n",
- pgdat->node_id, jiffies_to_msecs(jiffies - start));
-
- pgdat_init_report_one_done();
- return 0;
-}
-
-/*
- * If this zone has deferred pages, try to grow it by initializing enough
- * deferred pages to satisfy the allocation specified by order, rounded up to
- * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
- * of SECTION_SIZE bytes by initializing struct pages in increments of
- * PAGES_PER_SECTION * sizeof(struct page) bytes.
- *
- * Return true when zone was grown, otherwise return false. We return true even
- * when we grow less than requested, to let the caller decide if there are
- * enough pages to satisfy the allocation.
- *
- * Note: We use noinline because this function is needed only during boot, and
- * it is called from a __ref function _deferred_grow_zone. This way we are
- * making sure that it is not inlined into permanent text section.
- */
-static noinline bool __init
-deferred_grow_zone(struct zone *zone, unsigned int order)
-{
- unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
- pg_data_t *pgdat = zone->zone_pgdat;
- unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
- unsigned long spfn, epfn, flags;
- unsigned long nr_pages = 0;
- u64 i;
-
- /* Only the last zone may have deferred pages */
- if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
- return false;
-
- pgdat_resize_lock(pgdat, &flags);
-
- /*
- * If someone grew this zone while we were waiting for spinlock, return
- * true, as there might be enough pages already.
- */
- if (first_deferred_pfn != pgdat->first_deferred_pfn) {
- pgdat_resize_unlock(pgdat, &flags);
- return true;
- }
-
- /* If the zone is empty somebody else may have cleared out the zone */
- if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- first_deferred_pfn)) {
- pgdat->first_deferred_pfn = ULONG_MAX;
- pgdat_resize_unlock(pgdat, &flags);
- /* Retry only once. */
- return first_deferred_pfn != ULONG_MAX;
- }
-
- /*
- * Initialize and free pages in MAX_ORDER sized increments so
- * that we can avoid introducing any issues with the buddy
- * allocator.
- */
- while (spfn < epfn) {
- /* update our first deferred PFN for this section */
- first_deferred_pfn = spfn;
-
- nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
- touch_nmi_watchdog();
-
- /* We should only stop along section boundaries */
- if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
- continue;
-
- /* If our quota has been met we can stop here */
- if (nr_pages >= nr_pages_needed)
- break;
- }
-
- pgdat->first_deferred_pfn = spfn;
- pgdat_resize_unlock(pgdat, &flags);
-
- return nr_pages > 0;
-}
-
-/*
- * deferred_grow_zone() is __init, but it is called from
- * get_page_from_freelist() during early boot until deferred_pages permanently
- * disables this call. This is why we have refdata wrapper to avoid warning,
- * and to ensure that the function body gets unloaded.
- */
-static bool __ref
-_deferred_grow_zone(struct zone *zone, unsigned int order)
-{
- return deferred_grow_zone(zone, order);
-}
-
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-
-void __init page_alloc_init_late(void)
-{
- struct zone *zone;
- int nid;
-
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-
- /* There will be num_node_state(N_MEMORY) threads */
- atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
- for_each_node_state(nid, N_MEMORY) {
- kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
- }
-
- /* Block until all are initialised */
- wait_for_completion(&pgdat_init_all_done_comp);
-
- /*
- * We initialized the rest of the deferred pages. Permanently disable
- * on-demand struct page initialization.
- */
- static_branch_disable(&deferred_pages);
-
- /* Reinit limits that are based on free pages after the kernel is up */
- files_maxfiles_init();
-#endif
-
- buffer_init();
-
- /* Discard memblock private memory */
- memblock_discard();
-
- for_each_node_state(nid, N_MEMORY)
- shuffle_free_memory(NODE_DATA(nid));
-
- for_each_populated_zone(zone)
- set_zone_contiguous(zone);
-}
-
-#ifdef CONFIG_CMA
-/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
-void __init init_cma_reserved_pageblock(struct page *page)
-{
- unsigned i = pageblock_nr_pages;
- struct page *p = page;
-
- do {
- __ClearPageReserved(p);
- set_page_count(p, 0);
- } while (++p, --i);
-
- set_pageblock_migratetype(page, MIGRATE_CMA);
- set_page_refcounted(page);
- __free_pages(page, pageblock_order);
-
- adjust_managed_page_count(page, pageblock_nr_pages);
- page_zone(page)->cma_pages += pageblock_nr_pages;
-}
-#endif
-
/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
@@ -2383,7 +1611,7 @@ static void check_new_page_bad(struct page *page)
/*
* This page is about to be returned from the page allocator
*/
-static inline int check_new_page(struct page *page)
+static int check_new_page(struct page *page)
{
if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
@@ -2393,56 +1621,20 @@ static inline int check_new_page(struct page *page)
return 1;
}
-static bool check_new_pages(struct page *page, unsigned int order)
+static inline bool check_new_pages(struct page *page, unsigned int order)
{
- int i;
- for (i = 0; i < (1 << order); i++) {
- struct page *p = page + i;
+ if (is_check_pages_enabled()) {
+ for (int i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
- if (unlikely(check_new_page(p)))
- return true;
+ if (unlikely(check_new_page(p)))
+ return true;
+ }
}
return false;
}
-#ifdef CONFIG_DEBUG_VM
-/*
- * With DEBUG_VM enabled, order-0 pages are checked for expected state when
- * being allocated from pcp lists. With debug_pagealloc also enabled, they are
- * also checked when pcp lists are refilled from the free lists.
- */
-static inline bool check_pcp_refill(struct page *page, unsigned int order)
-{
- if (debug_pagealloc_enabled_static())
- return check_new_pages(page, order);
- else
- return false;
-}
-
-static inline bool check_new_pcp(struct page *page, unsigned int order)
-{
- return check_new_pages(page, order);
-}
-#else
-/*
- * With DEBUG_VM disabled, free order-0 pages are checked for expected state
- * when pcp lists are being refilled from the free lists. With debug_pagealloc
- * enabled, they are also checked when being allocated from the pcp lists.
- */
-static inline bool check_pcp_refill(struct page *page, unsigned int order)
-{
- return check_new_pages(page, order);
-}
-static inline bool check_new_pcp(struct page *page, unsigned int order)
-{
- if (debug_pagealloc_enabled_static())
- return check_new_pages(page, order);
- else
- return false;
-}
-#endif /* CONFIG_DEBUG_VM */
-
static inline bool should_skip_kasan_unpoison(gfp_t flags)
{
/* Don't skip if a software KASAN mode is enabled. */
@@ -2456,9 +1648,9 @@ static inline bool should_skip_kasan_unpoison(gfp_t flags)
/*
* With hardware tag-based KASAN enabled, skip if this has been
- * requested via __GFP_SKIP_KASAN_UNPOISON.
+ * requested via __GFP_SKIP_KASAN.
*/
- return flags & __GFP_SKIP_KASAN_UNPOISON;
+ return flags & __GFP_SKIP_KASAN;
}
static inline bool should_skip_init(gfp_t flags)
@@ -2477,7 +1669,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
!should_skip_init(gfp_flags);
bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
- bool reset_tags = true;
int i;
set_page_private(page, 0);
@@ -2511,37 +1702,22 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
/* Take note that memory was initialized by the loop above. */
init = false;
}
- if (!should_skip_kasan_unpoison(gfp_flags)) {
- /* Try unpoisoning (or setting tags) and initializing memory. */
- if (kasan_unpoison_pages(page, order, init)) {
- /* Take note that memory was initialized by KASAN. */
- if (kasan_has_integrated_init())
- init = false;
- /* Take note that memory tags were set by KASAN. */
- reset_tags = false;
- } else {
- /*
- * KASAN decided to exclude this allocation from being
- * (un)poisoned due to sampling. Make KASAN skip
- * poisoning when the allocation is freed.
- */
- SetPageSkipKASanPoison(page);
- }
- }
- /*
- * If memory tags have not been set by KASAN, reset the page tags to
- * ensure page_address() dereferencing does not fault.
- */
- if (reset_tags) {
+ if (!should_skip_kasan_unpoison(gfp_flags) &&
+ kasan_unpoison_pages(page, order, init)) {
+ /* Take note that memory was initialized by KASAN. */
+ if (kasan_has_integrated_init())
+ init = false;
+ } else {
+ /*
+ * If memory tags have not been set by KASAN, reset the page
+ * tags to ensure page_address() dereferencing does not fault.
+ */
for (i = 0; i != 1 << order; ++i)
page_kasan_tag_reset(page + i);
}
/* If memory is still not initialized, initialize it now. */
if (init)
kernel_init_pages(page, 1 << order);
- /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
- if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
- SetPageSkipKASanPoison(page);
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
@@ -2580,7 +1756,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
struct page *page;
/* Find a page of the appropriate size in the preferred list */
- for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+ for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
@@ -2952,7 +2128,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
continue;
spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
struct free_area *area = &(zone->free_area[order]);
page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
@@ -3036,7 +2212,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
- for (current_order = MAX_ORDER - 1; current_order >= min_order;
+ for (current_order = MAX_ORDER; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -3062,7 +2238,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
return false;
find_smallest:
- for (current_order = order; current_order < MAX_ORDER;
+ for (current_order = order; current_order <= MAX_ORDER;
current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -3075,7 +2251,7 @@ find_smallest:
* This should not happen - we already found a suitable fallback
* when looking for the largest page.
*/
- VM_BUG_ON(current_order == MAX_ORDER);
+ VM_BUG_ON(current_order > MAX_ORDER);
do_steal:
page = get_page_from_free_area(area, fallback_mt);
@@ -3137,7 +2313,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
int migratetype, unsigned int alloc_flags)
{
unsigned long flags;
- int i, allocated = 0;
+ int i;
spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
@@ -3146,9 +2322,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
if (unlikely(page == NULL))
break;
- if (unlikely(check_pcp_refill(page, order)))
- continue;
-
/*
* Split buddy pages returned by expand() are received here in
* physical page order. The page is added to the tail of
@@ -3160,21 +2333,15 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* pages are ordered properly.
*/
list_add_tail(&page->pcp_list, list);
- allocated++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
- /*
- * i pages were removed from the buddy list even if some leak due
- * to check_pcp_refill failing so adjust NR_FREE_PAGES based
- * on i. Do not confuse with 'allocated' which is the number of
- * pages added to the pcp list.
- */
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock_irqrestore(&zone->lock, flags);
- return allocated;
+
+ return i;
}
#ifdef CONFIG_NUMA
@@ -3385,7 +2552,7 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
{
int migratetype;
- if (!free_pcp_prepare(page, order))
+ if (!free_pages_prepare(page, order, FPI_NONE))
return false;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -3791,7 +2958,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
page = list_first_entry(list, struct page, pcp_list);
list_del(&page->pcp_list);
pcp->count -= 1 << order;
- } while (check_new_pcp(page, order));
+ } while (check_new_pages(page, order));
return page;
}
@@ -4045,7 +3212,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
return true;
/* For a high-order request, check at least one suitable page is free */
- for (o = order; o < MAX_ORDER; o++) {
+ for (o = order; o <= MAX_ORDER; o++) {
struct free_area *area = &z->free_area[o];
int mt;
@@ -5565,7 +4732,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
- if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp))
+ if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
return NULL;
gfp &= gfp_allowed_mask;
@@ -5648,7 +4815,7 @@ EXPORT_SYMBOL(__get_free_pages);
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
- return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
+ return __get_free_page(gfp_mask | __GFP_ZERO);
}
EXPORT_SYMBOL(get_zeroed_page);
@@ -6079,8 +5246,6 @@ static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask
return !node_isset(nid, *nodemask);
}
-#define K(x) ((x) << (PAGE_SHIFT-10))
-
static void show_migration_types(unsigned char type)
{
static const char types[MIGRATE_TYPES] = {
@@ -6295,8 +5460,8 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i
for_each_populated_zone(zone) {
unsigned int order;
- unsigned long nr[MAX_ORDER], flags, total = 0;
- unsigned char types[MAX_ORDER];
+ unsigned long nr[MAX_ORDER + 1], flags, total = 0;
+ unsigned char types[MAX_ORDER + 1];
if (zone_idx(zone) > max_zone_idx)
continue;
@@ -6306,7 +5471,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i
printk(KERN_CONT "%s: ", zone->name);
spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
int type;
@@ -6320,7 +5485,7 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i
}
}
spin_unlock_irqrestore(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
printk(KERN_CONT "%lu*%lukB ",
nr[order], K(1UL) << order);
if (nr[order])
@@ -6625,7 +5790,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
#define BOOT_PAGESET_BATCH 1
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
-static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void __build_all_zonelists(void *data)
{
@@ -6739,366 +5903,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
#endif
}
-/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
-static bool __meminit
-overlap_memmap_init(unsigned long zone, unsigned long *pfn)
-{
- static struct memblock_region *r;
-
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
- if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_mem_region(r) {
- if (*pfn < memblock_region_memory_end_pfn(r))
- break;
- }
- }
- if (*pfn >= memblock_region_memory_base_pfn(r) &&
- memblock_is_mirror(r)) {
- *pfn = memblock_region_memory_end_pfn(r);
- return true;
- }
- }
- return false;
-}
-
-/*
- * Initially all pages are reserved - free ones are freed
- * up by memblock_free_all() once the early boot process is
- * done. Non-atomic initialization, single-pass.
- *
- * All aligned pageblocks are initialized to the specified migratetype
- * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
- * zone stats (e.g., nr_isolate_pageblock) are touched.
- */
-void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, unsigned long zone_end_pfn,
- enum meminit_context context,
- struct vmem_altmap *altmap, int migratetype)
-{
- unsigned long pfn, end_pfn = start_pfn + size;
- struct page *page;
-
- if (highest_memmap_pfn < end_pfn - 1)
- highest_memmap_pfn = end_pfn - 1;
-
-#ifdef CONFIG_ZONE_DEVICE
- /*
- * Honor reservation requested by the driver for this ZONE_DEVICE
- * memory. We limit the total number of pages to initialize to just
- * those that might contain the memory mapping. We will defer the
- * ZONE_DEVICE page initialization until after we have released
- * the hotplug lock.
- */
- if (zone == ZONE_DEVICE) {
- if (!altmap)
- return;
-
- if (start_pfn == altmap->base_pfn)
- start_pfn += altmap->reserve;
- end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
- }
-#endif
-
- for (pfn = start_pfn; pfn < end_pfn; ) {
- /*
- * There can be holes in boot-time mem_map[]s handed to this
- * function. They do not exist on hotplugged memory.
- */
- if (context == MEMINIT_EARLY) {
- if (overlap_memmap_init(zone, &pfn))
- continue;
- if (defer_init(nid, pfn, zone_end_pfn)) {
- deferred_struct_pages = true;
- break;
- }
- }
-
- page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zone, nid);
- if (context == MEMINIT_HOTPLUG)
- __SetPageReserved(page);
-
- /*
- * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
- * such that unmovable allocations won't be scattered all
- * over the place during system boot.
- */
- if (pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, migratetype);
- cond_resched();
- }
- pfn++;
- }
-}
-
-#ifdef CONFIG_ZONE_DEVICE
-static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
- unsigned long zone_idx, int nid,
- struct dev_pagemap *pgmap)
-{
-
- __init_single_page(page, pfn, zone_idx, nid);
-
- /*
- * Mark page reserved as it will need to wait for onlining
- * phase for it to be fully associated with a zone.
- *
- * We can use the non-atomic __set_bit operation for setting
- * the flag as we are still initializing the pages.
- */
- __SetPageReserved(page);
-
- /*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
- * and zone_device_data. It is a bug if a ZONE_DEVICE page is
- * ever freed or placed on a driver-private list.
- */
- page->pgmap = pgmap;
- page->zone_device_data = NULL;
-
- /*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
- * because this is done early in section_activate()
- */
- if (pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- cond_resched();
- }
-
- /*
- * ZONE_DEVICE pages are released directly to the driver page allocator
- * which will set the page count to 1 when allocating the page.
- */
- if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- pgmap->type == MEMORY_DEVICE_COHERENT)
- set_page_count(page, 0);
-}
-
-/*
- * With compound page geometry and when struct pages are stored in ram most
- * tail pages are reused. Consequently, the amount of unique struct pages to
- * initialize is a lot smaller that the total amount of struct pages being
- * mapped. This is a paired / mild layering violation with explicit knowledge
- * of how the sparse_vmemmap internals handle compound pages in the lack
- * of an altmap. See vmemmap_populate_compound_pages().
- */
-static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
- unsigned long nr_pages)
-{
- return is_power_of_2(sizeof(struct page)) &&
- !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
-}
-
-static void __ref memmap_init_compound(struct page *head,
- unsigned long head_pfn,
- unsigned long zone_idx, int nid,
- struct dev_pagemap *pgmap,
- unsigned long nr_pages)
-{
- unsigned long pfn, end_pfn = head_pfn + nr_pages;
- unsigned int order = pgmap->vmemmap_shift;
-
- __SetPageHead(head);
- for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
- struct page *page = pfn_to_page(pfn);
-
- __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
- prep_compound_tail(head, pfn - head_pfn);
- set_page_count(page, 0);
-
- /*
- * The first tail page stores important compound page info.
- * Call prep_compound_head() after the first tail page has
- * been initialized, to not have the data overwritten.
- */
- if (pfn == head_pfn + 1)
- prep_compound_head(head, order);
- }
-}
-
-void __ref memmap_init_zone_device(struct zone *zone,
- unsigned long start_pfn,
- unsigned long nr_pages,
- struct dev_pagemap *pgmap)
-{
- unsigned long pfn, end_pfn = start_pfn + nr_pages;
- struct pglist_data *pgdat = zone->zone_pgdat;
- struct vmem_altmap *altmap = pgmap_altmap(pgmap);
- unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
- unsigned long zone_idx = zone_idx(zone);
- unsigned long start = jiffies;
- int nid = pgdat->node_id;
-
- if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
- return;
-
- /*
- * The call to memmap_init should have already taken care
- * of the pages reserved for the memmap, so we can just jump to
- * the end of that region and start processing the device pages.
- */
- if (altmap) {
- start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
- nr_pages = end_pfn - start_pfn;
- }
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
- struct page *page = pfn_to_page(pfn);
-
- __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
-
- if (pfns_per_compound == 1)
- continue;
-
- memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
- compound_nr_pages(altmap, pfns_per_compound));
- }
-
- pr_info("%s initialised %lu pages in %ums\n", __func__,
- nr_pages, jiffies_to_msecs(jiffies - start));
-}
-
-#endif
-static void __meminit zone_init_free_lists(struct zone *zone)
-{
- unsigned int order, t;
- for_each_migratetype_order(order, t) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
- zone->free_area[order].nr_free = 0;
- }
-}
-
-/*
- * Only struct pages that correspond to ranges defined by memblock.memory
- * are zeroed and initialized by going through __init_single_page() during
- * memmap_init_zone_range().
- *
- * But, there could be struct pages that correspond to holes in
- * memblock.memory. This can happen because of the following reasons:
- * - physical memory bank size is not necessarily the exact multiple of the
- * arbitrary section size
- * - early reserved memory may not be listed in memblock.memory
- * - memory layouts defined with memmap= kernel parameter may not align
- * nicely with memmap sections
- *
- * Explicitly initialize those struct pages so that:
- * - PG_Reserved is set
- * - zone and node links point to zone and node that span the page if the
- * hole is in the middle of a zone
- * - zone and node links point to adjacent zone/node if the hole falls on
- * the zone boundary; the pages in such holes will be prepended to the
- * zone/node above the hole except for the trailing pages in the last
- * section that will be appended to the zone/node below.
- */
-static void __init init_unavailable_range(unsigned long spfn,
- unsigned long epfn,
- int zone, int node)
-{
- unsigned long pfn;
- u64 pgcnt = 0;
-
- for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(pageblock_start_pfn(pfn))) {
- pfn = pageblock_end_pfn(pfn) - 1;
- continue;
- }
- __init_single_page(pfn_to_page(pfn), pfn, zone, node);
- __SetPageReserved(pfn_to_page(pfn));
- pgcnt++;
- }
-
- if (pgcnt)
- pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
- node, zone_names[zone], pgcnt);
-}
-
-static void __init memmap_init_zone_range(struct zone *zone,
- unsigned long start_pfn,
- unsigned long end_pfn,
- unsigned long *hole_pfn)
-{
- unsigned long zone_start_pfn = zone->zone_start_pfn;
- unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
- int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
-
- start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
- end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
-
- if (start_pfn >= end_pfn)
- return;
-
- memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
- zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
-
- if (*hole_pfn < start_pfn)
- init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
-
- *hole_pfn = end_pfn;
-}
-
-static void __init memmap_init(void)
-{
- unsigned long start_pfn, end_pfn;
- unsigned long hole_pfn = 0;
- int i, j, zone_id = 0, nid;
-
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
- struct pglist_data *node = NODE_DATA(nid);
-
- for (j = 0; j < MAX_NR_ZONES; j++) {
- struct zone *zone = node->node_zones + j;
-
- if (!populated_zone(zone))
- continue;
-
- memmap_init_zone_range(zone, start_pfn, end_pfn,
- &hole_pfn);
- zone_id = j;
- }
- }
-
-#ifdef CONFIG_SPARSEMEM
- /*
- * Initialize the memory map for hole in the range [memory_end,
- * section_end].
- * Append the pages in this hole to the highest zone in the last
- * node.
- * The call to init_unavailable_range() is outside the ifdef to
- * silence the compiler warining about zone_id set but not used;
- * for FLATMEM it is a nop anyway
- */
- end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
- if (hole_pfn < end_pfn)
-#endif
- init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
-}
-
-void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
- phys_addr_t min_addr, int nid, bool exact_nid)
-{
- void *ptr;
-
- if (exact_nid)
- ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
- MEMBLOCK_ALLOC_ACCESSIBLE,
- nid);
- else
- ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
- MEMBLOCK_ALLOC_ACCESSIBLE,
- nid);
-
- if (ptr && size > 0)
- page_init_poison(ptr, size);
-
- return ptr;
-}
-
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
@@ -7337,7 +6141,7 @@ void __init setup_per_cpu_pageset(void)
alloc_percpu(struct per_cpu_nodestat);
}
-static __meminit void zone_pcp_init(struct zone *zone)
+__meminit void zone_pcp_init(struct zone *zone)
{
/*
* per cpu subsystem is not up at this point. The following code
@@ -7354,1148 +6158,6 @@ static __meminit void zone_pcp_init(struct zone *zone)
zone->present_pages, zone_batchsize(zone));
}
-void __meminit init_currently_empty_zone(struct zone *zone,
- unsigned long zone_start_pfn,
- unsigned long size)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- int zone_idx = zone_idx(zone) + 1;
-
- if (zone_idx > pgdat->nr_zones)
- pgdat->nr_zones = zone_idx;
-
- zone->zone_start_pfn = zone_start_pfn;
-
- mminit_dprintk(MMINIT_TRACE, "memmap_init",
- "Initialising map node %d zone %lu pfns %lu -> %lu\n",
- pgdat->node_id,
- (unsigned long)zone_idx(zone),
- zone_start_pfn, (zone_start_pfn + size));
-
- zone_init_free_lists(zone);
- zone->initialized = 1;
-}
-
-/**
- * get_pfn_range_for_nid - Return the start and end page frames for a node
- * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
- * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
- * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
- *
- * It returns the start and end page frame of a node based on information
- * provided by memblock_set_node(). If called for a node
- * with no available memory, a warning is printed and the start and end
- * PFNs will be 0.
- */
-void __init get_pfn_range_for_nid(unsigned int nid,
- unsigned long *start_pfn, unsigned long *end_pfn)
-{
- unsigned long this_start_pfn, this_end_pfn;
- int i;
-
- *start_pfn = -1UL;
- *end_pfn = 0;
-
- for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
- *start_pfn = min(*start_pfn, this_start_pfn);
- *end_pfn = max(*end_pfn, this_end_pfn);
- }
-
- if (*start_pfn == -1UL)
- *start_pfn = 0;
-}
-
-/*
- * This finds a zone that can be used for ZONE_MOVABLE pages. The
- * assumption is made that zones within a node are ordered in monotonic
- * increasing memory addresses so that the "highest" populated zone is used
- */
-static void __init find_usable_zone_for_movable(void)
-{
- int zone_index;
- for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
- if (zone_index == ZONE_MOVABLE)
- continue;
-
- if (arch_zone_highest_possible_pfn[zone_index] >
- arch_zone_lowest_possible_pfn[zone_index])
- break;
- }
-
- VM_BUG_ON(zone_index == -1);
- movable_zone = zone_index;
-}
-
-/*
- * The zone ranges provided by the architecture do not include ZONE_MOVABLE
- * because it is sized independent of architecture. Unlike the other zones,
- * the starting point for ZONE_MOVABLE is not fixed. It may be different
- * in each node depending on the size of each node and how evenly kernelcore
- * is distributed. This helper function adjusts the zone ranges
- * provided by the architecture for a given node by using the end of the
- * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
- * zones within a node are in order of monotonic increases memory addresses
- */
-static void __init adjust_zone_range_for_zone_movable(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zone_start_pfn,
- unsigned long *zone_end_pfn)
-{
- /* Only adjust if ZONE_MOVABLE is on this node */
- if (zone_movable_pfn[nid]) {
- /* Size ZONE_MOVABLE */
- if (zone_type == ZONE_MOVABLE) {
- *zone_start_pfn = zone_movable_pfn[nid];
- *zone_end_pfn = min(node_end_pfn,
- arch_zone_highest_possible_pfn[movable_zone]);
-
- /* Adjust for ZONE_MOVABLE starting within this range */
- } else if (!mirrored_kernelcore &&
- *zone_start_pfn < zone_movable_pfn[nid] &&
- *zone_end_pfn > zone_movable_pfn[nid]) {
- *zone_end_pfn = zone_movable_pfn[nid];
-
- /* Check if this whole range is within ZONE_MOVABLE */
- } else if (*zone_start_pfn >= zone_movable_pfn[nid])
- *zone_start_pfn = *zone_end_pfn;
- }
-}
-
-/*
- * Return the number of pages a zone spans in a node, including holes
- * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
- */
-static unsigned long __init zone_spanned_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zone_start_pfn,
- unsigned long *zone_end_pfn)
-{
- unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
- unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- /* When hotadd a new node from cpu_up(), the node should be empty */
- if (!node_start_pfn && !node_end_pfn)
- return 0;
-
- /* Get the start and end of the zone */
- *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
- *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
- adjust_zone_range_for_zone_movable(nid, zone_type,
- node_start_pfn, node_end_pfn,
- zone_start_pfn, zone_end_pfn);
-
- /* Check that this node has pages within the zone's required range */
- if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
- return 0;
-
- /* Move the zone boundaries inside the node if necessary */
- *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
- *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
-
- /* Return the spanned pages */
- return *zone_end_pfn - *zone_start_pfn;
-}
-
-/*
- * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
- * then all holes in the requested range will be accounted for.
- */
-unsigned long __init __absent_pages_in_range(int nid,
- unsigned long range_start_pfn,
- unsigned long range_end_pfn)
-{
- unsigned long nr_absent = range_end_pfn - range_start_pfn;
- unsigned long start_pfn, end_pfn;
- int i;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
- end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
- nr_absent -= end_pfn - start_pfn;
- }
- return nr_absent;
-}
-
-/**
- * absent_pages_in_range - Return number of page frames in holes within a range
- * @start_pfn: The start PFN to start searching for holes
- * @end_pfn: The end PFN to stop searching for holes
- *
- * Return: the number of pages frames in memory holes within a range.
- */
-unsigned long __init absent_pages_in_range(unsigned long start_pfn,
- unsigned long end_pfn)
-{
- return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
-}
-
-/* Return the number of page frames in holes in a zone on a node */
-static unsigned long __init zone_absent_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn)
-{
- unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
- unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- unsigned long zone_start_pfn, zone_end_pfn;
- unsigned long nr_absent;
-
- /* When hotadd a new node from cpu_up(), the node should be empty */
- if (!node_start_pfn && !node_end_pfn)
- return 0;
-
- zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
- zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
-
- adjust_zone_range_for_zone_movable(nid, zone_type,
- node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
- nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
-
- /*
- * ZONE_MOVABLE handling.
- * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
- * and vice versa.
- */
- if (mirrored_kernelcore && zone_movable_pfn[nid]) {
- unsigned long start_pfn, end_pfn;
- struct memblock_region *r;
-
- for_each_mem_region(r) {
- start_pfn = clamp(memblock_region_memory_base_pfn(r),
- zone_start_pfn, zone_end_pfn);
- end_pfn = clamp(memblock_region_memory_end_pfn(r),
- zone_start_pfn, zone_end_pfn);
-
- if (zone_type == ZONE_MOVABLE &&
- memblock_is_mirror(r))
- nr_absent += end_pfn - start_pfn;
-
- if (zone_type == ZONE_NORMAL &&
- !memblock_is_mirror(r))
- nr_absent += end_pfn - start_pfn;
- }
- }
-
- return nr_absent;
-}
-
-static void __init calculate_node_totalpages(struct pglist_data *pgdat,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn)
-{
- unsigned long realtotalpages = 0, totalpages = 0;
- enum zone_type i;
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *zone = pgdat->node_zones + i;
- unsigned long zone_start_pfn, zone_end_pfn;
- unsigned long spanned, absent;
- unsigned long size, real_size;
-
- spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn,
- &zone_start_pfn,
- &zone_end_pfn);
- absent = zone_absent_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn);
-
- size = spanned;
- real_size = size - absent;
-
- if (size)
- zone->zone_start_pfn = zone_start_pfn;
- else
- zone->zone_start_pfn = 0;
- zone->spanned_pages = size;
- zone->present_pages = real_size;
-#if defined(CONFIG_MEMORY_HOTPLUG)
- zone->present_early_pages = real_size;
-#endif
-
- totalpages += size;
- realtotalpages += real_size;
- }
-
- pgdat->node_spanned_pages = totalpages;
- pgdat->node_present_pages = realtotalpages;
- pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
-}
-
-#ifndef CONFIG_SPARSEMEM
-/*
- * Calculate the size of the zone->blockflags rounded to an unsigned long
- * Start by making sure zonesize is a multiple of pageblock_order by rounding
- * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
- * round what is now in bits to nearest long in bits, then return it in
- * bytes.
- */
-static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
-{
- unsigned long usemapsize;
-
- zonesize += zone_start_pfn & (pageblock_nr_pages-1);
- usemapsize = roundup(zonesize, pageblock_nr_pages);
- usemapsize = usemapsize >> pageblock_order;
- usemapsize *= NR_PAGEBLOCK_BITS;
- usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
-
- return usemapsize / 8;
-}
-
-static void __ref setup_usemap(struct zone *zone)
-{
- unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
- zone->spanned_pages);
- zone->pageblock_flags = NULL;
- if (usemapsize) {
- zone->pageblock_flags =
- memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
- zone_to_nid(zone));
- if (!zone->pageblock_flags)
- panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
- usemapsize, zone->name, zone_to_nid(zone));
- }
-}
-#else
-static inline void setup_usemap(struct zone *zone) {}
-#endif /* CONFIG_SPARSEMEM */
-
-#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
-
-/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __init set_pageblock_order(void)
-{
- unsigned int order = MAX_ORDER - 1;
-
- /* Check that pageblock_nr_pages has not already been setup */
- if (pageblock_order)
- return;
-
- /* Don't let pageblocks exceed the maximum allocation granularity. */
- if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
- order = HUGETLB_PAGE_ORDER;
-
- /*
- * Assume the largest contiguous order of interest is a huge page.
- * This value may be variable depending on boot parameters on IA64 and
- * powerpc.
- */
- pageblock_order = order;
-}
-#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-
-/*
- * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
- * is unused as pageblock_order is set at compile-time. See
- * include/linux/pageblock-flags.h for the values of pageblock_order based on
- * the kernel config
- */
-void __init set_pageblock_order(void)
-{
-}
-
-#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-
-static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
- unsigned long present_pages)
-{
- unsigned long pages = spanned_pages;
-
- /*
- * Provide a more accurate estimation if there are holes within
- * the zone and SPARSEMEM is in use. If there are holes within the
- * zone, each populated memory region may cost us one or two extra
- * memmap pages due to alignment because memmap pages for each
- * populated regions may not be naturally aligned on page boundary.
- * So the (present_pages >> 4) heuristic is a tradeoff for that.
- */
- if (spanned_pages > present_pages + (present_pages >> 4) &&
- IS_ENABLED(CONFIG_SPARSEMEM))
- pages = present_pages;
-
- return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void pgdat_init_split_queue(struct pglist_data *pgdat)
-{
- struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
-
- spin_lock_init(&ds_queue->split_queue_lock);
- INIT_LIST_HEAD(&ds_queue->split_queue);
- ds_queue->split_queue_len = 0;
-}
-#else
-static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
-#endif
-
-#ifdef CONFIG_COMPACTION
-static void pgdat_init_kcompactd(struct pglist_data *pgdat)
-{
- init_waitqueue_head(&pgdat->kcompactd_wait);
-}
-#else
-static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
-#endif
-
-static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
-{
- int i;
-
- pgdat_resize_init(pgdat);
- pgdat_kswapd_lock_init(pgdat);
-
- pgdat_init_split_queue(pgdat);
- pgdat_init_kcompactd(pgdat);
-
- init_waitqueue_head(&pgdat->kswapd_wait);
- init_waitqueue_head(&pgdat->pfmemalloc_wait);
-
- for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
- init_waitqueue_head(&pgdat->reclaim_wait[i]);
-
- pgdat_page_ext_init(pgdat);
- lruvec_init(&pgdat->__lruvec);
-}
-
-static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
- unsigned long remaining_pages)
-{
- atomic_long_set(&zone->managed_pages, remaining_pages);
- zone_set_nid(zone, nid);
- zone->name = zone_names[idx];
- zone->zone_pgdat = NODE_DATA(nid);
- spin_lock_init(&zone->lock);
- zone_seqlock_init(zone);
- zone_pcp_init(zone);
-}
-
-/*
- * Set up the zone data structures
- * - init pgdat internals
- * - init all zones belonging to this node
- *
- * NOTE: this function is only called during memory hotplug
- */
-#ifdef CONFIG_MEMORY_HOTPLUG
-void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
-{
- int nid = pgdat->node_id;
- enum zone_type z;
- int cpu;
-
- pgdat_init_internals(pgdat);
-
- if (pgdat->per_cpu_nodestats == &boot_nodestats)
- pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
-
- /*
- * Reset the nr_zones, order and highest_zoneidx before reuse.
- * Note that kswapd will init kswapd_highest_zoneidx properly
- * when it starts in the near future.
- */
- pgdat->nr_zones = 0;
- pgdat->kswapd_order = 0;
- pgdat->kswapd_highest_zoneidx = 0;
- pgdat->node_start_pfn = 0;
- for_each_online_cpu(cpu) {
- struct per_cpu_nodestat *p;
-
- p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
- memset(p, 0, sizeof(*p));
- }
-
- for (z = 0; z < MAX_NR_ZONES; z++)
- zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
-}
-#endif
-
-/*
- * Set up the zone data structures:
- * - mark all pages reserved
- * - mark all memory queues empty
- * - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- * NOTE: this function is only called during early init.
- */
-static void __init free_area_init_core(struct pglist_data *pgdat)
-{
- enum zone_type j;
- int nid = pgdat->node_id;
-
- pgdat_init_internals(pgdat);
- pgdat->per_cpu_nodestats = &boot_nodestats;
-
- for (j = 0; j < MAX_NR_ZONES; j++) {
- struct zone *zone = pgdat->node_zones + j;
- unsigned long size, freesize, memmap_pages;
-
- size = zone->spanned_pages;
- freesize = zone->present_pages;
-
- /*
- * Adjust freesize so that it accounts for how much memory
- * is used by this zone for memmap. This affects the watermark
- * and per-cpu initialisations
- */
- memmap_pages = calc_memmap_size(size, freesize);
- if (!is_highmem_idx(j)) {
- if (freesize >= memmap_pages) {
- freesize -= memmap_pages;
- if (memmap_pages)
- pr_debug(" %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
- } else
- pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
- zone_names[j], memmap_pages, freesize);
- }
-
- /* Account for reserved pages */
- if (j == 0 && freesize > dma_reserve) {
- freesize -= dma_reserve;
- pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
- }
-
- if (!is_highmem_idx(j))
- nr_kernel_pages += freesize;
- /* Charge for highmem memmap if there are enough kernel pages */
- else if (nr_kernel_pages > memmap_pages * 2)
- nr_kernel_pages -= memmap_pages;
- nr_all_pages += freesize;
-
- /*
- * Set an approximate value for lowmem here, it will be adjusted
- * when the bootmem allocator frees pages into the buddy system.
- * And all highmem pages will be managed by the buddy system.
- */
- zone_init_internals(zone, j, nid, freesize);
-
- if (!size)
- continue;
-
- set_pageblock_order();
- setup_usemap(zone);
- init_currently_empty_zone(zone, zone->zone_start_pfn, size);
- }
-}
-
-#ifdef CONFIG_FLATMEM
-static void __init alloc_node_mem_map(struct pglist_data *pgdat)
-{
- unsigned long __maybe_unused start = 0;
- unsigned long __maybe_unused offset = 0;
-
- /* Skip empty nodes */
- if (!pgdat->node_spanned_pages)
- return;
-
- start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
- offset = pgdat->node_start_pfn - start;
- /* ia64 gets its own node_mem_map, before this, without bootmem */
- if (!pgdat->node_mem_map) {
- unsigned long size, end;
- struct page *map;
-
- /*
- * The zone's endpoints aren't required to be MAX_ORDER
- * aligned but the node_mem_map endpoints must be in order
- * for the buddy allocator to function correctly.
- */
- end = pgdat_end_pfn(pgdat);
- end = ALIGN(end, MAX_ORDER_NR_PAGES);
- size = (end - start) * sizeof(struct page);
- map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
- pgdat->node_id, false);
- if (!map)
- panic("Failed to allocate %ld bytes for node %d memory map\n",
- size, pgdat->node_id);
- pgdat->node_mem_map = map + offset;
- }
- pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
- __func__, pgdat->node_id, (unsigned long)pgdat,
- (unsigned long)pgdat->node_mem_map);
-#ifndef CONFIG_NUMA
- /*
- * With no DISCONTIG, the global mem_map is just set as node 0's
- */
- if (pgdat == NODE_DATA(0)) {
- mem_map = NODE_DATA(0)->node_mem_map;
- if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= offset;
- }
-#endif
-}
-#else
-static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
-#endif /* CONFIG_FLATMEM */
-
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
-{
- pgdat->first_deferred_pfn = ULONG_MAX;
-}
-#else
-static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
-#endif
-
-static void __init free_area_init_node(int nid)
-{
- pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long start_pfn = 0;
- unsigned long end_pfn = 0;
-
- /* pg_data_t should be reset to zero when it's allocated */
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
-
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
-
- pgdat->node_id = nid;
- pgdat->node_start_pfn = start_pfn;
- pgdat->per_cpu_nodestats = NULL;
-
- if (start_pfn != end_pfn) {
- pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
- (u64)start_pfn << PAGE_SHIFT,
- end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
- } else {
- pr_info("Initmem setup node %d as memoryless\n", nid);
- }
-
- calculate_node_totalpages(pgdat, start_pfn, end_pfn);
-
- alloc_node_mem_map(pgdat);
- pgdat_set_deferred_range(pgdat);
-
- free_area_init_core(pgdat);
- lru_gen_init_pgdat(pgdat);
-}
-
-static void __init free_area_init_memoryless_node(int nid)
-{
- free_area_init_node(nid);
-}
-
-#if MAX_NUMNODES > 1
-/*
- * Figure out the number of possible node ids.
- */
-void __init setup_nr_node_ids(void)
-{
- unsigned int highest;
-
- highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
- nr_node_ids = highest + 1;
-}
-#endif
-
-/**
- * node_map_pfn_alignment - determine the maximum internode alignment
- *
- * This function should be called after node map is populated and sorted.
- * It calculates the maximum power of two alignment which can distinguish
- * all the nodes.
- *
- * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
- * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
- * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
- * shifted, 1GiB is enough and this function will indicate so.
- *
- * This is used to test whether pfn -> nid mapping of the chosen memory
- * model has fine enough granularity to avoid incorrect mapping for the
- * populated node map.
- *
- * Return: the determined alignment in pfn's. 0 if there is no alignment
- * requirement (single node).
- */
-unsigned long __init node_map_pfn_alignment(void)
-{
- unsigned long accl_mask = 0, last_end = 0;
- unsigned long start, end, mask;
- int last_nid = NUMA_NO_NODE;
- int i, nid;
-
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
- if (!start || last_nid < 0 || last_nid == nid) {
- last_nid = nid;
- last_end = end;
- continue;
- }
-
- /*
- * Start with a mask granular enough to pin-point to the
- * start pfn and tick off bits one-by-one until it becomes
- * too coarse to separate the current node from the last.
- */
- mask = ~((1 << __ffs(start)) - 1);
- while (mask && last_end <= (start & (mask << 1)))
- mask <<= 1;
-
- /* accumulate all internode masks */
- accl_mask |= mask;
- }
-
- /* convert mask to number of pages */
- return ~accl_mask + 1;
-}
-
-/*
- * early_calculate_totalpages()
- * Sum pages in active regions for movable zone.
- * Populate N_MEMORY for calculating usable_nodes.
- */
-static unsigned long __init early_calculate_totalpages(void)
-{
- unsigned long totalpages = 0;
- unsigned long start_pfn, end_pfn;
- int i, nid;
-
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
- unsigned long pages = end_pfn - start_pfn;
-
- totalpages += pages;
- if (pages)
- node_set_state(nid, N_MEMORY);
- }
- return totalpages;
-}
-
-/*
- * Find the PFN the Movable zone begins in each node. Kernel memory
- * is spread evenly between nodes as long as the nodes have enough
- * memory. When they don't, some nodes will have more kernelcore than
- * others
- */
-static void __init find_zone_movable_pfns_for_nodes(void)
-{
- int i, nid;
- unsigned long usable_startpfn;
- unsigned long kernelcore_node, kernelcore_remaining;
- /* save the state before borrow the nodemask */
- nodemask_t saved_node_state = node_states[N_MEMORY];
- unsigned long totalpages = early_calculate_totalpages();
- int usable_nodes = nodes_weight(node_states[N_MEMORY]);
- struct memblock_region *r;
-
- /* Need to find movable_zone earlier when movable_node is specified. */
- find_usable_zone_for_movable();
-
- /*
- * If movable_node is specified, ignore kernelcore and movablecore
- * options.
- */
- if (movable_node_is_enabled()) {
- for_each_mem_region(r) {
- if (!memblock_is_hotpluggable(r))
- continue;
-
- nid = memblock_get_region_node(r);
-
- usable_startpfn = PFN_DOWN(r->base);
- zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
- min(usable_startpfn, zone_movable_pfn[nid]) :
- usable_startpfn;
- }
-
- goto out2;
- }
-
- /*
- * If kernelcore=mirror is specified, ignore movablecore option
- */
- if (mirrored_kernelcore) {
- bool mem_below_4gb_not_mirrored = false;
-
- for_each_mem_region(r) {
- if (memblock_is_mirror(r))
- continue;
-
- nid = memblock_get_region_node(r);
-
- usable_startpfn = memblock_region_memory_base_pfn(r);
-
- if (usable_startpfn < PHYS_PFN(SZ_4G)) {
- mem_below_4gb_not_mirrored = true;
- continue;
- }
-
- zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
- min(usable_startpfn, zone_movable_pfn[nid]) :
- usable_startpfn;
- }
-
- if (mem_below_4gb_not_mirrored)
- pr_warn("This configuration results in unmirrored kernel memory.\n");
-
- goto out2;
- }
-
- /*
- * If kernelcore=nn% or movablecore=nn% was specified, calculate the
- * amount of necessary memory.
- */
- if (required_kernelcore_percent)
- required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
- 10000UL;
- if (required_movablecore_percent)
- required_movablecore = (totalpages * 100 * required_movablecore_percent) /
- 10000UL;
-
- /*
- * If movablecore= was specified, calculate what size of
- * kernelcore that corresponds so that memory usable for
- * any allocation type is evenly spread. If both kernelcore
- * and movablecore are specified, then the value of kernelcore
- * will be used for required_kernelcore if it's greater than
- * what movablecore would have allowed.
- */
- if (required_movablecore) {
- unsigned long corepages;
-
- /*
- * Round-up so that ZONE_MOVABLE is at least as large as what
- * was requested by the user
- */
- required_movablecore =
- roundup(required_movablecore, MAX_ORDER_NR_PAGES);
- required_movablecore = min(totalpages, required_movablecore);
- corepages = totalpages - required_movablecore;
-
- required_kernelcore = max(required_kernelcore, corepages);
- }
-
- /*
- * If kernelcore was not specified or kernelcore size is larger
- * than totalpages, there is no ZONE_MOVABLE.
- */
- if (!required_kernelcore || required_kernelcore >= totalpages)
- goto out;
-
- /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
- usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
-
-restart:
- /* Spread kernelcore memory as evenly as possible throughout nodes */
- kernelcore_node = required_kernelcore / usable_nodes;
- for_each_node_state(nid, N_MEMORY) {
- unsigned long start_pfn, end_pfn;
-
- /*
- * Recalculate kernelcore_node if the division per node
- * now exceeds what is necessary to satisfy the requested
- * amount of memory for the kernel
- */
- if (required_kernelcore < kernelcore_node)
- kernelcore_node = required_kernelcore / usable_nodes;
-
- /*
- * As the map is walked, we track how much memory is usable
- * by the kernel using kernelcore_remaining. When it is
- * 0, the rest of the node is usable by ZONE_MOVABLE
- */
- kernelcore_remaining = kernelcore_node;
-
- /* Go through each range of PFNs within this node */
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- unsigned long size_pages;
-
- start_pfn = max(start_pfn, zone_movable_pfn[nid]);
- if (start_pfn >= end_pfn)
- continue;
-
- /* Account for what is only usable for kernelcore */
- if (start_pfn < usable_startpfn) {
- unsigned long kernel_pages;
- kernel_pages = min(end_pfn, usable_startpfn)
- - start_pfn;
-
- kernelcore_remaining -= min(kernel_pages,
- kernelcore_remaining);
- required_kernelcore -= min(kernel_pages,
- required_kernelcore);
-
- /* Continue if range is now fully accounted */
- if (end_pfn <= usable_startpfn) {
-
- /*
- * Push zone_movable_pfn to the end so
- * that if we have to rebalance
- * kernelcore across nodes, we will
- * not double account here
- */
- zone_movable_pfn[nid] = end_pfn;
- continue;
- }
- start_pfn = usable_startpfn;
- }
-
- /*
- * The usable PFN range for ZONE_MOVABLE is from
- * start_pfn->end_pfn. Calculate size_pages as the
- * number of pages used as kernelcore
- */
- size_pages = end_pfn - start_pfn;
- if (size_pages > kernelcore_remaining)
- size_pages = kernelcore_remaining;
- zone_movable_pfn[nid] = start_pfn + size_pages;
-
- /*
- * Some kernelcore has been met, update counts and
- * break if the kernelcore for this node has been
- * satisfied
- */
- required_kernelcore -= min(required_kernelcore,
- size_pages);
- kernelcore_remaining -= size_pages;
- if (!kernelcore_remaining)
- break;
- }
- }
-
- /*
- * If there is still required_kernelcore, we do another pass with one
- * less node in the count. This will push zone_movable_pfn[nid] further
- * along on the nodes that still have memory until kernelcore is
- * satisfied
- */
- usable_nodes--;
- if (usable_nodes && required_kernelcore > usable_nodes)
- goto restart;
-
-out2:
- /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
- unsigned long start_pfn, end_pfn;
-
- zone_movable_pfn[nid] =
- roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
-
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
- if (zone_movable_pfn[nid] >= end_pfn)
- zone_movable_pfn[nid] = 0;
- }
-
-out:
- /* restore the node_state */
- node_states[N_MEMORY] = saved_node_state;
-}
-
-/* Any regular or high memory on that node ? */
-static void check_for_memory(pg_data_t *pgdat, int nid)
-{
- enum zone_type zone_type;
-
- for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
- struct zone *zone = &pgdat->node_zones[zone_type];
- if (populated_zone(zone)) {
- if (IS_ENABLED(CONFIG_HIGHMEM))
- node_set_state(nid, N_HIGH_MEMORY);
- if (zone_type <= ZONE_NORMAL)
- node_set_state(nid, N_NORMAL_MEMORY);
- break;
- }
- }
-}
-
-/*
- * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
- * such cases we allow max_zone_pfn sorted in the descending order
- */
-bool __weak arch_has_descending_max_zone_pfns(void)
-{
- return false;
-}
-
-/**
- * free_area_init - Initialise all pg_data_t and zone data
- * @max_zone_pfn: an array of max PFNs for each zone
- *
- * This will call free_area_init_node() for each active node in the system.
- * Using the page ranges provided by memblock_set_node(), the size of each
- * zone in each node and their holes is calculated. If the maximum PFN
- * between two adjacent zones match, it is assumed that the zone is empty.
- * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
- * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
- * starts where the previous one ended. For example, ZONE_DMA32 starts
- * at arch_max_dma_pfn.
- */
-void __init free_area_init(unsigned long *max_zone_pfn)
-{
- unsigned long start_pfn, end_pfn;
- int i, nid, zone;
- bool descending;
-
- /* Record where the zone boundaries are */
- memset(arch_zone_lowest_possible_pfn, 0,
- sizeof(arch_zone_lowest_possible_pfn));
- memset(arch_zone_highest_possible_pfn, 0,
- sizeof(arch_zone_highest_possible_pfn));
-
- start_pfn = PHYS_PFN(memblock_start_of_DRAM());
- descending = arch_has_descending_max_zone_pfns();
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (descending)
- zone = MAX_NR_ZONES - i - 1;
- else
- zone = i;
-
- if (zone == ZONE_MOVABLE)
- continue;
-
- end_pfn = max(max_zone_pfn[zone], start_pfn);
- arch_zone_lowest_possible_pfn[zone] = start_pfn;
- arch_zone_highest_possible_pfn[zone] = end_pfn;
-
- start_pfn = end_pfn;
- }
-
- /* Find the PFNs that ZONE_MOVABLE begins at in each node */
- memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
- find_zone_movable_pfns_for_nodes();
-
- /* Print out the zone ranges */
- pr_info("Zone ranges:\n");
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (i == ZONE_MOVABLE)
- continue;
- pr_info(" %-8s ", zone_names[i]);
- if (arch_zone_lowest_possible_pfn[i] ==
- arch_zone_highest_possible_pfn[i])
- pr_cont("empty\n");
- else
- pr_cont("[mem %#018Lx-%#018Lx]\n",
- (u64)arch_zone_lowest_possible_pfn[i]
- << PAGE_SHIFT,
- ((u64)arch_zone_highest_possible_pfn[i]
- << PAGE_SHIFT) - 1);
- }
-
- /* Print out the PFNs ZONE_MOVABLE begins at in each node */
- pr_info("Movable zone start for each node\n");
- for (i = 0; i < MAX_NUMNODES; i++) {
- if (zone_movable_pfn[i])
- pr_info(" Node %d: %#018Lx\n", i,
- (u64)zone_movable_pfn[i] << PAGE_SHIFT);
- }
-
- /*
- * Print out the early node map, and initialize the
- * subsection-map relative to active online memory ranges to
- * enable future "sub-section" extensions of the memory map.
- */
- pr_info("Early memory node ranges\n");
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
- pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
- (u64)start_pfn << PAGE_SHIFT,
- ((u64)end_pfn << PAGE_SHIFT) - 1);
- subsection_map_init(start_pfn, end_pfn - start_pfn);
- }
-
- /* Initialise every node */
- mminit_verify_pageflags_layout();
- setup_nr_node_ids();
- for_each_node(nid) {
- pg_data_t *pgdat;
-
- if (!node_online(nid)) {
- pr_info("Initializing node %d as memoryless\n", nid);
-
- /* Allocator not initialized yet */
- pgdat = arch_alloc_nodedata(nid);
- if (!pgdat)
- panic("Cannot allocate %zuB for node %d.\n",
- sizeof(*pgdat), nid);
- arch_refresh_nodedata(nid, pgdat);
- free_area_init_memoryless_node(nid);
-
- /*
- * We do not want to confuse userspace by sysfs
- * files/directories for node without any memory
- * attached to it, so this node is not marked as
- * N_MEMORY and not marked online so that no sysfs
- * hierarchy will be created via register_one_node for
- * it. The pgdat will get fully initialized by
- * hotadd_init_pgdat() when memory is hotplugged into
- * this node.
- */
- continue;
- }
-
- pgdat = NODE_DATA(nid);
- free_area_init_node(nid);
-
- /* Any memory on that node */
- if (pgdat->node_present_pages)
- node_set_state(nid, N_MEMORY);
- check_for_memory(pgdat, nid);
- }
-
- memmap_init();
-}
-
-static int __init cmdline_parse_core(char *p, unsigned long *core,
- unsigned long *percent)
-{
- unsigned long long coremem;
- char *endptr;
-
- if (!p)
- return -EINVAL;
-
- /* Value may be a percentage of total memory, otherwise bytes */
- coremem = simple_strtoull(p, &endptr, 0);
- if (*endptr == '%') {
- /* Paranoid check for percent values greater than 100 */
- WARN_ON(coremem > 100);
-
- *percent = coremem;
- } else {
- coremem = memparse(p, &p);
- /* Paranoid check that UL is enough for the coremem value */
- WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
-
- *core = coremem >> PAGE_SHIFT;
- *percent = 0UL;
- }
- return 0;
-}
-
-/*
- * kernelcore=size sets the amount of memory for use for allocations that
- * cannot be reclaimed or migrated.
- */
-static int __init cmdline_parse_kernelcore(char *p)
-{
- /* parse kernelcore=mirror */
- if (parse_option_str(p, "mirror")) {
- mirrored_kernelcore = true;
- return 0;
- }
-
- return cmdline_parse_core(p, &required_kernelcore,
- &required_kernelcore_percent);
-}
-
-/*
- * movablecore=size sets the amount of memory for use for allocations that
- * can be reclaimed or migrated.
- */
-static int __init cmdline_parse_movablecore(char *p)
-{
- return cmdline_parse_core(p, &required_movablecore,
- &required_movablecore_percent);
-}
-
-early_param("kernelcore", cmdline_parse_kernelcore);
-early_param("movablecore", cmdline_parse_movablecore);
-
void adjust_managed_page_count(struct page *page, long count)
{
atomic_long_add(count, &page_zone(page)->managed_pages);
@@ -8543,73 +6205,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
-void __init mem_init_print_info(void)
-{
- unsigned long physpages, codesize, datasize, rosize, bss_size;
- unsigned long init_code_size, init_data_size;
-
- physpages = get_num_physpages();
- codesize = _etext - _stext;
- datasize = _edata - _sdata;
- rosize = __end_rodata - __start_rodata;
- bss_size = __bss_stop - __bss_start;
- init_data_size = __init_end - __init_begin;
- init_code_size = _einittext - _sinittext;
-
- /*
- * Detect special cases and adjust section sizes accordingly:
- * 1) .init.* may be embedded into .data sections
- * 2) .init.text.* may be out of [__init_begin, __init_end],
- * please refer to arch/tile/kernel/vmlinux.lds.S.
- * 3) .rodata.* may be embedded into .text or .data sections.
- */
-#define adj_init_size(start, end, size, pos, adj) \
- do { \
- if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
- size -= adj; \
- } while (0)
-
- adj_init_size(__init_begin, __init_end, init_data_size,
- _sinittext, init_code_size);
- adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
- adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
- adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
- adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
-
-#undef adj_init_size
-
- pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
-#ifdef CONFIG_HIGHMEM
- ", %luK highmem"
-#endif
- ")\n",
- K(nr_free_pages()), K(physpages),
- codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
- (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
- K(physpages - totalram_pages() - totalcma_pages),
- K(totalcma_pages)
-#ifdef CONFIG_HIGHMEM
- , K(totalhigh_pages())
-#endif
- );
-}
-
-/**
- * set_dma_reserve - set the specified number of pages reserved in the first zone
- * @new_dma_reserve: The number of pages to mark reserved
- *
- * The per-cpu batchsize and zone watermarks are determined by managed_pages.
- * In the DMA zone, a significant percentage may be consumed by kernel image
- * and other unfreeable allocations which can skew the watermarks badly. This
- * function may optionally be used to account for unfreeable pages in the
- * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
- * smaller per-cpu batchsize.
- */
-void __init set_dma_reserve(unsigned long new_dma_reserve)
-{
- dma_reserve = new_dma_reserve;
-}
-
static int page_alloc_cpu_dead(unsigned int cpu)
{
struct zone *zone;
@@ -8650,28 +6245,10 @@ static int page_alloc_cpu_online(unsigned int cpu)
return 0;
}
-#ifdef CONFIG_NUMA
-int hashdist = HASHDIST_DEFAULT;
-
-static int __init set_hashdist(char *str)
-{
- if (!str)
- return 0;
- hashdist = simple_strtoul(str, &str, 0);
- return 1;
-}
-__setup("hashdist=", set_hashdist);
-#endif
-
-void __init page_alloc_init(void)
+void __init page_alloc_init_cpuhp(void)
{
int ret;
-#ifdef CONFIG_NUMA
- if (num_node_state(N_MEMORY) == 1)
- hashdist = 0;
-#endif
-
ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
"mm/page_alloc:pcp",
page_alloc_cpu_online,
@@ -9054,149 +6631,6 @@ out:
return ret;
}
-#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-/*
- * Returns the number of pages that arch has reserved but
- * is not known to alloc_large_system_hash().
- */
-static unsigned long __init arch_reserved_kernel_pages(void)
-{
- return 0;
-}
-#endif
-
-/*
- * Adaptive scale is meant to reduce sizes of hash tables on large memory
- * machines. As memory size is increased the scale is also increased but at
- * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
- * quadruples the scale is increased by one, which means the size of hash table
- * only doubles, instead of quadrupling as well.
- * Because 32-bit systems cannot have large physical memory, where this scaling
- * makes sense, it is disabled on such platforms.
- */
-#if __BITS_PER_LONG > 32
-#define ADAPT_SCALE_BASE (64ul << 30)
-#define ADAPT_SCALE_SHIFT 2
-#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
-#endif
-
-/*
- * allocate a large system hash table from bootmem
- * - it is assumed that the hash table must contain an exact power-of-2
- * quantity of entries
- * - limit is the number of hash buckets, not the total allocation size
- */
-void *__init alloc_large_system_hash(const char *tablename,
- unsigned long bucketsize,
- unsigned long numentries,
- int scale,
- int flags,
- unsigned int *_hash_shift,
- unsigned int *_hash_mask,
- unsigned long low_limit,
- unsigned long high_limit)
-{
- unsigned long long max = high_limit;
- unsigned long log2qty, size;
- void *table;
- gfp_t gfp_flags;
- bool virt;
- bool huge;
-
- /* allow the kernel cmdline to have a say */
- if (!numentries) {
- /* round applicable memory size up to nearest megabyte */
- numentries = nr_kernel_pages;
- numentries -= arch_reserved_kernel_pages();
-
- /* It isn't necessary when PAGE_SIZE >= 1MB */
- if (PAGE_SIZE < SZ_1M)
- numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
-
-#if __BITS_PER_LONG > 32
- if (!high_limit) {
- unsigned long adapt;
-
- for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
- adapt <<= ADAPT_SCALE_SHIFT)
- scale++;
- }
-#endif
-
- /* limit to 1 bucket per 2^scale bytes of low memory */
- if (scale > PAGE_SHIFT)
- numentries >>= (scale - PAGE_SHIFT);
- else
- numentries <<= (PAGE_SHIFT - scale);
-
- /* Make sure we've got at least a 0-order allocation.. */
- if (unlikely(flags & HASH_SMALL)) {
- /* Makes no sense without HASH_EARLY */
- WARN_ON(!(flags & HASH_EARLY));
- if (!(numentries >> *_hash_shift)) {
- numentries = 1UL << *_hash_shift;
- BUG_ON(!numentries);
- }
- } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
- numentries = PAGE_SIZE / bucketsize;
- }
- numentries = roundup_pow_of_two(numentries);
-
- /* limit allocation size to 1/16 total memory by default */
- if (max == 0) {
- max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
- do_div(max, bucketsize);
- }
- max = min(max, 0x80000000ULL);
-
- if (numentries < low_limit)
- numentries = low_limit;
- if (numentries > max)
- numentries = max;
-
- log2qty = ilog2(numentries);
-
- gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
- do {
- virt = false;
- size = bucketsize << log2qty;
- if (flags & HASH_EARLY) {
- if (flags & HASH_ZERO)
- table = memblock_alloc(size, SMP_CACHE_BYTES);
- else
- table = memblock_alloc_raw(size,
- SMP_CACHE_BYTES);
- } else if (get_order(size) >= MAX_ORDER || hashdist) {
- table = vmalloc_huge(size, gfp_flags);
- virt = true;
- if (table)
- huge = is_vm_area_hugepages(table);
- } else {
- /*
- * If bucketsize is not a power-of-two, we may free
- * some pages at the end of hash table which
- * alloc_pages_exact() automatically does
- */
- table = alloc_pages_exact(size, gfp_flags);
- kmemleak_alloc(table, size, 1, gfp_flags);
- }
- } while (!table && size > PAGE_SIZE && --log2qty);
-
- if (!table)
- panic("Failed to allocate %s hash table\n", tablename);
-
- pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
- virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
-
- if (_hash_shift)
- *_hash_shift = log2qty;
- if (_hash_mask)
- *_hash_mask = (1 << log2qty) - 1;
-
- return table;
-}
-
#ifdef CONFIG_CONTIG_ALLOC
#if defined(CONFIG_DYNAMIC_DEBUG) || \
(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
@@ -9380,7 +6814,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
order = 0;
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
- if (++order >= MAX_ORDER) {
+ if (++order > MAX_ORDER) {
outer_start = start;
break;
}
@@ -9630,7 +7064,7 @@ bool is_free_buddy_page(struct page *page)
unsigned long pfn = page_to_pfn(page);
unsigned int order;
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
if (PageBuddy(page_head) &&
@@ -9638,7 +7072,7 @@ bool is_free_buddy_page(struct page *page)
break;
}
- return order < MAX_ORDER;
+ return order <= MAX_ORDER;
}
EXPORT_SYMBOL(is_free_buddy_page);
@@ -9689,7 +7123,7 @@ bool take_page_off_buddy(struct page *page)
bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
int page_order = buddy_order(page_head);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 47fbc1696466..c6f3605e37ab 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -226,7 +226,7 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
*/
if (PageBuddy(page)) {
order = buddy_order(page);
- if (order >= pageblock_order && order < MAX_ORDER - 1) {
+ if (order >= pageblock_order && order < MAX_ORDER) {
buddy = find_buddy_page_pfn(page, page_to_pfn(page),
order, NULL);
if (buddy && !is_migrate_isolate_page(buddy)) {
@@ -290,11 +290,11 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* isolate_single_pageblock()
* @migratetype: migrate type to set in error recovery.
*
- * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * Free and in-use pages can be as big as MAX_ORDER and contain more than one
* pageblock. When not all pageblocks within a page are isolated at the same
* time, free page accounting can go wrong. For example, in the case of
- * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks.
- * [ MAX_ORDER-1 ]
+ * MAX_ORDER = pageblock_order + 1, a MAX_ORDER page has two pageblocks.
+ * [ MAX_ORDER ]
* [ pageblock0 | pageblock1 ]
* When either pageblock is isolated, if it is a free page, the page is not
* split into separate migratetype lists, which is supposed to; if it is an
@@ -451,7 +451,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
* the free page to the right migratetype list.
*
* head_pfn is not used here as a hugetlb page order
- * can be bigger than MAX_ORDER-1, but after it is
+ * can be bigger than MAX_ORDER, but after it is
* freed, the free page order is not. Use pfn within
* the range to find the head of the free page.
*/
@@ -459,7 +459,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
outer_pfn = pfn;
while (!PageBuddy(pfn_to_page(outer_pfn))) {
/* stop if we cannot find the free page */
- if (++order >= MAX_ORDER)
+ if (++order > MAX_ORDER)
goto failed;
outer_pfn &= ~0UL << order;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 220cdeddc295..31169b3e7f06 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -315,7 +315,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
unsigned long freepage_order;
freepage_order = buddy_order_unsafe(page);
- if (freepage_order < MAX_ORDER)
+ if (freepage_order <= MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
continue;
}
@@ -549,7 +549,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
if (PageBuddy(page)) {
unsigned long freepage_order = buddy_order_unsafe(page);
- if (freepage_order < MAX_ORDER)
+ if (freepage_order <= MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
continue;
}
@@ -657,7 +657,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
if (PageBuddy(page)) {
unsigned long order = buddy_order_unsafe(page);
- if (order > 0 && order < MAX_ORDER)
+ if (order > 0 && order <= MAX_ORDER)
pfn += (1UL << order) - 1;
continue;
}
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index c65813a9dc78..b021f482a4cb 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -20,7 +20,7 @@ static int page_order_update_notify(const char *val, const struct kernel_param *
* If param is set beyond this limit, order is set to default
* pageblock_order value
*/
- return param_set_uint_minmax(val, kp, 0, MAX_ORDER-1);
+ return param_set_uint_minmax(val, kp, 0, MAX_ORDER);
}
static const struct kernel_param_ops page_reporting_param_ops = {
@@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
return err;
/* Process each free list starting from lowest order/mt */
- for (order = page_reporting_order; order < MAX_ORDER; order++) {
+ for (order = page_reporting_order; order <= MAX_ORDER; order++) {
for (mt = 0; mt < MIGRATE_TYPES; mt++) {
/* We do not pull pages from the isolate free list */
if (is_migrate_isolate(mt))
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 90ab721a12a8..d2fc52bffafc 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -69,7 +69,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed) {
set_pte_at(vma->vm_mm, address, ptep, entry);
- flush_tlb_fix_spurious_fault(vma, address);
+ flush_tlb_fix_spurious_fault(vma, address, ptep);
}
return changed;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 8632e02661ac..ba901c416785 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,21 +25,22 @@
* mapping->invalidate_lock (in filemap_fault)
* page->flags PG_locked (lock_page)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
- * mapping->i_mmap_rwsem
- * anon_vma->rwsem
- * mm->page_table_lock or pte_lock
- * swap_lock (in swap_duplicate, swap_info_get)
- * mmlist_lock (in mmput, drain_mmlist and others)
- * mapping->private_lock (in block_dirty_folio)
- * folio_lock_memcg move_lock (in block_dirty_folio)
- * i_pages lock (widely used)
- * lruvec->lru_lock (in folio_lruvec_lock_irq)
- * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- * sb_lock (within inode_lock in fs/fs-writeback.c)
- * i_pages lock (widely used, in set_page_dirty,
- * in arch-dependent flush_dcache_mmap_lock,
- * within bdi.wb->list_lock in __sync_single_inode)
+ * vma_start_write
+ * mapping->i_mmap_rwsem
+ * anon_vma->rwsem
+ * mm->page_table_lock or pte_lock
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in block_dirty_folio)
+ * folio_lock_memcg move_lock (in block_dirty_folio)
+ * i_pages lock (widely used)
+ * lruvec->lru_lock (in folio_lruvec_lock_irq)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * i_pages lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within bdi.wb->list_lock in __sync_single_inode)
*
* anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
* ->tasklist_lock
@@ -644,7 +645,7 @@ void try_to_unmap_flush_dirty(void)
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- int batch, nbatch;
+ int batch;
arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
@@ -662,11 +663,8 @@ retry:
* overflow. Reset `pending' and `flushed' to be 1 and 0 if
* `pending' becomes large.
*/
- nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1);
- if (nbatch != batch) {
- batch = nbatch;
+ if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
goto retry;
- }
} else {
atomic_inc(&mm->tlb_flush_batched);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 448f393d8ab2..b185c1db3009 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -76,7 +76,6 @@ static struct vfsmount *shm_mnt;
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
-#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
@@ -116,10 +115,12 @@ struct shmem_options {
bool full_inums;
int huge;
int seen;
+ bool noswap;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
+#define SHMEM_SEEN_NOSWAP 16
};
#ifdef CONFIG_TMPFS
@@ -603,7 +604,7 @@ next:
index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
folio = filemap_get_folio(inode->i_mapping, index);
- if (!folio)
+ if (IS_ERR(folio))
goto drop;
/* No huge page at the end of the file: nothing to split */
@@ -883,14 +884,21 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
/*
* At first avoid shmem_get_folio(,,,SGP_READ): that fails
- * beyond i_size, and reports fallocated pages as holes.
+ * beyond i_size, and reports fallocated folios as holes.
*/
- folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_ENTRY | FGP_LOCK, 0);
- if (!xa_is_value(folio))
+ folio = filemap_get_entry(inode->i_mapping, index);
+ if (!folio)
return folio;
+ if (!xa_is_value(folio)) {
+ folio_lock(folio);
+ if (folio->mapping == inode->i_mapping)
+ return folio;
+ /* The folio has been swapped out */
+ folio_unlock(folio);
+ folio_put(folio);
+ }
/*
- * But read a page back from swap if any of it is within i_size
+ * But read a folio back from swap if any of it is within i_size
* (although in some cases this is just a waste of time).
*/
folio = NULL;
@@ -1331,13 +1339,30 @@ int shmem_unuse(unsigned int type)
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
struct folio *folio = page_folio(page);
- struct shmem_inode_info *info;
- struct address_space *mapping;
- struct inode *inode;
+ struct address_space *mapping = folio->mapping;
+ struct inode *inode = mapping->host;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
swp_entry_t swap;
pgoff_t index;
/*
+ * Our capabilities prevent regular writeback or sync from ever calling
+ * shmem_writepage; but a stacking filesystem might use ->writepage of
+ * its underlying filesystem, in which case tmpfs should write out to
+ * swap only in response to memory pressure, and not for the writeback
+ * threads or sync.
+ */
+ if (WARN_ON_ONCE(!wbc->for_reclaim))
+ goto redirty;
+
+ if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+ goto redirty;
+
+ if (!total_swap_pages)
+ goto redirty;
+
+ /*
* If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
* "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
* and its shmem_writeback() needs them to be split when swapping.
@@ -1351,27 +1376,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
folio_clear_dirty(folio);
}
- BUG_ON(!folio_test_locked(folio));
- mapping = folio->mapping;
index = folio->index;
- inode = mapping->host;
- info = SHMEM_I(inode);
- if (info->flags & VM_LOCKED)
- goto redirty;
- if (!total_swap_pages)
- goto redirty;
-
- /*
- * Our capabilities prevent regular writeback or sync from ever calling
- * shmem_writepage; but a stacking filesystem might use ->writepage of
- * its underlying filesystem, in which case tmpfs should write out to
- * swap only in response to memory pressure, and not for the writeback
- * threads or sync.
- */
- if (!wbc->for_reclaim) {
- WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
- goto redirty;
- }
/*
* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
@@ -1874,12 +1879,10 @@ repeat:
sbinfo = SHMEM_SB(inode->i_sb);
charge_mm = vma ? vma->vm_mm : NULL;
- folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
+ folio = filemap_get_entry(mapping, index);
if (folio && vma && userfaultfd_minor(vma)) {
- if (!xa_is_value(folio)) {
- folio_unlock(folio);
+ if (!xa_is_value(folio))
folio_put(folio);
- }
*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
return 0;
}
@@ -1895,6 +1898,14 @@ repeat:
}
if (folio) {
+ folio_lock(folio);
+
+ /* Has the folio been truncated or swapped out? */
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto repeat;
+ }
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
if (folio_test_uptodate(folio))
@@ -2376,6 +2387,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
shmem_set_inode_flags(inode, info->fsflags);
INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
+ if (sbinfo->noswap)
+ mapping_set_unevictable(inode->i_mapping);
simple_xattrs_init(&info->xattrs);
cache_no_acl(inode);
mapping_set_large_folios(inode->i_mapping);
@@ -2415,12 +2428,11 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
}
#ifdef CONFIG_USERFAULTFD
-int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
+int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- bool zeropage, bool wp_copy,
+ uffd_flags_t flags,
struct page **pagep)
{
struct inode *inode = file_inode(dst_vma->vm_file);
@@ -2452,7 +2464,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (!folio)
goto out_unacct_blocks;
- if (!zeropage) { /* COPY */
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
page_kaddr = kmap_local_folio(folio, 0);
/*
* The read mmap_lock is held here. Despite the
@@ -2506,12 +2518,12 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
goto out_release;
ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK, dst_mm);
+ gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
if (ret)
goto out_release;
- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- &folio->page, true, wp_copy);
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ &folio->page, true, flags);
if (ret)
goto out_delete_from_cache;
@@ -3200,7 +3212,7 @@ static const char *shmem_get_link(struct dentry *dentry,
if (!dentry) {
folio = filemap_get_folio(inode->i_mapping, 0);
- if (!folio)
+ if (IS_ERR(folio))
return ERR_PTR(-ECHILD);
if (PageHWPoison(folio_page(folio, 0)) ||
!folio_test_uptodate(folio)) {
@@ -3463,6 +3475,7 @@ enum shmem_param {
Opt_uid,
Opt_inode32,
Opt_inode64,
+ Opt_noswap,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3484,6 +3497,7 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_u32 ("uid", Opt_uid),
fsparam_flag ("inode32", Opt_inode32),
fsparam_flag ("inode64", Opt_inode64),
+ fsparam_flag ("noswap", Opt_noswap),
{}
};
@@ -3567,6 +3581,10 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->full_inums = true;
ctx->seen |= SHMEM_SEEN_INUMS;
break;
+ case Opt_noswap:
+ ctx->noswap = true;
+ ctx->seen |= SHMEM_SEEN_NOSWAP;
+ break;
}
return 0;
@@ -3665,6 +3683,14 @@ static int shmem_reconfigure(struct fs_context *fc)
err = "Current inum too high to switch to 32-bit inums";
goto out;
}
+ if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
+ err = "Cannot disable swap on remount";
+ goto out;
+ }
+ if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
+ err = "Cannot enable swap on remount if it was disabled on first mount";
+ goto out;
+ }
if (ctx->seen & SHMEM_SEEN_HUGE)
sbinfo->huge = ctx->huge;
@@ -3685,6 +3711,10 @@ static int shmem_reconfigure(struct fs_context *fc)
sbinfo->mpol = ctx->mpol; /* transfers initial ref */
ctx->mpol = NULL;
}
+
+ if (ctx->noswap)
+ sbinfo->noswap = true;
+
raw_spin_unlock(&sbinfo->stat_lock);
mpol_put(mpol);
return 0;
@@ -3739,6 +3769,8 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif
shmem_show_mpol(seq, sbinfo->mpol);
+ if (sbinfo->noswap)
+ seq_printf(seq, ",noswap");
return 0;
}
@@ -3782,6 +3814,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
ctx->inodes = shmem_default_max_inodes();
if (!(ctx->seen & SHMEM_SEEN_INUMS))
ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
+ sbinfo->noswap = ctx->noswap;
} else {
sb->s_flags |= SB_NOUSER;
}
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 39c3491e28a3..fdd155fd35ed 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -5,10 +5,12 @@
#include <linux/seq_file.h>
#include <linux/shrinker.h>
#include <linux/memcontrol.h>
+#include <linux/srcu.h>
/* defined in vmscan.c */
-extern struct rw_semaphore shrinker_rwsem;
+extern struct mutex shrinker_mutex;
extern struct list_head shrinker_list;
+extern struct srcu_struct shrinker_srcu;
static DEFINE_IDA(shrinker_debugfs_ida);
static struct dentry *shrinker_debugfs_root;
@@ -49,18 +51,13 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
struct mem_cgroup *memcg;
unsigned long total;
bool memcg_aware;
- int ret, nid;
+ int ret = 0, nid, srcu_idx;
count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
if (!count_per_node)
return -ENOMEM;
- ret = down_read_killable(&shrinker_rwsem);
- if (ret) {
- kfree(count_per_node);
- return ret;
- }
- rcu_read_lock();
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
@@ -91,8 +88,7 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
}
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- rcu_read_unlock();
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
kfree(count_per_node);
return ret;
@@ -115,9 +111,8 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
.gfp_mask = GFP_KERNEL,
};
struct mem_cgroup *memcg = NULL;
- int nid;
+ int nid, srcu_idx;
char kbuf[72];
- ssize_t ret;
read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1);
if (copy_from_user(kbuf, buf, read_len))
@@ -146,11 +141,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
return -EINVAL;
}
- ret = down_read_killable(&shrinker_rwsem);
- if (ret) {
- mem_cgroup_put(memcg);
- return ret;
- }
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
sc.nid = nid;
sc.memcg = memcg;
@@ -159,7 +150,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
shrinker->scan_objects(shrinker, &sc);
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
mem_cgroup_put(memcg);
return size;
@@ -177,7 +168,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
char buf[128];
int id;
- lockdep_assert_held(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_mutex);
/* debugfs isn't initialized yet, add debugfs entries later. */
if (!shrinker_debugfs_root)
@@ -220,7 +211,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
if (!new)
return -ENOMEM;
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
old = shrinker->name;
shrinker->name = new;
@@ -238,7 +229,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
shrinker->debugfs_entry = entry;
}
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
kfree_const(old);
@@ -250,7 +241,7 @@ struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker)
{
struct dentry *entry = shrinker->debugfs_entry;
- lockdep_assert_held(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_mutex);
kfree_const(shrinker->name);
shrinker->name = NULL;
@@ -275,14 +266,14 @@ static int __init shrinker_debugfs_init(void)
shrinker_debugfs_root = dentry;
/* Create debugfs entries for shrinkers registered at boot */
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
list_for_each_entry(shrinker, &shrinker_list, list)
if (!shrinker->debugfs_entry) {
ret = shrinker_debugfs_add(shrinker);
if (ret)
break;
}
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
return ret;
}
diff --git a/mm/shuffle.h b/mm/shuffle.h
index cec62984f7d3..a6bdf54f96f1 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -4,7 +4,7 @@
#define _MM_SHUFFLE_H
#include <linux/jump_label.h>
-#define SHUFFLE_ORDER (MAX_ORDER-1)
+#define SHUFFLE_ORDER MAX_ORDER
#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
diff --git a/mm/slab.c b/mm/slab.c
index edbe722fb906..6b7c172158e5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -465,7 +465,7 @@ static int __init slab_max_order_setup(char *str)
{
get_option(&str, &slab_max_order);
slab_max_order = slab_max_order < 0 ? 0 :
- min(slab_max_order, MAX_ORDER - 1);
+ min(slab_max_order, MAX_ORDER);
slab_max_order_set = true;
return 1;
diff --git a/mm/slab.h b/mm/slab.h
index 43966aa5fadf..3f8df2244f5a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -4,6 +4,7 @@
/*
* Internal slab definitions
*/
+void __init kmem_cache_init(void);
/* Reuses the bits in struct page */
struct slab {
diff --git a/mm/slub.c b/mm/slub.c
index 39327e98fce3..f49d669ff604 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4172,7 +4172,7 @@ static inline int calculate_order(unsigned int size)
* Doh this slab cannot be placed using slub_max_order.
*/
order = calc_slab_order(size, 1, MAX_ORDER, 1);
- if (order < MAX_ORDER)
+ if (order <= MAX_ORDER)
return order;
return -ENOSYS;
}
@@ -4697,7 +4697,7 @@ __setup("slub_min_order=", setup_slub_min_order);
static int __init setup_slub_max_order(char *str)
{
get_option(&str, (int *)&slub_max_order);
- slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
+ slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER);
return 1;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7a003d8abb37..b76a65ac28b3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -336,7 +336,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
struct folio *folio;
folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
- if (folio) {
+ if (!IS_ERR(folio)) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
@@ -366,6 +366,8 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
if (!vma || !vma_ra)
atomic_inc(&swapin_readahead_hits);
}
+ } else {
+ folio = NULL;
}
return folio;
@@ -386,25 +388,26 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
{
swp_entry_t swp;
struct swap_info_struct *si;
- struct folio *folio = __filemap_get_folio(mapping, index, FGP_ENTRY, 0);
+ struct folio *folio = filemap_get_entry(mapping, index);
+ if (!folio)
+ return ERR_PTR(-ENOENT);
if (!xa_is_value(folio))
- goto out;
+ return folio;
if (!shmem_mapping(mapping))
- return NULL;
+ return ERR_PTR(-ENOENT);
swp = radix_to_swp_entry(folio);
/* There might be swapin error entries in shmem mapping. */
if (non_swap_entry(swp))
- return NULL;
+ return ERR_PTR(-ENOENT);
/* Prevent swapoff from happening to us */
si = get_swap_device(swp);
if (!si)
- return NULL;
+ return ERR_PTR(-ENOENT);
index = swp_offset(swp);
folio = filemap_get_folio(swap_address_space(swp), index);
put_swap_device(si);
-out:
return folio;
}
@@ -431,7 +434,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
folio = filemap_get_folio(swap_address_space(entry),
swp_offset(entry));
put_swap_device(si);
- if (folio)
+ if (!IS_ERR(folio))
return folio_file_page(folio, swp_offset(entry));
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2c718f45745f..274bbf797480 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -136,7 +136,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
int ret = 0;
folio = filemap_get_folio(swap_address_space(entry), offset);
- if (!folio)
+ if (IS_ERR(folio))
return 0;
/*
* When this function is called from scan_swap_map_slots() and it's
@@ -2096,7 +2096,7 @@ retry:
entry = swp_entry(type, i);
folio = filemap_get_folio(swap_address_space(entry), i);
- if (!folio)
+ if (IS_ERR(folio))
continue;
/*
@@ -3636,12 +3636,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
struct swap_info_struct *si, *next;
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
- if (!(gfp_mask & __GFP_IO))
+ if (!(gfp & __GFP_IO))
return;
if (!blk_cgroup_congested())
diff --git a/mm/truncate.c b/mm/truncate.c
index 7b4ea4c4a46b..86de31ed4d32 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -375,7 +375,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
- if (folio) {
+ if (!IS_ERR(folio)) {
same_folio = lend < folio_pos(folio) + folio_size(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
start = folio->index + folio_nr_pages(folio);
@@ -387,14 +387,15 @@ void truncate_inode_pages_range(struct address_space *mapping,
folio = NULL;
}
- if (!same_folio)
+ if (!same_folio) {
folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
FGP_LOCK, 0);
- if (folio) {
- if (!truncate_inode_partial_folio(folio, lstart, lend))
- end = folio->index;
- folio_unlock(folio);
- folio_put(folio);
+ if (!IS_ERR(folio)) {
+ if (!truncate_inode_partial_folio(folio, lstart, lend))
+ end = folio->index;
+ folio_unlock(folio);
+ folio_put(folio);
+ }
}
index = start;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 53c3d916ff66..7f1b5f8b712c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -55,12 +55,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
* This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
* and anon, and for both shared and private VMAs.
*/
-int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+int mfill_atomic_install_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, struct page *page,
- bool newly_allocated, bool wp_copy)
+ bool newly_allocated, uffd_flags_t flags)
{
int ret;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
pte_t _dst_pte, *dst_pte;
bool writable = dst_vma->vm_flags & VM_WRITE;
bool vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -76,7 +77,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
writable = false;
if (writable)
_dst_pte = pte_mkwrite(_dst_pte);
- if (wp_copy)
+ if (flags & MFILL_ATOMIC_WP)
_dst_pte = pte_mkuffd_wp(_dst_pte);
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
@@ -127,13 +128,12 @@ out_unlock:
return ret;
}
-static int mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep,
- bool wp_copy)
+static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ uffd_flags_t flags,
+ struct page **pagep)
{
void *page_kaddr;
int ret;
@@ -190,11 +190,11 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
+ if (mem_cgroup_charge(page_folio(page), dst_vma->vm_mm, GFP_KERNEL))
goto out_release;
- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- page, true, wp_copy);
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ page, true, flags);
if (ret)
goto out_release;
out:
@@ -204,10 +204,9 @@ out_release:
goto out;
}
-static int mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
{
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
@@ -217,7 +216,7 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
if (dst_vma->vm_file) {
/* the shmem MAP_PRIVATE case requires checking the i_size */
inode = dst_vma->vm_file->f_inode;
@@ -230,7 +229,7 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
ret = -EEXIST;
if (!pte_none(*dst_pte))
goto out_unlock;
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
ret = 0;
@@ -240,11 +239,10 @@ out_unlock:
}
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- bool wp_copy)
+static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ uffd_flags_t flags)
{
struct inode *inode = file_inode(dst_vma->vm_file);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -269,8 +267,8 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
goto out_release;
}
- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- page, false, wp_copy);
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ page, false, flags);
if (ret)
goto out_release;
@@ -307,17 +305,17 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE
/*
- * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
+ * mfill_atomic processing for HUGETLB vmas. Note that this routine is
* called with mmap_lock held, it will release mmap_lock before returning.
*/
-static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic_hugetlb(
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- enum mcopy_atomic_mode mode,
- bool wp_copy)
+ uffd_flags_t flags)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
@@ -335,7 +333,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
* by THP. Since we can not reliably insert a zero page, this
* feature is not supported.
*/
- if (mode == MCOPY_ATOMIC_ZEROPAGE) {
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -403,7 +401,7 @@ retry:
goto out_unlock;
}
- if (mode != MCOPY_ATOMIC_CONTINUE &&
+ if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
!huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
err = -EEXIST;
hugetlb_vma_unlock_read(dst_vma);
@@ -411,9 +409,8 @@ retry:
goto out_unlock;
}
- err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
- dst_addr, src_addr, mode, &page,
- wp_copy);
+ err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
+ src_addr, flags, &page);
hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -463,29 +460,25 @@ out:
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
-extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
- struct vm_area_struct *dst_vma,
- unsigned long dst_start,
- unsigned long src_start,
- unsigned long len,
- enum mcopy_atomic_mode mode,
- bool wp_copy);
+extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
-static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
+static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- struct page **page,
- enum mcopy_atomic_mode mode,
- bool wp_copy)
+ uffd_flags_t flags,
+ struct page **pagep)
{
ssize_t err;
- if (mode == MCOPY_ATOMIC_CONTINUE) {
- return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- wp_copy);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
+ return mfill_atomic_pte_continue(dst_pmd, dst_vma,
+ dst_addr, flags);
}
/*
@@ -499,30 +492,28 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
* and not in the radix tree.
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (mode == MCOPY_ATOMIC_NORMAL)
- err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, page,
- wp_copy);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
+ err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
+ dst_addr, src_addr,
+ flags, pagep);
else
- err = mfill_zeropage_pte(dst_mm, dst_pmd,
+ err = mfill_atomic_pte_zeropage(dst_pmd,
dst_vma, dst_addr);
} else {
- err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
dst_addr, src_addr,
- mode != MCOPY_ATOMIC_NORMAL,
- wp_copy, page);
+ flags, pagep);
}
return err;
}
-static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long src_start,
- unsigned long len,
- enum mcopy_atomic_mode mcopy_mode,
- atomic_t *mmap_changing,
- __u64 mode)
+static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ atomic_t *mmap_changing,
+ uffd_flags_t flags)
{
struct vm_area_struct *dst_vma;
ssize_t err;
@@ -530,7 +521,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long src_addr, dst_addr;
long copied;
struct page *page;
- bool wp_copy;
/*
* Sanitize the command parameters:
@@ -580,21 +570,20 @@ retry:
* validate 'mode' now that we know the dst_vma: don't allow
* a wrprotect copy if the userfaultfd didn't register as WP.
*/
- wp_copy = mode & UFFDIO_COPY_MODE_WP;
- if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+ if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
goto out_unlock;
/*
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
- src_start, len, mcopy_mode,
- wp_copy);
+ return mfill_atomic_hugetlb(dst_vma, dst_start,
+ src_start, len, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
- if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+ if (!vma_is_shmem(dst_vma) &&
+ uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;
/*
@@ -641,8 +630,8 @@ retry:
BUG_ON(pmd_none(*dst_pmd));
BUG_ON(pmd_trans_huge(*dst_pmd));
- err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, mcopy_mode, wp_copy);
+ err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
+ src_addr, flags, &page);
cond_resched();
if (unlikely(err == -ENOENT)) {
@@ -688,35 +677,38 @@ out:
return copied ? copied : err;
}
-ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, __u64 mode)
+ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len,
+ atomic_t *mmap_changing, uffd_flags_t flags)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len,
- MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
+ return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}
-ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
- mmap_changing, 0);
+ return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}
-ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags)
{
- return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
- mmap_changing, 0);
+ return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
-long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+long uffd_wp_range(struct vm_area_struct *dst_vma,
unsigned long start, unsigned long len, bool enable_wp)
{
unsigned int mm_cp_flags;
struct mmu_gather tlb;
long ret;
+ VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
+ "The address range exceeds VMA boundary.\n");
if (enable_wp)
mm_cp_flags = MM_CP_UFFD_WP;
else
@@ -730,7 +722,7 @@ long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
*/
if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
- tlb_gather_mmu(&tlb, dst_mm);
+ tlb_gather_mmu(&tlb, dst_vma->vm_mm);
ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
tlb_finish_mmu(&tlb);
@@ -741,9 +733,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, bool enable_wp,
atomic_t *mmap_changing)
{
+ unsigned long end = start + len;
+ unsigned long _start, _end;
struct vm_area_struct *dst_vma;
unsigned long page_mask;
long err;
+ VMA_ITERATOR(vmi, dst_mm, start);
/*
* Sanitize the command parameters:
@@ -766,28 +761,30 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
goto out_unlock;
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, start, len);
+ for_each_vma_range(vmi, dst_vma, end) {
- if (!dst_vma)
- goto out_unlock;
- if (!userfaultfd_wp(dst_vma))
- goto out_unlock;
- if (!vma_can_userfault(dst_vma, dst_vma->vm_flags))
- goto out_unlock;
+ if (!userfaultfd_wp(dst_vma)) {
+ err = -ENOENT;
+ break;
+ }
- if (is_vm_hugetlb_page(dst_vma)) {
- err = -EINVAL;
- page_mask = vma_kernel_pagesize(dst_vma) - 1;
- if ((start & page_mask) || (len & page_mask))
- goto out_unlock;
- }
+ if (is_vm_hugetlb_page(dst_vma)) {
+ err = -EINVAL;
+ page_mask = vma_kernel_pagesize(dst_vma) - 1;
+ if ((start & page_mask) || (len & page_mask))
+ break;
+ }
- err = uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+ _start = max(dst_vma->vm_start, start);
+ _end = min(dst_vma->vm_end, end);
- /* Return 0 on success, <0 on failures */
- if (err > 0)
- err = 0;
+ err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);
+ /* Return 0 on success, <0 on failures */
+ if (err < 0)
+ break;
+ err = 0;
+ }
out_unlock:
mmap_read_unlock(dst_mm);
return err;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a50072066221..3fa476f17887 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -33,11 +33,11 @@
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
+#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
-#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
@@ -2739,7 +2739,7 @@ void vfree(const void *addr)
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
*/
- __free_pages(page, 0);
+ __free_page(page);
cond_resched();
}
atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
@@ -3190,7 +3190,7 @@ again:
* pages backing VM_ALLOC mapping. Memory is instead
* poisoned and zeroed by kasan_unpoison_vmalloc().
*/
- gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
+ gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
}
/* Take note that the mapping is PAGE_KERNEL. */
@@ -3444,62 +3444,96 @@ void *vmalloc_32_user(unsigned long size)
EXPORT_SYMBOL(vmalloc_32_user);
/*
- * small helper routine , copy contents to buf from addr.
- * If the page is not present, fill zero.
+ * Atomically zero bytes in the iterator.
+ *
+ * Returns the number of zeroed bytes.
*/
+static size_t zero_iter(struct iov_iter *iter, size_t count)
+{
+ size_t remains = count;
+
+ while (remains > 0) {
+ size_t num, copied;
+
+ num = remains < PAGE_SIZE ? remains : PAGE_SIZE;
+ copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
+ remains -= copied;
+
+ if (copied < num)
+ break;
+ }
-static int aligned_vread(char *buf, char *addr, unsigned long count)
+ return count - remains;
+}
+
+/*
+ * small helper routine, copy contents to iter from addr.
+ * If the page is not present, fill zero.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t aligned_vread_iter(struct iov_iter *iter,
+ const char *addr, size_t count)
{
- struct page *p;
- int copied = 0;
+ size_t remains = count;
+ struct page *page;
- while (count) {
+ while (remains > 0) {
unsigned long offset, length;
+ size_t copied = 0;
offset = offset_in_page(addr);
length = PAGE_SIZE - offset;
- if (length > count)
- length = count;
- p = vmalloc_to_page(addr);
+ if (length > remains)
+ length = remains;
+ page = vmalloc_to_page(addr);
/*
- * To do safe access to this _mapped_ area, we need
- * lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calls for this _debug_
- * interface, rarely used. Instead of that, we'll use
- * kmap() and get small overhead in this access function.
+ * To do safe access to this _mapped_ area, we need lock. But
+ * adding lock here means that we need to add overhead of
+ * vmalloc()/vfree() calls for this _debug_ interface, rarely
+ * used. Instead of that, we'll use a local mapping via
+ * copy_page_to_iter_nofault() and accept a small overhead in
+ * this access function.
*/
- if (p) {
- /* We can expect USER0 is not used -- see vread() */
- void *map = kmap_atomic(p);
- memcpy(buf, map + offset, length);
- kunmap_atomic(map);
- } else
- memset(buf, 0, length);
+ if (page)
+ copied = copy_page_to_iter_nofault(page, offset,
+ length, iter);
+ else
+ copied = zero_iter(iter, length);
- addr += length;
- buf += length;
- copied += length;
- count -= length;
+ addr += copied;
+ remains -= copied;
+
+ if (copied != length)
+ break;
}
- return copied;
+
+ return count - remains;
}
-static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
+/*
+ * Read from a vm_map_ram region of memory.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
+ size_t count, unsigned long flags)
{
char *start;
struct vmap_block *vb;
unsigned long offset;
- unsigned int rs, re, n;
+ unsigned int rs, re;
+ size_t remains, n;
/*
* If it's area created by vm_map_ram() interface directly, but
* not further subdividing and delegating management to vmap_block,
* handle it here.
*/
- if (!(flags & VMAP_BLOCK)) {
- aligned_vread(buf, addr, count);
- return;
- }
+ if (!(flags & VMAP_BLOCK))
+ return aligned_vread_iter(iter, addr, count);
+
+ remains = count;
/*
* Area is split into regions and tracked with vmap_block, read out
@@ -3507,50 +3541,64 @@ static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags
*/
vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
if (!vb)
- goto finished;
+ goto finished_zero;
spin_lock(&vb->lock);
if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
spin_unlock(&vb->lock);
- goto finished;
+ goto finished_zero;
}
+
for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
- if (!count)
- break;
+ size_t copied;
+
+ if (remains == 0)
+ goto finished;
+
start = vmap_block_vaddr(vb->va->va_start, rs);
- while (addr < start) {
- if (count == 0)
- goto unlock;
- *buf = '\0';
- buf++;
- addr++;
- count--;
+
+ if (addr < start) {
+ size_t to_zero = min_t(size_t, start - addr, remains);
+ size_t zeroed = zero_iter(iter, to_zero);
+
+ addr += zeroed;
+ remains -= zeroed;
+
+ if (remains == 0 || zeroed != to_zero)
+ goto finished;
}
+
/*it could start reading from the middle of used region*/
offset = offset_in_page(addr);
n = ((re - rs + 1) << PAGE_SHIFT) - offset;
- if (n > count)
- n = count;
- aligned_vread(buf, start+offset, n);
+ if (n > remains)
+ n = remains;
+
+ copied = aligned_vread_iter(iter, start + offset, n);
- buf += n;
- addr += n;
- count -= n;
+ addr += copied;
+ remains -= copied;
+
+ if (copied != n)
+ goto finished;
}
-unlock:
+
spin_unlock(&vb->lock);
-finished:
+finished_zero:
/* zero-fill the left dirty or free regions */
- if (count)
- memset(buf, 0, count);
+ return count - remains + zero_iter(iter, remains);
+finished:
+ /* We couldn't copy/zero everything */
+ spin_unlock(&vb->lock);
+ return count - remains;
}
/**
- * vread() - read vmalloc area in a safe way.
- * @buf: buffer for reading data
- * @addr: vm address.
- * @count: number of bytes to be read.
+ * vread_iter() - read vmalloc area in a safe way to an iterator.
+ * @iter: the iterator to which data should be written.
+ * @addr: vm address.
+ * @count: number of bytes to be read.
*
* This function checks that addr is a valid vmalloc'ed area, and
* copy data from that area to a given buffer. If the given memory range
@@ -3570,13 +3618,12 @@ finished:
* (same number as @count) or %0 if [addr...addr+count) doesn't
* include any intersection with valid vmalloc area
*/
-long vread(char *buf, char *addr, unsigned long count)
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
struct vmap_area *va;
struct vm_struct *vm;
- char *vaddr, *buf_start = buf;
- unsigned long buflen = count;
- unsigned long n, size, flags;
+ char *vaddr;
+ size_t n, size, flags, remains;
addr = kasan_reset_tag(addr);
@@ -3584,18 +3631,22 @@ long vread(char *buf, char *addr, unsigned long count)
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
+ remains = count;
+
spin_lock(&vmap_area_lock);
va = find_vmap_area_exceed_addr((unsigned long)addr);
if (!va)
- goto finished;
+ goto finished_zero;
/* no intersects with alive vmap_area */
- if ((unsigned long)addr + count <= va->va_start)
- goto finished;
+ if ((unsigned long)addr + remains <= va->va_start)
+ goto finished_zero;
list_for_each_entry_from(va, &vmap_area_list, list) {
- if (!count)
- break;
+ size_t copied;
+
+ if (remains == 0)
+ goto finished;
vm = va->vm;
flags = va->flags & VMAP_FLAGS_MASK;
@@ -3610,6 +3661,7 @@ long vread(char *buf, char *addr, unsigned long count)
if (vm && (vm->flags & VM_UNINITIALIZED))
continue;
+
/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
smp_rmb();
@@ -3618,38 +3670,45 @@ long vread(char *buf, char *addr, unsigned long count)
if (addr >= vaddr + size)
continue;
- while (addr < vaddr) {
- if (count == 0)
+
+ if (addr < vaddr) {
+ size_t to_zero = min_t(size_t, vaddr - addr, remains);
+ size_t zeroed = zero_iter(iter, to_zero);
+
+ addr += zeroed;
+ remains -= zeroed;
+
+ if (remains == 0 || zeroed != to_zero)
goto finished;
- *buf = '\0';
- buf++;
- addr++;
- count--;
}
+
n = vaddr + size - addr;
- if (n > count)
- n = count;
+ if (n > remains)
+ n = remains;
if (flags & VMAP_RAM)
- vmap_ram_vread(buf, addr, n, flags);
+ copied = vmap_ram_vread_iter(iter, addr, n, flags);
else if (!(vm->flags & VM_IOREMAP))
- aligned_vread(buf, addr, n);
+ copied = aligned_vread_iter(iter, addr, n);
else /* IOREMAP area is treated as memory hole */
- memset(buf, 0, n);
- buf += n;
- addr += n;
- count -= n;
+ copied = zero_iter(iter, n);
+
+ addr += copied;
+ remains -= copied;
+
+ if (copied != n)
+ goto finished;
}
-finished:
- spin_unlock(&vmap_area_lock);
- if (buf == buf_start)
- return 0;
+finished_zero:
+ spin_unlock(&vmap_area_lock);
/* zero-fill memory holes */
- if (buf != buf_start + buflen)
- memset(buf, 0, buflen - (buf - buf_start));
+ return count - remains + zero_iter(iter, remains);
+finished:
+ /* Nothing remains, or we couldn't copy/zero everything. */
+ spin_unlock(&vmap_area_lock);
- return buflen;
+ return count - remains;
}
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c1c5e8b24b8..98719e72b5e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,7 +35,7 @@
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
-#include <linux/rwsem.h>
+#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -57,6 +57,7 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
+#include <linux/srcu.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -201,7 +202,9 @@ static void set_task_reclaim_state(struct task_struct *task,
}
LIST_HEAD(shrinker_list);
-DECLARE_RWSEM(shrinker_rwsem);
+DEFINE_MUTEX(shrinker_mutex);
+DEFINE_SRCU(shrinker_srcu);
+static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
#ifdef CONFIG_MEMCG
static int shrinker_nr_max;
@@ -220,13 +223,27 @@ static inline int shrinker_defer_size(int nr_items)
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
int nid)
{
- return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
- lockdep_is_held(&shrinker_rwsem));
+ return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
+ &shrinker_srcu,
+ lockdep_is_held(&shrinker_mutex));
+}
+
+static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
+ int nid)
+{
+ return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
+ &shrinker_srcu);
+}
+
+static void free_shrinker_info_rcu(struct rcu_head *head)
+{
+ kvfree(container_of(head, struct shrinker_info, rcu));
}
static int expand_one_shrinker_info(struct mem_cgroup *memcg,
int map_size, int defer_size,
- int old_map_size, int old_defer_size)
+ int old_map_size, int old_defer_size,
+ int new_nr_max)
{
struct shrinker_info *new, *old;
struct mem_cgroup_per_node *pn;
@@ -240,12 +257,17 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
if (!old)
return 0;
+ /* Already expanded this shrinker_info */
+ if (new_nr_max <= old->map_nr_max)
+ continue;
+
new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
if (!new)
return -ENOMEM;
new->nr_deferred = (atomic_long_t *)(new + 1);
new->map = (void *)new->nr_deferred + defer_size;
+ new->map_nr_max = new_nr_max;
/* map: set all old bits, clear all new bits */
memset(new->map, (int)0xff, old_map_size);
@@ -256,7 +278,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
defer_size - old_defer_size);
rcu_assign_pointer(pn->shrinker_info, new);
- kvfree_rcu(old, rcu);
+ call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
}
return 0;
@@ -282,7 +304,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
int nid, size, ret = 0;
int map_size, defer_size = 0;
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
map_size = shrinker_map_size(shrinker_nr_max);
defer_size = shrinker_defer_size(shrinker_nr_max);
size = map_size + defer_size;
@@ -295,34 +317,26 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
}
info->nr_deferred = (atomic_long_t *)(info + 1);
info->map = (void *)info->nr_deferred + defer_size;
+ info->map_nr_max = shrinker_nr_max;
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
}
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
return ret;
}
-static inline bool need_expand(int nr_max)
-{
- return round_up(nr_max, BITS_PER_LONG) >
- round_up(shrinker_nr_max, BITS_PER_LONG);
-}
-
static int expand_shrinker_info(int new_id)
{
int ret = 0;
- int new_nr_max = new_id + 1;
+ int new_nr_max = round_up(new_id + 1, BITS_PER_LONG);
int map_size, defer_size = 0;
int old_map_size, old_defer_size = 0;
struct mem_cgroup *memcg;
- if (!need_expand(new_nr_max))
- goto out;
-
if (!root_mem_cgroup)
goto out;
- lockdep_assert_held(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_mutex);
map_size = shrinker_map_size(new_nr_max);
defer_size = shrinker_defer_size(new_nr_max);
@@ -332,7 +346,8 @@ static int expand_shrinker_info(int new_id)
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
ret = expand_one_shrinker_info(memcg, map_size, defer_size,
- old_map_size, old_defer_size);
+ old_map_size, old_defer_size,
+ new_nr_max);
if (ret) {
mem_cgroup_iter_break(NULL, memcg);
goto out;
@@ -349,13 +364,16 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct shrinker_info *info;
-
- rcu_read_lock();
- info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
- /* Pairs with smp mb in shrink_slab() */
- smp_mb__before_atomic();
- set_bit(shrinker_id, info->map);
- rcu_read_unlock();
+ int srcu_idx;
+
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
+ info = shrinker_info_srcu(memcg, nid);
+ if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
+ /* Pairs with smp mb in shrink_slab() */
+ smp_mb__before_atomic();
+ set_bit(shrinker_id, info->map);
+ }
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
}
}
@@ -368,8 +386,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
if (mem_cgroup_disabled())
return -ENOSYS;
- down_write(&shrinker_rwsem);
- /* This may call shrinker, so it must use down_read_trylock() */
+ mutex_lock(&shrinker_mutex);
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
@@ -383,7 +400,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
shrinker->id = id;
ret = 0;
unlock:
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
return ret;
}
@@ -393,7 +410,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
BUG_ON(id < 0);
- lockdep_assert_held(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_mutex);
idr_remove(&shrinker_idr, id);
}
@@ -403,7 +420,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;
- info = shrinker_info_protected(memcg, nid);
+ info = shrinker_info_srcu(memcg, nid);
return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}
@@ -412,7 +429,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;
- info = shrinker_info_protected(memcg, nid);
+ info = shrinker_info_srcu(memcg, nid);
return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}
@@ -428,16 +445,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
parent = root_mem_cgroup;
/* Prevent from concurrent shrinker_info expand */
- down_read(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
for_each_node(nid) {
child_info = shrinker_info_protected(memcg, nid);
parent_info = shrinker_info_protected(parent, nid);
- for (i = 0; i < shrinker_nr_max; i++) {
+ for (i = 0; i < child_info->map_nr_max; i++) {
nr = atomic_long_read(&child_info->nr_deferred[i]);
atomic_long_add(nr, &parent_info->nr_deferred[i]);
}
}
- up_read(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
}
static bool cgroup_reclaim(struct scan_control *sc)
@@ -686,9 +703,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
shrinker->name = NULL;
#endif
if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- down_write(&shrinker_rwsem);
+ mutex_lock(&shrinker_mutex);
unregister_memcg_shrinker(shrinker);
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
return;
}
@@ -698,11 +715,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
void register_shrinker_prepared(struct shrinker *shrinker)
{
- down_write(&shrinker_rwsem);
- list_add_tail(&shrinker->list, &shrinker_list);
+ mutex_lock(&shrinker_mutex);
+ list_add_tail_rcu(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED;
shrinker_debugfs_add(shrinker);
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
}
static int __register_shrinker(struct shrinker *shrinker)
@@ -752,13 +769,16 @@ void unregister_shrinker(struct shrinker *shrinker)
if (!(shrinker->flags & SHRINKER_REGISTERED))
return;
- down_write(&shrinker_rwsem);
- list_del(&shrinker->list);
+ mutex_lock(&shrinker_mutex);
+ list_del_rcu(&shrinker->list);
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
debugfs_entry = shrinker_debugfs_remove(shrinker);
- up_write(&shrinker_rwsem);
+ mutex_unlock(&shrinker_mutex);
+
+ atomic_inc(&shrinker_srcu_generation);
+ synchronize_srcu(&shrinker_srcu);
debugfs_remove_recursive(debugfs_entry);
@@ -770,15 +790,13 @@ EXPORT_SYMBOL(unregister_shrinker);
/**
* synchronize_shrinkers - Wait for all running shrinkers to complete.
*
- * This is equivalent to calling unregister_shrink() and register_shrinker(),
- * but atomically and with less overhead. This is useful to guarantee that all
- * shrinker invocations have seen an update, before freeing memory, similar to
- * rcu.
+ * This is useful to guarantee that all shrinker invocations have seen an
+ * update, before freeing memory.
*/
void synchronize_shrinkers(void)
{
- down_write(&shrinker_rwsem);
- up_write(&shrinker_rwsem);
+ atomic_inc(&shrinker_srcu_generation);
+ synchronize_srcu(&shrinker_srcu);
}
EXPORT_SYMBOL(synchronize_shrinkers);
@@ -887,19 +905,20 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
- int i;
+ int srcu_idx, generation;
+ int i = 0;
if (!mem_cgroup_online(memcg))
return 0;
- if (!down_read_trylock(&shrinker_rwsem))
- return 0;
-
- info = shrinker_info_protected(memcg, nid);
+again:
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
+ info = shrinker_info_srcu(memcg, nid);
if (unlikely(!info))
goto unlock;
- for_each_set_bit(i, info->map, shrinker_nr_max) {
+ generation = atomic_read(&shrinker_srcu_generation);
+ for_each_set_bit_from(i, info->map, info->map_nr_max) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -945,14 +964,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
set_shrinker_bit(memcg, nid, i);
}
freed += ret;
-
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
+ if (atomic_read(&shrinker_srcu_generation) != generation) {
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ i++;
+ goto again;
}
}
unlock:
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
return freed;
}
#else /* CONFIG_MEMCG */
@@ -989,6 +1008,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
{
unsigned long ret, freed = 0;
struct shrinker *shrinker;
+ int srcu_idx, generation;
/*
* The root memcg might be allocated even though memcg is disabled
@@ -1000,10 +1020,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
- if (!down_read_trylock(&shrinker_rwsem))
- goto out;
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
- list_for_each_entry(shrinker, &shrinker_list, list) {
+ generation = atomic_read(&shrinker_srcu_generation);
+ list_for_each_entry_srcu(shrinker, &shrinker_list, list,
+ srcu_read_lock_held(&shrinker_srcu)) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -1014,19 +1035,14 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (ret == SHRINK_EMPTY)
ret = 0;
freed += ret;
- /*
- * Bail out if someone want to register a new shrinker to
- * prevent the registration from being stalled for long periods
- * by parallel ongoing shrinking.
- */
- if (rwsem_is_contended(&shrinker_rwsem)) {
+
+ if (atomic_read(&shrinker_srcu_generation) != generation) {
freed = freed ? : 1;
break;
}
}
- up_read(&shrinker_rwsem);
-out:
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
cond_resched();
return freed;
}
@@ -3604,7 +3620,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
}
/******************************************************************************
- * refault feedback loop
+ * PID controller
******************************************************************************/
/*
@@ -5663,14 +5679,14 @@ unlock:
* sysfs interface
******************************************************************************/
-static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
}
/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
-static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t len)
+static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
{
unsigned int msecs;
@@ -5682,11 +5698,9 @@ static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
return len;
}
-static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
- min_ttl_ms, 0644, show_min_ttl, store_min_ttl
-);
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms);
-static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
unsigned int caps = 0;
@@ -5703,7 +5717,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
}
/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
-static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
{
int i;
@@ -5730,9 +5744,7 @@ static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
return len;
}
-static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
- enabled, 0644, show_enabled, store_enabled
-);
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled);
static struct attribute *lru_gen_attrs[] = {
&lru_gen_min_ttl_attr.attr,
@@ -5740,7 +5752,7 @@ static struct attribute *lru_gen_attrs[] = {
NULL
};
-static struct attribute_group lru_gen_attr_group = {
+static const struct attribute_group lru_gen_attr_group = {
.name = "lru_gen",
.attrs = lru_gen_attrs,
};
@@ -6990,7 +7002,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
* scan_control uses s8 fields for order, priority, and reclaim_idx.
* Confirm they are large enough for max values.
*/
- BUILD_BUG_ON(MAX_ORDER > S8_MAX);
+ BUILD_BUG_ON(MAX_ORDER >= S8_MAX);
BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1ea6a5ce1c41..c28046371b45 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1055,7 +1055,7 @@ static void fill_contig_page_info(struct zone *zone,
info->free_blocks_total = 0;
info->free_blocks_suitable = 0;
- for (order = 0; order < MAX_ORDER; order++) {
+ for (order = 0; order <= MAX_ORDER; order++) {
unsigned long blocks;
/*
@@ -1088,7 +1088,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
{
unsigned long requested = 1UL << order;
- if (WARN_ON_ONCE(order >= MAX_ORDER))
+ if (WARN_ON_ONCE(order > MAX_ORDER))
return 0;
if (!info->free_blocks_total)
@@ -1399,6 +1399,12 @@ const char * const vmstat_text[] = {
"direct_map_level2_splits",
"direct_map_level3_splits",
#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+ "vma_lock_success",
+ "vma_lock_abort",
+ "vma_lock_retry",
+ "vma_lock_miss",
+#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
@@ -1462,7 +1468,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
int order;
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
- for (order = 0; order < MAX_ORDER; ++order)
+ for (order = 0; order <= MAX_ORDER; ++order)
/*
* Access to nr_free is lockless as nr_free is used only for
* printing purposes. Use data_race to avoid KCSAN warning.
@@ -1491,7 +1497,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
pgdat->node_id,
zone->name,
migratetype_names[mtype]);
- for (order = 0; order < MAX_ORDER; ++order) {
+ for (order = 0; order <= MAX_ORDER; ++order) {
unsigned long freecount = 0;
struct free_area *area;
struct list_head *curr;
@@ -1531,7 +1537,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
/* Print header */
seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
- for (order = 0; order < MAX_ORDER; ++order)
+ for (order = 0; order <= MAX_ORDER; ++order)
seq_printf(m, "%6d ", order);
seq_putc(m, '\n');
@@ -2153,7 +2159,7 @@ static void unusable_show_print(struct seq_file *m,
seq_printf(m, "Node %d, zone %8s ",
pgdat->node_id,
zone->name);
- for (order = 0; order < MAX_ORDER; ++order) {
+ for (order = 0; order <= MAX_ORDER; ++order) {
fill_contig_page_info(zone, order, &info);
index = unusable_free_index(order, &info);
seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
@@ -2205,7 +2211,7 @@ static void extfrag_show_print(struct seq_file *m,
seq_printf(m, "Node %d, zone %8s ",
pgdat->node_id,
zone->name);
- for (order = 0; order < MAX_ORDER; ++order) {
+ for (order = 0; order <= MAX_ORDER; ++order) {
fill_contig_page_info(zone, order, &info);
index = __fragmentation_index(order, &info);
seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 3aed46ab7e6c..aea50e2aa350 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -127,7 +127,7 @@
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
#define HUGE_BITS 1
-#define FULLNESS_BITS 2
+#define FULLNESS_BITS 4
#define CLASS_BITS 8
#define ISOLATED_BITS 5
#define MAGIC_VAL_BITS 8
@@ -159,51 +159,44 @@
#define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
ZS_SIZE_CLASS_DELTA) + 1)
+/*
+ * Pages are distinguished by the ratio of used memory (that is the ratio
+ * of ->inuse objects to all objects that page can store). For example,
+ * INUSE_RATIO_10 means that the ratio of used objects is > 0% and <= 10%.
+ *
+ * The number of fullness groups is not random. It allows us to keep
+ * difference between the least busy page in the group (minimum permitted
+ * number of ->inuse objects) and the most busy page (maximum permitted
+ * number of ->inuse objects) at a reasonable value.
+ */
enum fullness_group {
- ZS_EMPTY,
- ZS_ALMOST_EMPTY,
- ZS_ALMOST_FULL,
- ZS_FULL,
- NR_ZS_FULLNESS,
+ ZS_INUSE_RATIO_0,
+ ZS_INUSE_RATIO_10,
+ /* NOTE: 8 more fullness groups here */
+ ZS_INUSE_RATIO_99 = 10,
+ ZS_INUSE_RATIO_100,
+ NR_FULLNESS_GROUPS,
};
enum class_stat_type {
- CLASS_EMPTY,
- CLASS_ALMOST_EMPTY,
- CLASS_ALMOST_FULL,
- CLASS_FULL,
- OBJ_ALLOCATED,
- OBJ_USED,
- NR_ZS_STAT_TYPE,
+ /* NOTE: stats for 12 fullness groups here: from inuse 0 to 100 */
+ ZS_OBJS_ALLOCATED = NR_FULLNESS_GROUPS,
+ ZS_OBJS_INUSE,
+ NR_CLASS_STAT_TYPES,
};
struct zs_size_stat {
- unsigned long objs[NR_ZS_STAT_TYPE];
+ unsigned long objs[NR_CLASS_STAT_TYPES];
};
#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif
-/*
- * We assign a page to ZS_ALMOST_EMPTY fullness group when:
- * n <= N / f, where
- * n = number of allocated objects
- * N = total number of objects zspage can store
- * f = fullness_threshold_frac
- *
- * Similarly, we assign zspage to:
- * ZS_ALMOST_FULL when n > N / f
- * ZS_EMPTY when n == 0
- * ZS_FULL when n == N
- *
- * (see: fix_fullness_group())
- */
-static const int fullness_threshold_frac = 4;
static size_t huge_class_size;
struct size_class {
- struct list_head fullness_list[NR_ZS_FULLNESS];
+ struct list_head fullness_list[NR_FULLNESS_GROUPS];
/*
* Size of objects stored in this class. Must be multiple
* of ZS_ALIGN.
@@ -547,8 +540,8 @@ static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
}
static void get_zspage_mapping(struct zspage *zspage,
- unsigned int *class_idx,
- enum fullness_group *fullness)
+ unsigned int *class_idx,
+ int *fullness)
{
BUG_ON(zspage->magic != ZSPAGE_MAGIC);
@@ -557,14 +550,14 @@ static void get_zspage_mapping(struct zspage *zspage,
}
static struct size_class *zspage_class(struct zs_pool *pool,
- struct zspage *zspage)
+ struct zspage *zspage)
{
return pool->size_class[zspage->class];
}
static void set_zspage_mapping(struct zspage *zspage,
- unsigned int class_idx,
- enum fullness_group fullness)
+ unsigned int class_idx,
+ int fullness)
{
zspage->class = class_idx;
zspage->fullness = fullness;
@@ -588,23 +581,19 @@ static int get_size_class_index(int size)
return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
-/* type can be of enum type class_stat_type or fullness_group */
static inline void class_stat_inc(struct size_class *class,
int type, unsigned long cnt)
{
class->stats.objs[type] += cnt;
}
-/* type can be of enum type class_stat_type or fullness_group */
static inline void class_stat_dec(struct size_class *class,
int type, unsigned long cnt)
{
class->stats.objs[type] -= cnt;
}
-/* type can be of enum type class_stat_type or fullness_group */
-static inline unsigned long zs_stat_get(struct size_class *class,
- int type)
+static inline unsigned long zs_stat_get(struct size_class *class, int type)
{
return class->stats.objs[type];
}
@@ -630,32 +619,38 @@ static unsigned long zs_can_compact(struct size_class *class);
static int zs_stats_size_show(struct seq_file *s, void *v)
{
- int i;
+ int i, fg;
struct zs_pool *pool = s->private;
struct size_class *class;
int objs_per_zspage;
- unsigned long class_almost_full, class_almost_empty;
unsigned long obj_allocated, obj_used, pages_used, freeable;
- unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
unsigned long total_freeable = 0;
+ unsigned long inuse_totals[NR_FULLNESS_GROUPS] = {0, };
- seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
- "class", "size", "almost_full", "almost_empty",
+ seq_printf(s, " %5s %5s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %13s %10s %10s %16s %8s\n",
+ "class", "size", "10%", "20%", "30%", "40%",
+ "50%", "60%", "70%", "80%", "90%", "99%", "100%",
"obj_allocated", "obj_used", "pages_used",
"pages_per_zspage", "freeable");
for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+
class = pool->size_class[i];
if (class->index != i)
continue;
spin_lock(&pool->lock);
- class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
- class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
- obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
- obj_used = zs_stat_get(class, OBJ_USED);
+
+ seq_printf(s, " %5u %5u ", i, class->size);
+ for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) {
+ inuse_totals[fg] += zs_stat_get(class, fg);
+ seq_printf(s, "%9lu ", zs_stat_get(class, fg));
+ }
+
+ obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED);
+ obj_used = zs_stat_get(class, ZS_OBJS_INUSE);
freeable = zs_can_compact(class);
spin_unlock(&pool->lock);
@@ -663,14 +658,10 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
pages_used = obj_allocated / objs_per_zspage *
class->pages_per_zspage;
- seq_printf(s, " %5u %5u %11lu %12lu %13lu"
- " %10lu %10lu %16d %8lu\n",
- i, class->size, class_almost_full, class_almost_empty,
- obj_allocated, obj_used, pages_used,
- class->pages_per_zspage, freeable);
+ seq_printf(s, "%13lu %10lu %10lu %16d %8lu\n",
+ obj_allocated, obj_used, pages_used,
+ class->pages_per_zspage, freeable);
- total_class_almost_full += class_almost_full;
- total_class_almost_empty += class_almost_empty;
total_objs += obj_allocated;
total_used_objs += obj_used;
total_pages += pages_used;
@@ -678,10 +669,14 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
}
seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
- "Total", "", total_class_almost_full,
- total_class_almost_empty, total_objs,
- total_used_objs, total_pages, "", total_freeable);
+ seq_printf(s, " %5s %5s ", "Total", "");
+
+ for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++)
+ seq_printf(s, "%9lu ", inuse_totals[fg]);
+
+ seq_printf(s, "%13lu %10lu %10lu %16s %8lu\n",
+ total_objs, total_used_objs, total_pages, "",
+ total_freeable);
return 0;
}
@@ -726,30 +721,28 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
/*
* For each size class, zspages are divided into different groups
- * depending on how "full" they are. This was done so that we could
- * easily find empty or nearly empty zspages when we try to shrink
- * the pool (not yet implemented). This function returns fullness
+ * depending on their usage ratio. This function returns fullness
* status of the given page.
*/
-static enum fullness_group get_fullness_group(struct size_class *class,
- struct zspage *zspage)
+static int get_fullness_group(struct size_class *class, struct zspage *zspage)
{
- int inuse, objs_per_zspage;
- enum fullness_group fg;
+ int inuse, objs_per_zspage, ratio;
inuse = get_zspage_inuse(zspage);
objs_per_zspage = class->objs_per_zspage;
if (inuse == 0)
- fg = ZS_EMPTY;
- else if (inuse == objs_per_zspage)
- fg = ZS_FULL;
- else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
- fg = ZS_ALMOST_EMPTY;
- else
- fg = ZS_ALMOST_FULL;
+ return ZS_INUSE_RATIO_0;
+ if (inuse == objs_per_zspage)
+ return ZS_INUSE_RATIO_100;
- return fg;
+ ratio = 100 * inuse / objs_per_zspage;
+ /*
+ * Take integer division into consideration: a page with one inuse
+ * object out of 127 possible, will end up having 0 usage ratio,
+ * which is wrong as it belongs in ZS_INUSE_RATIO_10 fullness group.
+ */
+ return ratio / 10 + 1;
}
/*
@@ -760,21 +753,10 @@ static enum fullness_group get_fullness_group(struct size_class *class,
*/
static void insert_zspage(struct size_class *class,
struct zspage *zspage,
- enum fullness_group fullness)
+ int fullness)
{
- struct zspage *head;
-
class_stat_inc(class, fullness, 1);
- head = list_first_entry_or_null(&class->fullness_list[fullness],
- struct zspage, list);
- /*
- * We want to see more ZS_FULL pages and less almost empty/full.
- * Put pages with higher ->inuse first.
- */
- if (head && get_zspage_inuse(zspage) < get_zspage_inuse(head))
- list_add(&zspage->list, &head->list);
- else
- list_add(&zspage->list, &class->fullness_list[fullness]);
+ list_add(&zspage->list, &class->fullness_list[fullness]);
}
/*
@@ -783,7 +765,7 @@ static void insert_zspage(struct size_class *class,
*/
static void remove_zspage(struct size_class *class,
struct zspage *zspage,
- enum fullness_group fullness)
+ int fullness)
{
VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
@@ -794,17 +776,16 @@ static void remove_zspage(struct size_class *class,
/*
* Each size class maintains zspages in different fullness groups depending
* on the number of live objects they contain. When allocating or freeing
- * objects, the fullness status of the page can change, say, from ALMOST_FULL
- * to ALMOST_EMPTY when freeing an object. This function checks if such
- * a status change has occurred for the given page and accordingly moves the
- * page from the freelist of the old fullness group to that of the new
- * fullness group.
+ * objects, the fullness status of the page can change, for instance, from
+ * INUSE_RATIO_80 to INUSE_RATIO_70 when freeing an object. This function
+ * checks if such a status change has occurred for the given page and
+ * accordingly moves the page from the list of the old fullness group to that
+ * of the new fullness group.
*/
-static enum fullness_group fix_fullness_group(struct size_class *class,
- struct zspage *zspage)
+static int fix_fullness_group(struct size_class *class, struct zspage *zspage)
{
int class_idx;
- enum fullness_group currfg, newfg;
+ int currfg, newfg;
get_zspage_mapping(zspage, &class_idx, &currfg);
newfg = get_fullness_group(class, zspage);
@@ -977,7 +958,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
struct page *page, *next;
- enum fullness_group fg;
+ int fg;
unsigned int class_idx;
get_zspage_mapping(zspage, &class_idx, &fg);
@@ -985,7 +966,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
assert_spin_locked(&pool->lock);
VM_BUG_ON(get_zspage_inuse(zspage));
- VM_BUG_ON(fg != ZS_EMPTY);
+ VM_BUG_ON(fg != ZS_INUSE_RATIO_0);
/* Free all deferred handles from zs_free */
free_handles(pool, class, zspage);
@@ -1003,9 +984,8 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
cache_free_zspage(pool, zspage);
- class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
- atomic_long_sub(class->pages_per_zspage,
- &pool->pages_allocated);
+ class_stat_dec(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
+ atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
}
static void free_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1024,7 +1004,7 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
return;
}
- remove_zspage(class, zspage, ZS_EMPTY);
+ remove_zspage(class, zspage, ZS_INUSE_RATIO_0);
#ifdef CONFIG_ZPOOL
list_del(&zspage->lru);
#endif
@@ -1160,9 +1140,9 @@ static struct zspage *find_get_zspage(struct size_class *class)
int i;
struct zspage *zspage;
- for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
+ for (i = ZS_INUSE_RATIO_99; i >= ZS_INUSE_RATIO_0; i--) {
zspage = list_first_entry_or_null(&class->fullness_list[i],
- struct zspage, list);
+ struct zspage, list);
if (zspage)
break;
}
@@ -1521,7 +1501,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
unsigned long handle, obj;
struct size_class *class;
- enum fullness_group newfg;
+ int newfg;
struct zspage *zspage;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
@@ -1543,7 +1523,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(class, zspage);
record_obj(handle, obj);
- class_stat_inc(class, OBJ_USED, 1);
+ class_stat_inc(class, ZS_OBJS_INUSE, 1);
spin_unlock(&pool->lock);
return handle;
@@ -1563,10 +1543,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
insert_zspage(class, zspage, newfg);
set_zspage_mapping(zspage, class->index, newfg);
record_obj(handle, obj);
- atomic_long_add(class->pages_per_zspage,
- &pool->pages_allocated);
- class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
- class_stat_inc(class, OBJ_USED, 1);
+ atomic_long_add(class->pages_per_zspage, &pool->pages_allocated);
+ class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
+ class_stat_inc(class, ZS_OBJS_INUSE, 1);
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
@@ -1622,7 +1601,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
struct page *f_page;
unsigned long obj;
struct size_class *class;
- enum fullness_group fullness;
+ int fullness;
if (IS_ERR_OR_NULL((void *)handle))
return;
@@ -1637,7 +1616,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
zspage = get_zspage(f_page);
class = zspage_class(pool, zspage);
- class_stat_dec(class, OBJ_USED, 1);
+ class_stat_dec(class, ZS_OBJS_INUSE, 1);
#ifdef CONFIG_ZPOOL
if (zspage->under_reclaim) {
@@ -1655,7 +1634,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
obj_free(class->size, obj, NULL);
fullness = fix_fullness_group(class, zspage);
- if (fullness == ZS_EMPTY)
+ if (fullness == ZS_INUSE_RATIO_0)
free_zspage(pool, class, zspage);
spin_unlock(&pool->lock);
@@ -1796,15 +1775,14 @@ struct zs_compact_control {
int obj_idx;
};
-static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
- struct zs_compact_control *cc)
+static void migrate_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zs_compact_control *cc)
{
unsigned long used_obj, free_obj;
unsigned long handle;
struct page *s_page = cc->s_page;
struct page *d_page = cc->d_page;
int obj_idx = cc->obj_idx;
- int ret = 0;
while (1) {
handle = find_alloced_obj(class, s_page, &obj_idx);
@@ -1817,10 +1795,8 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
}
/* Stop if there is no more space */
- if (zspage_full(class, get_zspage(d_page))) {
- ret = -ENOMEM;
+ if (zspage_full(class, get_zspage(d_page)))
break;
- }
used_obj = handle_to_obj(handle);
free_obj = obj_malloc(pool, get_zspage(d_page), handle);
@@ -1833,26 +1809,35 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
/* Remember last position in this iteration */
cc->s_page = s_page;
cc->obj_idx = obj_idx;
-
- return ret;
}
-static struct zspage *isolate_zspage(struct size_class *class, bool source)
+static struct zspage *isolate_src_zspage(struct size_class *class)
{
- int i;
struct zspage *zspage;
- enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL};
+ int fg;
- if (!source) {
- fg[0] = ZS_ALMOST_FULL;
- fg[1] = ZS_ALMOST_EMPTY;
+ for (fg = ZS_INUSE_RATIO_10; fg <= ZS_INUSE_RATIO_99; fg++) {
+ zspage = list_first_entry_or_null(&class->fullness_list[fg],
+ struct zspage, list);
+ if (zspage) {
+ remove_zspage(class, zspage, fg);
+ return zspage;
+ }
}
- for (i = 0; i < 2; i++) {
- zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
- struct zspage, list);
+ return zspage;
+}
+
+static struct zspage *isolate_dst_zspage(struct size_class *class)
+{
+ struct zspage *zspage;
+ int fg;
+
+ for (fg = ZS_INUSE_RATIO_99; fg >= ZS_INUSE_RATIO_10; fg--) {
+ zspage = list_first_entry_or_null(&class->fullness_list[fg],
+ struct zspage, list);
if (zspage) {
- remove_zspage(class, zspage, fg[i]);
+ remove_zspage(class, zspage, fg);
return zspage;
}
}
@@ -1865,12 +1850,11 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source)
* @class: destination class
* @zspage: target page
*
- * Return @zspage's fullness_group
+ * Return @zspage's fullness status
*/
-static enum fullness_group putback_zspage(struct size_class *class,
- struct zspage *zspage)
+static int putback_zspage(struct size_class *class, struct zspage *zspage)
{
- enum fullness_group fullness;
+ int fullness;
fullness = get_fullness_group(class, zspage);
insert_zspage(class, zspage, fullness);
@@ -2134,7 +2118,7 @@ static void async_free_zspage(struct work_struct *work)
int i;
struct size_class *class;
unsigned int class_idx;
- enum fullness_group fullness;
+ int fullness;
struct zspage *zspage, *tmp;
LIST_HEAD(free_pages);
struct zs_pool *pool = container_of(work, struct zs_pool,
@@ -2146,7 +2130,8 @@ static void async_free_zspage(struct work_struct *work)
continue;
spin_lock(&pool->lock);
- list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
+ list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0],
+ &free_pages);
spin_unlock(&pool->lock);
}
@@ -2155,7 +2140,7 @@ static void async_free_zspage(struct work_struct *work)
lock_zspage(zspage);
get_zspage_mapping(zspage, &class_idx, &fullness);
- VM_BUG_ON(fullness != ZS_EMPTY);
+ VM_BUG_ON(fullness != ZS_INUSE_RATIO_0);
class = pool->size_class[class_idx];
spin_lock(&pool->lock);
#ifdef CONFIG_ZPOOL
@@ -2203,8 +2188,8 @@ static inline void zs_flush_migration(struct zs_pool *pool) { }
static unsigned long zs_can_compact(struct size_class *class)
{
unsigned long obj_wasted;
- unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
- unsigned long obj_used = zs_stat_get(class, OBJ_USED);
+ unsigned long obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED);
+ unsigned long obj_used = zs_stat_get(class, ZS_OBJS_INUSE);
if (obj_allocated <= obj_used)
return 0;
@@ -2219,7 +2204,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
struct size_class *class)
{
struct zs_compact_control cc;
- struct zspage *src_zspage;
+ struct zspage *src_zspage = NULL;
struct zspage *dst_zspage = NULL;
unsigned long pages_freed = 0;
@@ -2228,50 +2213,45 @@ static unsigned long __zs_compact(struct zs_pool *pool,
* as well as zpage allocation/free
*/
spin_lock(&pool->lock);
- while ((src_zspage = isolate_zspage(class, true))) {
- /* protect someone accessing the zspage(i.e., zs_map_object) */
- migrate_write_lock(src_zspage);
+ while (zs_can_compact(class)) {
+ int fg;
- if (!zs_can_compact(class))
+ if (!dst_zspage) {
+ dst_zspage = isolate_dst_zspage(class);
+ if (!dst_zspage)
+ break;
+ migrate_write_lock(dst_zspage);
+ cc.d_page = get_first_page(dst_zspage);
+ }
+
+ src_zspage = isolate_src_zspage(class);
+ if (!src_zspage)
break;
+ migrate_write_lock_nested(src_zspage);
+
cc.obj_idx = 0;
cc.s_page = get_first_page(src_zspage);
+ migrate_zspage(pool, class, &cc);
+ fg = putback_zspage(class, src_zspage);
+ migrate_write_unlock(src_zspage);
- while ((dst_zspage = isolate_zspage(class, false))) {
- migrate_write_lock_nested(dst_zspage);
-
- cc.d_page = get_first_page(dst_zspage);
- /*
- * If there is no more space in dst_page, resched
- * and see if anyone had allocated another zspage.
- */
- if (!migrate_zspage(pool, class, &cc))
- break;
+ if (fg == ZS_INUSE_RATIO_0) {
+ free_zspage(pool, class, src_zspage);
+ pages_freed += class->pages_per_zspage;
+ src_zspage = NULL;
+ }
+ if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
+ || spin_is_contended(&pool->lock)) {
putback_zspage(class, dst_zspage);
migrate_write_unlock(dst_zspage);
dst_zspage = NULL;
- if (spin_is_contended(&pool->lock))
- break;
- }
-
- /* Stop if we couldn't find slot */
- if (dst_zspage == NULL)
- break;
- putback_zspage(class, dst_zspage);
- migrate_write_unlock(dst_zspage);
-
- if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
- migrate_write_unlock(src_zspage);
- free_zspage(pool, class, src_zspage);
- pages_freed += class->pages_per_zspage;
- } else
- migrate_write_unlock(src_zspage);
- spin_unlock(&pool->lock);
- cond_resched();
- spin_lock(&pool->lock);
+ spin_unlock(&pool->lock);
+ cond_resched();
+ spin_lock(&pool->lock);
+ }
}
if (src_zspage) {
@@ -2279,6 +2259,10 @@ static unsigned long __zs_compact(struct zs_pool *pool,
migrate_write_unlock(src_zspage);
}
+ if (dst_zspage) {
+ putback_zspage(class, dst_zspage);
+ migrate_write_unlock(dst_zspage);
+ }
spin_unlock(&pool->lock);
return pages_freed;
@@ -2421,7 +2405,7 @@ struct zs_pool *zs_create_pool(const char *name)
int pages_per_zspage;
int objs_per_zspage;
struct size_class *class;
- int fullness = 0;
+ int fullness;
size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
if (size > ZS_MAX_ALLOC_SIZE)
@@ -2475,9 +2459,12 @@ struct zs_pool *zs_create_pool(const char *name)
class->pages_per_zspage = pages_per_zspage;
class->objs_per_zspage = objs_per_zspage;
pool->size_class[i] = class;
- for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
- fullness++)
+
+ fullness = ZS_INUSE_RATIO_0;
+ while (fullness < NR_FULLNESS_GROUPS) {
INIT_LIST_HEAD(&class->fullness_list[fullness]);
+ fullness++;
+ }
prev_class = class;
}
@@ -2523,11 +2510,12 @@ void zs_destroy_pool(struct zs_pool *pool)
if (class->index != i)
continue;
- for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
- if (!list_empty(&class->fullness_list[fg])) {
- pr_info("Freeing non-empty class with size %db, fullness group %d\n",
- class->size, fg);
- }
+ for (fg = ZS_INUSE_RATIO_0; fg < NR_FULLNESS_GROUPS; fg++) {
+ if (list_empty(&class->fullness_list[fg]))
+ continue;
+
+ pr_err("Class-%d fullness group %d is not empty\n",
+ class->size, fg);
}
kfree(class);
}
@@ -2629,7 +2617,7 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
unsigned long handle;
struct zspage *zspage;
struct page *page;
- enum fullness_group fullness;
+ int fullness;
/* Lock LRU and fullness list */
spin_lock(&pool->lock);
@@ -2699,7 +2687,7 @@ next:
* while the page is removed from the pool. Fix it
* up for the check in __free_zspage().
*/
- zspage->fullness = ZS_EMPTY;
+ zspage->fullness = ZS_INUSE_RATIO_0;
__free_zspage(pool, class, zspage);
spin_unlock(&pool->lock);
diff --git a/mm/zswap.c b/mm/zswap.c
index f6c89049cf70..2f0ebd8bc620 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1073,15 +1073,23 @@ fail:
static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
- unsigned int pos;
unsigned long *page;
+ unsigned long val;
+ unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
page = (unsigned long *)ptr;
- for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
- if (page[pos] != page[0])
+ val = page[0];
+
+ if (val != page[last_pos])
+ return 0;
+
+ for (pos = 1; pos < last_pos; pos++) {
+ if (val != page[pos])
return 0;
}
- *value = page[0];
+
+ *value = val;
+
return 1;
}
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 854772dd52fd..9b66d6aeeb1a 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -843,7 +843,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
goto out;
/* the calculated number of cq entries fits to mlx5 cq allocation */
cqe_size_order = cache_line_size() == 128 ? 7 : 6;
- smc_order = MAX_ORDER - cqe_size_order - 1;
+ smc_order = MAX_ORDER - cqe_size_order;
if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index 64499056648a..51ad29940f05 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -38,7 +38,7 @@ static int param_set_bufsize(const char *val, const struct kernel_param *kp)
size = memparse(val, NULL);
order = get_order(size);
- if (order >= MAX_ORDER)
+ if (order > MAX_ORDER)
return -EINVAL;
ima_maxorder = order;
ima_bufsize = PAGE_SIZE << order;
diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h
index e65f89b12f1c..134f8eab0768 100644
--- a/tools/testing/memblock/linux/mmzone.h
+++ b/tools/testing/memblock/linux/mmzone.h
@@ -17,10 +17,10 @@ enum zone_type {
};
#define MAX_NR_ZONES __MAX_NR_ZONES
-#define MAX_ORDER 11
-#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
+#define MAX_ORDER 10
+#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
-#define pageblock_order (MAX_ORDER - 1)
+#define pageblock_order MAX_ORDER
#define pageblock_nr_pages BIT(pageblock_order)
#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages)
#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages)
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index 1e616a8c6a9c..f4f7c0aef702 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -98,6 +98,11 @@ static int alloc_anon_50M_check(const char *cgroup, void *arg)
int ret = -1;
buf = malloc(size);
+ if (buf == NULL) {
+ fprintf(stderr, "malloc() failed\n");
+ return -1;
+ }
+
for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
*ptr = 0;
@@ -211,6 +216,11 @@ static int alloc_anon_noexit(const char *cgroup, void *arg)
char *buf, *ptr;
buf = malloc(size);
+ if (buf == NULL) {
+ fprintf(stderr, "malloc() failed\n");
+ return -1;
+ }
+
for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
*ptr = 0;
@@ -778,6 +788,11 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
int ret = -1;
buf = malloc(size);
+ if (buf == NULL) {
+ fprintf(stderr, "malloc() failed\n");
+ return -1;
+ }
+
for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
*ptr = 0;
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 8984e0bb58c7..c0f93b668c0c 100644
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -220,7 +220,15 @@ CATEGORY="mremap" run_test ./mremap_test
CATEGORY="hugetlb" run_test ./thuge-gen
if [ $VADDR64 -ne 0 ]; then
+
+ # set overcommit_policy as OVERCOMMIT_ALWAYS so that kernel
+ # allows high virtual address allocation requests independent
+ # of platform's physical memory.
+
+ prev_policy=$(cat /proc/sys/vm/overcommit_memory)
+ echo 1 > /proc/sys/vm/overcommit_memory
CATEGORY="hugevm" run_test ./virtual_address_range
+ echo $prev_policy > /proc/sys/vm/overcommit_memory
# virtual address 128TB switch test
CATEGORY="hugevm" run_test ./va_128TBswitch.sh
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 76e1c36dd9e5..b8558c7f1a39 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -106,7 +106,7 @@ void split_pmd_thp(void)
for (i = 0; i < len; i++)
one_page[i] = (char)i;
- if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
+ if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
printf("No THP is allocated\n");
exit(EXIT_FAILURE);
}
@@ -122,7 +122,7 @@ void split_pmd_thp(void)
}
- if (check_huge_anon(one_page, 0, pmd_pagesize)) {
+ if (!check_huge_anon(one_page, 0, pmd_pagesize)) {
printf("Still AnonHugePages not split\n");
exit(EXIT_FAILURE);
}
@@ -169,7 +169,7 @@ void split_pte_mapped_thp(void)
for (i = 0; i < len; i++)
one_page[i] = (char)i;
- if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
+ if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
printf("No THP is allocated\n");
exit(EXIT_FAILURE);
}
diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c
index 7f22844ed704..a96d126cb40e 100644
--- a/tools/testing/selftests/mm/userfaultfd.c
+++ b/tools/testing/selftests/mm/userfaultfd.c
@@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len)
req.range.start = start;
req.range.len = len;
req.mode = 0;
+ if (test_uffdio_wp)
+ req.mode |= UFFDIO_CONTINUE_MODE_WP;
if (ioctl(ufd, UFFDIO_CONTINUE, &req))
err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
@@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void)
uffdio_register.range.start = (unsigned long)area_dst_alias;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
+ if (test_uffdio_wp)
+ uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
err("register failure");
@@ -1444,6 +1448,43 @@ static int pagemap_test_fork(bool present)
return result;
}
+static void userfaultfd_wp_unpopulated_test(int pagemap_fd)
+{
+ uint64_t value;
+
+ /* Test applying pte marker to anon unpopulated */
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ value = pagemap_read_vaddr(pagemap_fd, area_dst);
+ pagemap_check_wp(value, true);
+
+ /* Test unprotect on anon pte marker */
+ wp_range(uffd, (uint64_t)area_dst, page_size, false);
+ value = pagemap_read_vaddr(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+
+ /* Test zap on anon marker */
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+ value = pagemap_read_vaddr(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+
+ /* Test fault in after marker removed */
+ *area_dst = 1;
+ value = pagemap_read_vaddr(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+ /* Drop it to make pte none again */
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+
+ /* Test read-zero-page upon pte marker */
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ *(volatile char *)area_dst;
+ /* Drop it to make pte none again */
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+}
+
static void userfaultfd_pagemap_test(unsigned int test_pgsize)
{
struct uffdio_register uffdio_register;
@@ -1462,7 +1503,7 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize)
/* Flush so it doesn't flush twice in parent/child later */
fflush(stdout);
- uffd_test_ctx_init(0);
+ uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED);
if (test_pgsize > page_size) {
/* This is a thp test */
@@ -1482,6 +1523,10 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize)
pagemap_fd = pagemap_open();
+ /* Smoke test WP_UNPOPULATED first when it's still empty */
+ if (test_pgsize == page_size)
+ userfaultfd_wp_unpopulated_test(pagemap_fd);
+
/* Touch the page */
*area_dst = 1;
wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
@@ -1526,7 +1571,7 @@ static int userfaultfd_stress(void)
struct uffdio_register uffdio_register;
struct uffd_stats uffd_stats[nr_cpus];
- uffd_test_ctx_init(0);
+ uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED);
if (posix_memalign(&area, page_size, page_size))
err("out of memory");
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
index c0592646ed93..bae0ceaf95b1 100644
--- a/tools/testing/selftests/mm/virtual_address_range.c
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -15,11 +15,15 @@
/*
* Maximum address range mapped with a single mmap()
- * call is little bit more than 16GB. Hence 16GB is
+ * call is a little bit more than 1GB. Hence 1GB is
* chosen as the single chunk size for address space
* mapping.
*/
-#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */
+
+#define SZ_1GB (1024 * 1024 * 1024UL)
+#define SZ_1TB (1024 * 1024 * 1024 * 1024UL)
+
+#define MAP_CHUNK_SIZE SZ_1GB
/*
* Address space till 128TB is mapped without any hint
@@ -32,13 +36,15 @@
* till it reaches 512TB. One with size 128TB and the
* other being 384TB.
*
- * On Arm64 the address space is 256TB and no high mappings
- * are supported so far.
+ * On Arm64 the address space is 256TB and support for
+ * high mappings up to 4PB virtual address space has
+ * been added.
*/
-#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */
+#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */
#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL)
#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL)
+#define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL)
#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */
#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */
@@ -47,7 +53,7 @@
#define HIGH_ADDR_MARK ADDR_MARK_256TB
#define HIGH_ADDR_SHIFT 49
#define NR_CHUNKS_LOW NR_CHUNKS_256TB
-#define NR_CHUNKS_HIGH 0
+#define NR_CHUNKS_HIGH NR_CHUNKS_3840TB
#else
#define HIGH_ADDR_MARK ADDR_MARK_128TB
#define HIGH_ADDR_SHIFT 48
@@ -97,7 +103,7 @@ static int validate_lower_address_hint(void)
int main(int argc, char *argv[])
{
char *ptr[NR_CHUNKS_LOW];
- char *hptr[NR_CHUNKS_HIGH];
+ char **hptr;
char *hint;
unsigned long i, lchunks, hchunks;
@@ -115,6 +121,9 @@ int main(int argc, char *argv[])
return 1;
}
lchunks = i;
+ hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *));
+ if (hptr == NULL)
+ return 1;
for (i = 0; i < NR_CHUNKS_HIGH; i++) {
hint = hind_addr();
@@ -135,5 +144,6 @@ int main(int argc, char *argv[])
for (i = 0; i < hchunks; i++)
munmap(hptr[i], MAP_CHUNK_SIZE);
+ free(hptr);
return 0;
}