diff options
Diffstat (limited to 'arch')
86 files changed, 440 insertions, 1654 deletions
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index e5f881bc8288..8886ab539273 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1229,7 +1229,7 @@ arch_get_unmapped_area_1(unsigned long addr, unsigned long len, unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { unsigned long limit; diff --git a/arch/arc/mm/mmap.c b/arch/arc/mm/mmap.c index 69a915297155..2185afe8d59f 100644 --- a/arch/arc/mm/mmap.c +++ b/arch/arc/mm/mmap.c @@ -23,7 +23,8 @@ */ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 2286c2ea60ec..831793cd6ff9 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -61,7 +61,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address, return ret; } -#if USE_SPLIT_PTE_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) /* * If we are using split PTE locks, then we need to take the page * lock here. Otherwise we are using shared mm->page_table_lock @@ -80,10 +80,10 @@ static inline void do_pte_unlock(spinlock_t *ptl) { spin_unlock(ptl); } -#else /* !USE_SPLIT_PTE_PTLOCKS */ +#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static inline void do_pte_lock(spinlock_t *ptl) {} static inline void do_pte_unlock(spinlock_t *ptl) {} -#endif /* USE_SPLIT_PTE_PTLOCKS */ +#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, unsigned long pfn) diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index d65d0e6ed10a..3dbb383c26d5 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -28,7 +28,8 @@ */ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -78,8 +79,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) + const unsigned long len, const unsigned long pgoff, + const unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e430ff33c80d..49f054dcd4de 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -101,6 +101,7 @@ config ARM64 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_RT select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT @@ -2104,7 +2105,8 @@ config ARM64_MTE depends on ARM64_PAN select ARCH_HAS_SUBPAGE_FAULTS select ARCH_USES_HIGH_VMA_FLAGS - select ARCH_USES_PG_ARCH_X + select ARCH_USES_PG_ARCH_2 + select ARCH_USES_PG_ARCH_3 help Memory Tagging (part of the ARMv8.5 Extensions) provides architectural support for run-time, always-on detection of diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 7d7d97ad3cd5..4e350df9a02d 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -9,6 +9,7 @@ syscall-y += unistd_compat_32.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h +generic-y += mmzone.h generic-y += qrwlock.h generic-y += qspinlock.h generic-y += parport.h diff --git a/arch/arm64/include/asm/mmzone.h b/arch/arm64/include/asm/mmzone.h deleted file mode 100644 index fa17e01d9ab2..000000000000 --- a/arch/arm64/include/asm/mmzone.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_MMZONE_H -#define __ASM_MMZONE_H - -#ifdef CONFIG_NUMA - -#include <asm/numa.h> - -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[(nid)]) - -#endif /* CONFIG_NUMA */ -#endif /* __ASM_MMZONE_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 96c2b0b07c4c..c329ea061dc9 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -407,6 +407,7 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) /* * Select all bits except the pfn */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pfn = pte_pfn(pte); @@ -600,6 +601,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP))); } +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +#define pmd_special(pte) (!!((pmd_val(pte) & PTE_SPECIAL))) +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return set_pmd_bit(pmd, __pgprot(PTE_SPECIAL)); +} +#endif + #define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd)) #define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) #define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT) @@ -617,6 +626,27 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +#define pud_special(pte) pte_special(pud_pte(pud)) +#define pud_mkspecial(pte) pte_pud(pte_mkspecial(pud_pte(pud))) +#endif + +#define pmd_pgprot pmd_pgprot +static inline pgprot_t pmd_pgprot(pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd)); +} + +#define pud_pgprot pud_pgprot +static inline pgprot_t pud_pgprot(pud_t pud) +{ + unsigned long pfn = pud_pfn(pud); + + return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud)); +} + static inline void __set_pte_at(struct mm_struct *mm, unsigned long __always_unused addr, pte_t *ptep, pte_t pte, unsigned int nr) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 0f6ef432fb84..5fc3af9f8f29 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -5,6 +5,7 @@ #include <linux/cpumask.h> #ifdef CONFIG_NUMA +#include <asm/numa.h> struct pci_bus; int pcibus_to_node(struct pci_bus *bus); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 638b6205526f..f9e30dd34c7a 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -62,7 +62,6 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) */ num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU; tmp = kvrealloc(kvm->arch.nested_mmus, - size_mul(sizeof(*kvm->arch.nested_mmus), kvm->arch.nested_mmus_size), size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus), GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!tmp) diff --git a/arch/csky/abiv1/mmap.c b/arch/csky/abiv1/mmap.c index 7f826331d409..1047865e82a9 100644 --- a/arch/csky/abiv1/mmap.c +++ b/arch/csky/abiv1/mmap.c @@ -23,7 +23,8 @@ */ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; diff --git a/arch/csky/kernel/vdso.c b/arch/csky/kernel/vdso.c index 2ca886e4a458..5c9ef63c29f1 100644 --- a/arch/csky/kernel/vdso.c +++ b/arch/csky/kernel/vdso.c @@ -45,9 +45,16 @@ arch_initcall(vdso_init); int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { + struct vm_area_struct *vma; struct mm_struct *mm = current->mm; unsigned long vdso_base, vdso_len; int ret; + static struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + }; + static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + }; vdso_len = (vdso_pages + 1) << PAGE_SHIFT; @@ -65,22 +72,29 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, */ mm->context.vdso = (void *)vdso_base; - ret = - install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, + vdso_mapping.pages = vdso_pagelist; + vma = + _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), - vdso_pagelist); + &vdso_mapping); - if (unlikely(ret)) { + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); mm->context.vdso = NULL; goto end; } vdso_base += (vdso_pages << PAGE_SHIFT); - ret = install_special_mapping(mm, vdso_base, PAGE_SIZE, - (VM_READ | VM_MAYREAD), &vdso_pagelist[vdso_pages]); + vvar_mapping.pages = &vdso_pagelist[vdso_pages]; + vma = _install_special_mapping(mm, vdso_base, PAGE_SIZE, + (VM_READ | VM_MAYREAD), &vvar_mapping); - if (unlikely(ret)) + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); mm->context.vdso = NULL; + goto end; + } + ret = 0; end: mmap_write_unlock(mm); return ret; diff --git a/arch/hexagon/kernel/vdso.c b/arch/hexagon/kernel/vdso.c index 2e4872d62124..6fd27ff1df73 100644 --- a/arch/hexagon/kernel/vdso.c +++ b/arch/hexagon/kernel/vdso.c @@ -51,7 +51,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { int ret; unsigned long vdso_base; + struct vm_area_struct *vma; struct mm_struct *mm = current->mm; + static struct vm_special_mapping vdso_mapping = { + name = "[vdso]", + }; if (mmap_write_lock_killable(mm)) return -EINTR; @@ -66,16 +70,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } /* MAYWRITE to allow gdb to COW and set breakpoints. */ - ret = install_special_mapping(mm, vdso_base, PAGE_SIZE, + vdso_mapping.pages = &vdso_page; + vma = _install_special_mapping(mm, vdso_base, PAGE_SIZE, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &vdso_page); + &vdso_mapping); - if (ret) + ret = PTR_ERR(vma); + if (IS_ERR(vma)) goto up_fail; mm->context.vdso = (void *)vdso_base; - + ret = 0; up_fail: mmap_write_unlock(mm); return ret; diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index b4252c357c8e..75b366407a60 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -96,7 +96,6 @@ CONFIG_ZPOOL=y CONFIG_ZSWAP=y CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD=y CONFIG_ZBUD=y -CONFIG_Z3FOLD=y CONFIG_ZSMALLOC=m # CONFIG_COMPAT_BRK is not set CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 4635b755b2b4..5b5a6c90e6e2 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -8,5 +8,6 @@ generic-y += early_ioremap.h generic-y += qrwlock.h generic-y += user.h generic-y += ioctl.h +generic-y += mmzone.h generic-y += statfs.h generic-y += param.h diff --git a/arch/loongarch/include/asm/mmzone.h b/arch/loongarch/include/asm/mmzone.h deleted file mode 100644 index 2b9a90727e19..000000000000 --- a/arch/loongarch/include/asm/mmzone.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Author: Huacai Chen (chenhuacai@loongson.cn) - * Copyright (C) 2020-2022 Loongson Technology Corporation Limited - */ -#ifndef _ASM_MMZONE_H_ -#define _ASM_MMZONE_H_ - -#include <asm/page.h> -#include <asm/numa.h> - -extern struct pglist_data *node_data[]; - -#define NODE_DATA(nid) (node_data[(nid)]) - -#endif /* _ASM_MMZONE_H_ */ diff --git a/arch/loongarch/include/asm/topology.h b/arch/loongarch/include/asm/topology.h index 66128dec0bf6..50273c9187d0 100644 --- a/arch/loongarch/include/asm/topology.h +++ b/arch/loongarch/include/asm/topology.h @@ -8,6 +8,7 @@ #include <linux/smp.h> #ifdef CONFIG_NUMA +#include <asm/numa.h> extern cpumask_t cpus_on_node[]; diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index 8fe21f868f72..84fe7f854820 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -27,10 +27,7 @@ #include <asm/time.h> int numa_off; -struct pglist_data *node_data[MAX_NUMNODES]; unsigned char node_distances[MAX_NUMNODES][MAX_NUMNODES]; - -EXPORT_SYMBOL(node_data); EXPORT_SYMBOL(node_distances); static struct numa_meminfo numa_meminfo; @@ -190,24 +187,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return numa_add_memblk_to(nid, start, end, &numa_meminfo); } -static void __init alloc_node_data(int nid) -{ - void *nd; - unsigned long nd_pa; - size_t nd_sz = roundup(sizeof(pg_data_t), PAGE_SIZE); - - nd_pa = memblock_phys_alloc_try_nid(nd_sz, SMP_CACHE_BYTES, nid); - if (!nd_pa) { - pr_err("Cannot find %zu Byte for node_data (initial node: %d)\n", nd_sz, nid); - return; - } - - nd = __va(nd_pa); - - node_data[nid] = nd; - memset(nd, 0, sizeof(pg_data_t)); -} - static void __init node_mem_init(unsigned int node) { unsigned long start_pfn, end_pfn; diff --git a/arch/loongarch/mm/mmap.c b/arch/loongarch/mm/mmap.c index 889030985135..914e82ff3f65 100644 --- a/arch/loongarch/mm/mmap.c +++ b/arch/loongarch/mm/mmap.c @@ -89,7 +89,8 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp, } unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr0, len, pgoff, flags, UP); @@ -101,7 +102,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, */ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr0, len, pgoff, flags, DOWN); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 023ad33a7e94..397edf05dd72 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -502,7 +502,6 @@ config MACH_LOONGSON64 select USE_OF select BUILTIN_DTB select PCI_HOST_GENERIC - select HAVE_ARCH_NODEDATA_EXTENSION if NUMA help This enables the support of Loongson-2/3 family of machines. @@ -735,7 +734,6 @@ config SGI_IP27 select WAR_R10000_LLSC select MIPS_L1_CACHE_SHIFT_7 select NUMA - select HAVE_ARCH_NODEDATA_EXTENSION help This are the SGI Origin 200, Origin 2000 and Onyx 2 Graphics workstations. To compile a Linux kernel that runs on these, say Y @@ -2613,9 +2611,6 @@ config NUMA config SYS_SUPPORTS_NUMA bool -config HAVE_ARCH_NODEDATA_EXTENSION - bool - config RELOCATABLE bool "Relocatable kernel" depends on SYS_SUPPORTS_RELOCATABLE diff --git a/arch/mips/include/asm/mach-ip27/mmzone.h b/arch/mips/include/asm/mach-ip27/mmzone.h index 08c36e50a860..56959eb9cb26 100644 --- a/arch/mips/include/asm/mach-ip27/mmzone.h +++ b/arch/mips/include/asm/mach-ip27/mmzone.h @@ -22,7 +22,6 @@ struct node_data { extern struct node_data *__node_data[]; -#define NODE_DATA(n) (&__node_data[(n)]->pglist) #define hub_data(n) (&__node_data[(n)]->hub) #endif /* _ASM_MACH_MMZONE_H */ diff --git a/arch/mips/include/asm/mach-loongson64/mmzone.h b/arch/mips/include/asm/mach-loongson64/mmzone.h index a3d65d37b8b5..8fb70fd3c9c4 100644 --- a/arch/mips/include/asm/mach-loongson64/mmzone.h +++ b/arch/mips/include/asm/mach-loongson64/mmzone.h @@ -14,10 +14,6 @@ #define pa_to_nid(addr) (((addr) & 0xf00000000000) >> NODE_ADDRSPACE_SHIFT) #define nid_to_addrbase(nid) ((unsigned long)(nid) << NODE_ADDRSPACE_SHIFT) -extern struct pglist_data *__node_data[]; - -#define NODE_DATA(n) (__node_data[n]) - extern void __init prom_init_numa_memory(void); #endif /* _ASM_MACH_MMZONE_H */ diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 68dafd6d3e25..8388400d052f 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -29,8 +29,6 @@ unsigned char __node_distances[MAX_NUMNODES][MAX_NUMNODES]; EXPORT_SYMBOL(__node_distances); -struct pglist_data *__node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(__node_data); cpumask_t __node_cpumask[MAX_NUMNODES]; EXPORT_SYMBOL(__node_cpumask); @@ -83,12 +81,8 @@ static void __init init_topology_matrix(void) static void __init node_mem_init(unsigned int node) { - struct pglist_data *nd; unsigned long node_addrspace_offset; unsigned long start_pfn, end_pfn; - unsigned long nd_pa; - int tnid; - const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); node_addrspace_offset = nid_to_addrbase(node); pr_info("Node%d's addrspace_offset is 0x%lx\n", @@ -98,16 +92,8 @@ static void __init node_mem_init(unsigned int node) pr_info("Node%d: start_pfn=0x%lx, end_pfn=0x%lx\n", node, start_pfn, end_pfn); - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, node); - if (!nd_pa) - panic("Cannot allocate %zu bytes for node %d data\n", - nd_size, node); - nd = __va(nd_pa); - memset(nd, 0, sizeof(struct pglist_data)); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != node) - pr_info("NODE_DATA(%d) on node %d\n", node, tnid); - __node_data[node] = nd; + alloc_node_data(node); + NODE_DATA(node)->node_start_pfn = start_pfn; NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn; @@ -198,13 +184,3 @@ void __init prom_init_numa_memory(void) pr_info("CP0_PageGrain: CP0 5.1 (0x%x)\n", read_c0_pagegrain()); prom_meminit(); } - -pg_data_t * __init arch_alloc_nodedata(int nid) -{ - return memblock_alloc(sizeof(pg_data_t), SMP_CACHE_BYTES); -} - -void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - __node_data[nid] = pgdat; -} diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 7e11d7b58761..5d2a1225785b 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -98,7 +98,8 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp, } unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr0, len, pgoff, flags, UP); @@ -110,7 +111,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, */ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr0, len, pgoff, flags, DOWN); diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index b8ca94cfb4fe..1963313f55d8 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -35,7 +35,6 @@ #define PFN_NASIDSHFT (NASID_SHFT - PAGE_SHIFT) struct node_data *__node_data[MAX_NUMNODES]; - EXPORT_SYMBOL(__node_data); static u64 gen_region_mask(void) @@ -361,6 +360,7 @@ static void __init node_mem_init(nasid_t node) */ __node_data[node] = __va(slot_freepfn << PAGE_SHIFT); memset(__node_data[node], 0, PAGE_SIZE); + node_data[node] = &__node_data[node]->pglist; NODE_DATA(node)->node_start_pfn = start_pfn; NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn; @@ -423,13 +423,3 @@ void __init mem_init(void) memblock_free_all(); setup_zero_pages(); /* This comes from node 0 */ } - -pg_data_t * __init arch_alloc_nodedata(int nid) -{ - return memblock_alloc(sizeof(pg_data_t), SMP_CACHE_BYTES); -} - -void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - __node_data[nid] = (struct node_data *)pgdat; -} diff --git a/arch/mips/sgi-ip27/ip27-smp.c b/arch/mips/sgi-ip27/ip27-smp.c index 5d2652a1d35a..62733e049570 100644 --- a/arch/mips/sgi-ip27/ip27-smp.c +++ b/arch/mips/sgi-ip27/ip27-smp.c @@ -70,11 +70,13 @@ void cpu_node_probe(void) gda_t *gdap = GDA; nodes_clear(node_online_map); + nodes_clear(node_possible_map); for (i = 0; i < MAX_NUMNODES; i++) { nasid_t nasid = gdap->g_nasidtable[i]; if (nasid == INVALID_NASID) break; node_set_online(nasid); + node_set(nasid, node_possible_map); highest = node_scan_cpus(nasid, highest); } diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 3459df28afee..a2278485de19 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -82,6 +82,10 @@ void __init mmu_init(void) pgd_t swapper_pg_dir[PTRS_PER_PGD] __aligned(PAGE_SIZE); pte_t invalid_pte_table[PTRS_PER_PTE] __aligned(PAGE_SIZE); static struct page *kuser_page[1]; +static struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + .pages = kuser_page, +}; static int alloc_kuser_page(void) { @@ -106,18 +110,18 @@ arch_initcall(alloc_kuser_page); int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; - int ret; + struct vm_area_struct *vma; mmap_write_lock(mm); /* Map kuser helpers to user space address */ - ret = install_special_mapping(mm, KUSER_BASE, KUSER_SIZE, + vma = _install_special_mapping(mm, KUSER_BASE, KUSER_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | - VM_MAYEXEC, kuser_page); + VM_MAYEXEC, &vdso_mapping); mmap_write_unlock(mm); - return ret; + return IS_ERR(vma) ? PTR_ERR(vma) : 0; } const char *arch_vma_name(struct vm_area_struct *vma) diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index f7722451276e..f852fe274abe 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -167,7 +167,8 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp, } unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr, len, pgoff, flags, UP); @@ -175,7 +176,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { return arch_get_unmapped_area_common(filp, addr, len, pgoff, flags, DOWN); diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index 0356199bd9e7..aa664f7ddb63 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -40,7 +40,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, addr = ALIGN(addr, huge_page_size(h)); /* we need to make sure the colouring is OK */ - return arch_get_unmapped_area(file, addr, len, pgoff, flags); + return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); } diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 6001d580c0dd..a5e3e7f97f4d 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -81,7 +81,6 @@ CONFIG_MODULE_SIG_SHA512=y CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=m CONFIG_ZSWAP=y -CONFIG_Z3FOLD=y CONFIG_ZSMALLOC=y # CONFIG_SLAB_MERGE_DEFAULT is not set CONFIG_SLAB_FREELIST_RANDOM=y diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index f8ba72573cae..6d98e6f08d4d 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1098,6 +1098,7 @@ extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot); extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); +extern pud_t pud_modify(pud_t pud, pgprot_t newprot); extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); extern void set_pud_at(struct mm_struct *mm, unsigned long addr, @@ -1358,6 +1359,8 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, #define __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +extern pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp); #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 99707456c2cd..a157ab513347 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -257,15 +257,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, extern void arch_exit_mmap(struct mm_struct *mm); -static inline void arch_unmap(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - unsigned long vdso_base = (unsigned long)mm->context.vdso; - - if (start <= vdso_base && vdso_base < end) - mm->context.vdso = NULL; -} - #ifdef CONFIG_PPC_MEM_KEYS bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign); diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index da827d2d0866..d99863cd6cde 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -20,12 +20,6 @@ #ifdef CONFIG_NUMA -extern struct pglist_data *node_data[]; -/* - * Return a pointer to the node data for node n. - */ -#define NODE_DATA(nid) (node_data[nid]) - /* * Following are specific to this numa platform. */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 264a6c09517a..2f72ad885332 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -65,6 +65,7 @@ static inline unsigned long pte_pfn(pte_t pte) /* * Select all bits except the pfn */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pte_flags; diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 7a2ff9010f17..ee4b9d676cff 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -81,6 +81,21 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); } +static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * close() is called for munmap() but also for mremap(). In the mremap() + * case the vdso pointer has already been updated by the mremap() hook + * above, so it must not be set to NULL here. + */ + if (vma->vm_start != (unsigned long)mm->context.vdso) + return; + + mm->context.vdso = NULL; +} + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf); @@ -92,11 +107,13 @@ static struct vm_special_mapping vvar_spec __ro_after_init = { static struct vm_special_mapping vdso32_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso32_mremap, + .close = vdso_close, }; static struct vm_special_mapping vdso64_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso64_mremap, + .close = vdso_close, }; #ifdef CONFIG_TIME_NS @@ -197,13 +214,6 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int /* Add required alignment. */ vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT); - /* - * Put vDSO base into mm struct. We need to do this before calling - * install_special_mapping or the perf counter mmap tracking code - * will fail to recognise it as a vDSO. - */ - mm->context.vdso = (void __user *)vdso_base + vvar_size; - vma = _install_special_mapping(mm, vdso_base, vvar_size, VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | VM_PFNMAP, &vvar_spec); @@ -223,10 +233,15 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int vma = _install_special_mapping(mm, vdso_base + vvar_size, vdso_size, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, vdso_spec); - if (IS_ERR(vma)) + if (IS_ERR(vma)) { do_munmap(mm, vdso_base, vvar_size, NULL); + return PTR_ERR(vma); + } + + // Now that the mappings are in place, set the mm VDSO pointer + mm->context.vdso = (void __user *)vdso_base + vvar_size; - return PTR_ERR_OR_ZERO(vma); + return 0; } int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) @@ -240,8 +255,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return -EINTR; rc = __arch_setup_additional_pages(bprm, uses_interp); - if (rc) - mm->context.vdso = NULL; mmap_write_unlock(mm); return rc; diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index f4d8d3c40e5c..5a4a75369043 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -176,6 +176,17 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, return __pmd(old_pmd); } +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + unsigned long old_pud; + + VM_WARN_ON_ONCE(!pud_present(*pudp)); + old_pud = pud_hugepage_update(vma->vm_mm, address, pudp, _PAGE_PRESENT, _PAGE_INVALID); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return __pud(old_pud); +} + pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full) { @@ -259,6 +270,15 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) pmdv &= _HPAGE_CHG_MASK; return pmd_set_protbits(__pmd(pmdv), newprot); } + +pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + unsigned long pudv; + + pudv = pud_val(pud); + pudv &= _HPAGE_CHG_MASK; + return pud_set_protbits(__pud(pudv), newprot); +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* For use by kexec, called with MMU off */ diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c index ef3ce37f1bb3..87307d0fc3b8 100644 --- a/arch/powerpc/mm/book3s64/slice.c +++ b/arch/powerpc/mm/book3s64/slice.c @@ -637,10 +637,11 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, + vm_flags_t vm_flags) { if (radix_enabled()) - return generic_get_unmapped_area(filp, addr, len, pgoff, flags); + return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); return slice_get_unmapped_area(addr, len, flags, mm_ctx_user_psize(¤t->mm->context), 0); @@ -650,10 +651,11 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, - const unsigned long flags) + const unsigned long flags, + vm_flags_t vm_flags) { if (radix_enabled()) - return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags); + return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags, vm_flags); return slice_get_unmapped_area(addr0, len, flags, mm_ctx_user_psize(¤t->mm->context), 1); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index aa89899f0c1a..3c1da08304d0 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -43,11 +43,9 @@ static char *cmdline __initdata; int numa_cpu_lookup_table[NR_CPUS]; cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; -struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(node_to_cpumask_map); -EXPORT_SYMBOL(node_data); static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; @@ -1095,27 +1093,9 @@ void __init dump_numa_cpu_topology(void) static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) { u64 spanned_pages = end_pfn - start_pfn; - const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); - u64 nd_pa; - void *nd; - int tnid; - - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) - panic("Cannot allocate %zu bytes for node %d data\n", - nd_size, nid); - - nd = __va(nd_pa); - - /* report and initialize */ - pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n", - nd_pa, nd_pa + nd_size - 1); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) - pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); - - node_data[nid] = nd; - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + + alloc_node_data(nid); + NODE_DATA(nid)->node_id = nid; NODE_DATA(nid)->node_start_pfn = start_pfn; NODE_DATA(nid)->node_spanned_pages = spanned_pages; diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 8c31802f97e8..e89f64a0f24a 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -136,10 +136,10 @@ void pte_fragment_free(unsigned long *table, int kernel) #ifdef CONFIG_TRANSPARENT_HUGEPAGE void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { - struct page *page; + struct folio *folio; - page = virt_to_page(pgtable); - SetPageActive(page); + folio = virt_to_folio(pgtable); + folio_set_active(folio); pte_fragment_free((unsigned long *)pgtable, 0); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index ab0656115424..7316396e452d 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -297,6 +297,12 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, } #if defined(CONFIG_PPC_8xx) + +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) || defined(CONFIG_SPLIT_PMD_PTLOCKS) +/* We need the same lock to protect the PMD table and the two PTE tables. */ +#error "8M hugetlb folios are incompatible with split page table locks" +#endif + static void __set_huge_pte_at(pmd_t *pmd, pte_t *ptep, pte_basic_t val) { pte_basic_t *entry = (pte_basic_t *)ptep; diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c index c29e85db5f35..1574176e3ffc 100644 --- a/arch/powerpc/platforms/pseries/papr-vpd.c +++ b/arch/powerpc/platforms/pseries/papr-vpd.c @@ -156,10 +156,7 @@ static int vpd_blob_extend(struct vpd_blob *blob, const char *data, size_t len) const char *old_ptr = blob->data; char *new_ptr; - new_ptr = old_ptr ? - kvrealloc(old_ptr, old_len, new_len, GFP_KERNEL_ACCOUNT) : - kvmalloc(len, GFP_KERNEL_ACCOUNT); - + new_ptr = kvrealloc(old_ptr, new_len, GFP_KERNEL_ACCOUNT); if (!new_ptr) return -ENOMEM; diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index 5c589770f2a8..1461af12da6e 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -5,6 +5,7 @@ syscall-y += syscall_table_64.h generic-y += early_ioremap.h generic-y += flat.h generic-y += kvm_para.h +generic-y += mmzone.h generic-y += parport.h generic-y += spinlock.h generic-y += spinlock_types.h diff --git a/arch/riscv/include/asm/mmzone.h b/arch/riscv/include/asm/mmzone.h deleted file mode 100644 index fa17e01d9ab2..000000000000 --- a/arch/riscv/include/asm/mmzone.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_MMZONE_H -#define __ASM_MMZONE_H - -#ifdef CONFIG_NUMA - -#include <asm/numa.h> - -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[(nid)]) - -#endif /* CONFIG_NUMA */ -#endif /* __ASM_MMZONE_H */ diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h index 61183688bdd5..fe1a8bf6902d 100644 --- a/arch/riscv/include/asm/topology.h +++ b/arch/riscv/include/asm/topology.h @@ -4,6 +4,10 @@ #include <linux/arch_topology.h> +#ifdef CONFIG_NUMA +#include <asm/numa.h> +#endif + /* Replace task scheduler's default frequency-invariant accounting */ #define arch_scale_freq_tick topology_scale_freq_tick #define arch_set_freq_scale topology_set_freq_scale diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 4b904110d27c..297bf7157968 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -7,3 +7,4 @@ generated-y += unistd_nr.h generic-y += asm-offsets.h generic-y += kvm_types.h generic-y += mcs_spinlock.h +generic-y += mmzone.h diff --git a/arch/s390/include/asm/mmzone.h b/arch/s390/include/asm/mmzone.h deleted file mode 100644 index 73e3e7c6976c..000000000000 --- a/arch/s390/include/asm/mmzone.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * Copyright IBM Corp. 2015 - */ - -#ifndef _ASM_S390_MMZONE_H -#define _ASM_S390_MMZONE_H - -#ifdef CONFIG_NUMA - -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) - -#endif /* CONFIG_NUMA */ -#endif /* _ASM_S390_MMZONE_H */ diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 16e4caa931f1..73e1e03317b4 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -176,8 +176,6 @@ static inline int devmem_is_allowed(unsigned long pfn) int arch_make_folio_accessible(struct folio *folio); #define HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE -int arch_make_page_accessible(struct page *page); -#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE struct vm_layout { unsigned long kaslr_offset; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 3fa280d0672a..0ffbaf741955 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -955,6 +955,7 @@ static inline int pte_unused(pte_t pte) * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID * must not be set. */ +#define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK; diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index 23ab9f02f278..ddc1448ea2e1 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -14,9 +14,6 @@ #include <linux/node.h> #include <asm/numa.h> -struct pglist_data *node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(node_data); - void __init numa_setup(void) { int nid; diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 36db065c7cf7..9646f773208a 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -14,6 +14,7 @@ #include <linux/memblock.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/pagewalk.h> #include <asm/facility.h> #include <asm/sections.h> #include <asm/uv.h> @@ -462,9 +463,9 @@ EXPORT_SYMBOL_GPL(gmap_convert_to_secure); int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) { struct vm_area_struct *vma; + struct folio_walk fw; unsigned long uaddr; struct folio *folio; - struct page *page; int rc; rc = -EFAULT; @@ -483,11 +484,15 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) goto out; rc = 0; - /* we take an extra reference here */ - page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET); - if (IS_ERR_OR_NULL(page)) + folio = folio_walk_start(&fw, vma, uaddr, 0); + if (!folio) goto out; - folio = page_folio(page); + /* + * See gmap_make_secure(): large folios cannot be secure. Small + * folio implies FW_LEVEL_PTE. + */ + if (folio_test_large(folio) || !pte_write(fw.pte)) + goto out_walk_end; rc = uv_destroy_folio(folio); /* * Fault handlers can race; it is possible that two CPUs will fault @@ -500,7 +505,8 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) */ if (rc) rc = uv_convert_from_secure_folio(folio); - folio_put(folio); +out_walk_end: + folio_walk_end(&fw, vma); out: mmap_read_unlock(gmap->mm); return rc; @@ -548,11 +554,6 @@ int arch_make_folio_accessible(struct folio *folio) } EXPORT_SYMBOL_GPL(arch_make_folio_accessible); -int arch_make_page_accessible(struct page *page) -{ - return arch_make_folio_accessible(page_folio(page)); -} -EXPORT_SYMBOL_GPL(arch_make_page_accessible); static ssize_t uv_query_facilities(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 8e149ef5e89b..ad8b0d6b77ea 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -34,6 +34,7 @@ #include <linux/uaccess.h> #include <linux/hugetlb.h> #include <linux/kfence.h> +#include <linux/pagewalk.h> #include <asm/asm-extable.h> #include <asm/asm-offsets.h> #include <asm/ptrace.h> @@ -492,9 +493,9 @@ void do_secure_storage_access(struct pt_regs *regs) union teid teid = { .val = regs->int_parm_long }; unsigned long addr = get_fault_address(regs); struct vm_area_struct *vma; + struct folio_walk fw; struct mm_struct *mm; struct folio *folio; - struct page *page; struct gmap *gmap; int rc; @@ -536,15 +537,18 @@ void do_secure_storage_access(struct pt_regs *regs) vma = find_vma(mm, addr); if (!vma) return handle_fault_error(regs, SEGV_MAPERR); - page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); - if (IS_ERR_OR_NULL(page)) { + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) { mmap_read_unlock(mm); break; } - folio = page_folio(page); - if (arch_make_folio_accessible(folio)) - send_sig(SIGSEGV, current, 0); + /* arch_make_folio_accessible() needs a raised refcount. */ + folio_get(folio); + rc = arch_make_folio_accessible(folio); folio_put(folio); + folio_walk_end(&fw, vma); + if (rc) + send_sig(SIGSEGV, current, 0); mmap_read_unlock(mm); break; case KERNEL_FAULT: diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 206756946589..96efa061ce01 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -82,7 +82,7 @@ static int get_align_mask(struct file *filp, unsigned long flags) unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -117,7 +117,7 @@ check_asce_limit: unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 5398729bfe1b..de5c0b389a3e 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -118,12 +118,11 @@ static inline int __memcpy_toio_inuser(void __iomem *dst, SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, const void __user *, user_buffer, size_t, length) { + struct follow_pfnmap_args args = { }; u8 local_buf[64]; void __iomem *io_addr; void *buf; struct vm_area_struct *vma; - pte_t *ptep; - spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -169,11 +168,13 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma, mmio_addr, &ptep, &ptl); + args.address = mmio_addr; + args.vma = vma; + ret = follow_pfnmap_start(&args); if (ret) goto out_unlock_mmap; - io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) @@ -181,7 +182,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, ret = zpci_memcpy_toio(io_addr, buf, length); out_unlock_pt: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unlock_mmap: mmap_read_unlock(current->mm); out_free: @@ -260,12 +261,11 @@ static inline int __memcpy_fromio_inuser(void __user *dst, SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, void __user *, user_buffer, size_t, length) { + struct follow_pfnmap_args args = { }; u8 local_buf[64]; void __iomem *io_addr; void *buf; struct vm_area_struct *vma; - pte_t *ptep; - spinlock_t *ptl; long ret; if (!zpci_is_enabled()) @@ -308,11 +308,13 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma, mmio_addr, &ptep, &ptl); + args.vma = vma; + args.address = mmio_addr; + ret = follow_pfnmap_start(&args); if (ret) goto out_unlock_mmap; - io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) | + io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) | (mmio_addr & ~PAGE_MASK)); if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) { @@ -322,7 +324,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, ret = zpci_memcpy_fromio(buf, io_addr, length); out_unlock_pt: - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unlock_mmap: mmap_read_unlock(current->mm); diff --git a/arch/sh/include/asm/mmzone.h b/arch/sh/include/asm/mmzone.h index 7b8dead2723d..63f88b465e39 100644 --- a/arch/sh/include/asm/mmzone.h +++ b/arch/sh/include/asm/mmzone.h @@ -5,9 +5,6 @@ #ifdef CONFIG_NUMA #include <linux/numa.h> -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) - static inline int pfn_to_nid(unsigned long pfn) { int nid; diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index 1bd85a6949c4..add35c51e017 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -36,6 +36,10 @@ __setup("vdso=", vdso_setup); */ extern const char vsyscall_trapa_start, vsyscall_trapa_end; static struct page *syscall_pages[1]; +static struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + .pages = syscall_pages, +}; int __init vsyscall_init(void) { @@ -58,6 +62,7 @@ int __init vsyscall_init(void) int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; unsigned long addr; int ret; @@ -70,14 +75,17 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto up_fail; } - ret = install_special_mapping(mm, addr, PAGE_SIZE, + vdso_mapping.pages = syscall_pages; + vma = _install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - syscall_pages); - if (unlikely(ret)) + &vdso_mapping); + ret = PTR_ERR(vma); + if (IS_ERR(vma)) goto up_fail; current->mm->context.vdso = (void *)addr; + ret = 0; up_fail: mmap_write_unlock(mm); diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index d1fe90b2f5ff..2a88b0c9e70f 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -212,12 +212,7 @@ void __init allocate_pgdat(unsigned int nid) get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #ifdef CONFIG_NUMA - NODE_DATA(nid) = memblock_alloc_try_nid( - sizeof(struct pglist_data), - SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); - if (!NODE_DATA(nid)) - panic("Can't allocate pgdat for node %d\n", nid); + alloc_node_data(nid); #endif NODE_DATA(nid)->node_start_pfn = start_pfn; diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c index bee329d4149a..c442734d9b0c 100644 --- a/arch/sh/mm/mmap.c +++ b/arch/sh/mm/mmap.c @@ -52,7 +52,8 @@ static inline unsigned long COLOUR_ALIGN(unsigned long addr, } unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -99,7 +100,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, - const unsigned long flags) + const unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c index 50f0dc1744d0..9bc212b5e762 100644 --- a/arch/sh/mm/numa.c +++ b/arch/sh/mm/numa.c @@ -14,9 +14,6 @@ #include <linux/pfn.h> #include <asm/sections.h> -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL_GPL(node_data); - /* * On SH machines the conventional approach is to stash system RAM * in node 0, and other memory blocks in to node 1 and up, ordered by diff --git a/arch/sparc/include/asm/mmzone.h b/arch/sparc/include/asm/mmzone.h index a236d8aa893a..74eb2c71d077 100644 --- a/arch/sparc/include/asm/mmzone.h +++ b/arch/sparc/include/asm/mmzone.h @@ -6,10 +6,6 @@ #include <linux/cpumask.h> -extern struct pglist_data *node_data[]; - -#define NODE_DATA(nid) (node_data[nid]) - extern int numa_cpu_lookup_table[]; extern cpumask_t numa_cpumask_lookup_table[]; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 3fe429d73a65..2b7f358762c1 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -783,6 +783,7 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) return __pmd(pte_val(pte)); } +#define pmd_pgprot pmd_pgprot static inline pgprot_t pmd_pgprot(pmd_t entry) { unsigned long val = pmd_val(entry); diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 08a19727795c..80822f922e76 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -39,7 +39,7 @@ SYSCALL_DEFINE0(getpagesize) return PAGE_SIZE; /* Possibly older binaries want 8192 on sun4's? */ } -unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_unmapped_area_info info = {}; diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index d9c3b34ca744..acade309dc2f 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -87,7 +87,7 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr, return base + off; } -unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct * vma; @@ -146,7 +146,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, - const unsigned long flags) + const unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 53d7cb5bbffe..21f8cbbd0581 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1075,14 +1075,9 @@ static void __init allocate_node_data(int nid) { struct pglist_data *p; unsigned long start_pfn, end_pfn; -#ifdef CONFIG_NUMA - NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data), - SMP_CACHE_BYTES, nid); - if (!NODE_DATA(nid)) { - prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid); - prom_halt(); - } +#ifdef CONFIG_NUMA + alloc_node_data(nid); NODE_DATA(nid)->node_id = nid; #endif @@ -1115,11 +1110,9 @@ static void init_node_masks_nonnuma(void) } #ifdef CONFIG_NUMA -struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(numa_cpumask_lookup_table); -EXPORT_SYMBOL(node_data); static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio, u32 cfg_handle) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 273b17a5cb81..2852fcd82cbd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86_64 select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE @@ -299,6 +300,7 @@ config X86 select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK select NEED_SG_DMA_LENGTH + select NUMA_MEMBLKS if NUMA select PCI_DOMAINS if PCI select PCI_LOCKLESS_CONFIG if PCI select PERF_EVENTS @@ -1601,14 +1603,6 @@ config X86_64_ACPI_NUMA help Enable ACPI SRAT based node topology detection. -config NUMA_EMU - bool "NUMA emulation" - depends on NUMA - help - Enable NUMA emulation. A flat machine will be split - into virtual nodes when booted with "numa=fake=N", where N is the - number of nodes. This is only useful for debugging. - config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP range 1 10 @@ -1808,6 +1802,7 @@ config X86_PAT def_bool y prompt "x86 PAT support" if EXPERT depends on MTRR + select ARCH_USES_PG_ARCH_2 help Use PAT attributes to setup page level cache control. @@ -1819,10 +1814,6 @@ config X86_PAT If unsure, say Y. -config ARCH_USES_PG_UNCACHED - def_bool y - depends on X86_PAT - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 944454306ef4..04a35b2c26e9 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -511,7 +511,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) if (init_unaccepted_memory()) { debug_putstr("Accepting memory... "); - accept_memory(__pa(output), __pa(output) + needed_size); + accept_memory(__pa(output), needed_size); } entry_offset = decompress_kernel(output, virt_addr, error); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index b353a7be380c..dd8d1a85f671 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -256,6 +256,6 @@ static inline bool init_unaccepted_memory(void) { return false; } /* Defined in EFI stub */ extern struct efi_unaccepted_memory *unaccepted_table; -void accept_memory(phys_addr_t start, phys_addr_t end); +void accept_memory(phys_addr_t start, unsigned long size); #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index a192bdea69e2..6c23d1661b17 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,3 +11,4 @@ generated-y += xen-hypercalls.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h +generic-y += mmzone.h diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 19091ebb8633..2886cb668d7f 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -238,11 +238,6 @@ static inline bool is_64bit_mm(struct mm_struct *mm) } #endif -static inline void arch_unmap(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ -} - /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h deleted file mode 100644 index c41b41edd691..000000000000 --- a/arch/x86/include/asm/mmzone.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_X86_32 -# include <asm/mmzone_32.h> -#else -# include <asm/mmzone_64.h> -#endif diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h deleted file mode 100644 index 2d4515e8b7df..000000000000 --- a/arch/x86/include/asm/mmzone_32.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002 - * - */ - -#ifndef _ASM_X86_MMZONE_32_H -#define _ASM_X86_MMZONE_32_H - -#include <asm/smp.h> - -#ifdef CONFIG_NUMA -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) -#endif /* CONFIG_NUMA */ - -#endif /* _ASM_X86_MMZONE_32_H */ diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h deleted file mode 100644 index 0c585046f744..000000000000 --- a/arch/x86/include/asm/mmzone_64.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* K8 NUMA support */ -/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */ -/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */ -#ifndef _ASM_X86_MMZONE_64_H -#define _ASM_X86_MMZONE_64_H - -#ifdef CONFIG_NUMA - -#include <linux/mmdebug.h> -#include <asm/smp.h> - -extern struct pglist_data *node_data[]; - -#define NODE_DATA(nid) (node_data[nid]) - -#endif -#endif /* _ASM_X86_MMZONE_64_H */ diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index ef2844d69173..5469d7a7c40f 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -10,8 +10,6 @@ #ifdef CONFIG_NUMA -#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) - extern int numa_off; /* @@ -25,9 +23,6 @@ extern int numa_off; extern s16 __apicid_to_node[MAX_LOCAL_APIC]; extern nodemask_t numa_nodes_parsed __initdata; -extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); -extern void __init numa_set_distance(int from, int to, int distance); - static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; @@ -54,31 +49,20 @@ static inline int numa_cpu_node(int cpu) extern void numa_set_node(int cpu, int node); extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); -extern void numa_add_cpu(int cpu); -extern void numa_remove_cpu(int cpu); +extern void numa_add_cpu(unsigned int cpu); +extern void numa_remove_cpu(unsigned int cpu); extern void init_gi_nodes(void); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } static inline void init_cpu_to_node(void) { } -static inline void numa_add_cpu(int cpu) { } -static inline void numa_remove_cpu(int cpu) { } +static inline void numa_add_cpu(unsigned int cpu) { } +static inline void numa_remove_cpu(unsigned int cpu) { } static inline void init_gi_nodes(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS -void debug_cpumask_set_cpu(int cpu, int node, bool enable); +void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable); #endif -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -int numa_emu_cmdline(char *str); -#else /* CONFIG_NUMA_EMU */ -static inline int numa_emu_cmdline(char *str) -{ - return -EINVAL; -} -#endif /* CONFIG_NUMA_EMU */ - #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e39311a89bf4..4c2d080d26b4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -120,6 +120,34 @@ extern pmdval_t early_pmd_flags; #define arch_end_context_switch(prev) do {} while(0) #endif /* CONFIG_PARAVIRT_XXL */ +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v & ~clear); +} + +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v | set); +} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v & ~clear); +} + /* * The following only work if pte_present() is true. * Undefined behaviour if not.. @@ -174,6 +202,13 @@ static inline int pud_young(pud_t pud) return pud_flags(pud) & _PAGE_ACCESSED; } +static inline bool pud_shstk(pud_t pud) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pud_flags(pud) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == + (_PAGE_DIRTY | _PAGE_PSE); +} + static inline int pte_write(pte_t pte) { /* @@ -310,6 +345,30 @@ static inline int pud_devmap(pud_t pud) } #endif +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SPECIAL; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return pud_flags(pud) & _PAGE_SPECIAL; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + static inline int pgd_devmap(pgd_t pgd) { return 0; @@ -480,20 +539,6 @@ static inline pte_t pte_mkdevmap(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); } -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v | set); -} - -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v & ~clear); -} - /* See comments above mksaveddirty_shift() */ static inline pmd_t pmd_mksaveddirty(pmd_t pmd) { @@ -588,20 +633,6 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); #define pmd_mkwrite pmd_mkwrite -static inline pud_t pud_set_flags(pud_t pud, pudval_t set) -{ - pudval_t v = native_pud_val(pud); - - return native_make_pud(v | set); -} - -static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) -{ - pudval_t v = native_pud_val(pud); - - return native_make_pud(v & ~clear); -} - /* See comments above mksaveddirty_shift() */ static inline pud_t pud_mksaveddirty(pud_t pud) { @@ -780,6 +811,12 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd) __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } +static inline pud_t pud_mkinvalid(pud_t pud) +{ + return pfn_pud(pud_pfn(pud), + __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) @@ -827,14 +864,8 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) pmd_result = __pmd(val); /* - * To avoid creating Write=0,Dirty=1 PMDs, pte_modify() needs to avoid: - * 1. Marking Write=0 PMDs Dirty=1 - * 2. Marking Dirty=1 PMDs Write=0 - * - * The first case cannot happen because the _PAGE_CHG_MASK will filter - * out any Dirty bit passed in newprot. Handle the second case by - * going through the mksaveddirty exercise. Only do this if the old - * value was Write=1 to avoid doing this on Shadow Stack PTEs. + * Avoid creating shadow stack PMD by accident. See comment in + * pte_modify(). */ if (oldval & _PAGE_RW) pmd_result = pmd_mksaveddirty(pmd_result); @@ -844,6 +875,29 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pmd_result; } +static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + pudval_t val = pud_val(pud), oldval = val; + pud_t pud_result; + + val &= _HPAGE_CHG_MASK; + val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; + val = flip_protnone_guard(oldval, val, PHYSICAL_PUD_PAGE_MASK); + + pud_result = __pud(val); + + /* + * Avoid creating shadow stack PUD by accident. See comment in + * pte_modify(). + */ + if (oldval & _PAGE_RW) + pud_result = pud_mksaveddirty(pud_result); + else + pud_result = pud_clear_saveddirty(pud_result); + + return pud_result; +} + /* * mprotect needs to preserve PAT and encryption bits when updating * vm_page_prot @@ -1078,8 +1132,7 @@ static inline pmd_t *pud_pgtable(pud_t pud) #define pud_leaf pud_leaf static inline bool pud_leaf(pud_t pud) { - return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); + return pud_val(pud) & _PAGE_PSE; } static inline int pud_bad(pud_t pud) @@ -1383,10 +1436,28 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline pud_t pudp_establish(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, pud_t pud) +{ + page_table_check_pud_set(vma->vm_mm, pudp, pud); + if (IS_ENABLED(CONFIG_SMP)) { + return xchg(pudp, pud); + } else { + pud_t old = *pudp; + WRITE_ONCE(*pudp, pud); + return old; + } +} +#endif + #define __HAVE_ARCH_PMDP_INVALIDATE_AD extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp); + /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. @@ -1668,6 +1739,9 @@ void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); #define arch_check_zapped_pmd arch_check_zapped_pmd void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd); +#define arch_check_zapped_pud arch_check_zapped_pud +void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud); + #ifdef CONFIG_XEN_PV #define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young static inline bool arch_has_hw_nonleaf_pmd_young(void) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 3c4407271d08..7e9db77231ac 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -245,7 +245,6 @@ extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define HAVE_ARCH_UNMAPPED_AREA_VMFLAGS #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 64df897c0ee3..3918c7a434f5 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -31,13 +31,4 @@ #endif /* CONFIG_SPARSEMEM */ -#ifndef __ASSEMBLY__ -#ifdef CONFIG_NUMA_KEEP_MEMINFO -extern int phys_to_target_node(phys_addr_t start); -#define phys_to_target_node phys_to_target_node -extern int memory_add_physaddr_to_nid(u64 start); -#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid -#endif -#endif /* __ASSEMBLY__ */ - #endif /* _ASM_X86_SPARSEMEM_H */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 01d7cd85ef97..87f8c9a71c49 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -121,7 +121,7 @@ static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) } unsigned long -arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, +arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; @@ -158,7 +158,7 @@ arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned l } unsigned long -arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr0, +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { @@ -228,20 +228,5 @@ bottomup: * can happen with large stack limits and large mmap() * allocations. */ - return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); -} - -unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0); -} - -unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) -{ - return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff, flags, 0); + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags, 0); } diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 8d3a00e5c528..690fbf48e853 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -57,7 +57,6 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o -obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 9332b36a1091..628833afee37 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/nodemask.h> #include <linux/memblock.h> +#include <linux/numa_memblks.h> #include <asm/io.h> #include <linux/pci_ids.h> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 6ce10e3c6228..64e5cdb2460a 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -13,6 +13,7 @@ #include <linux/sched.h> #include <linux/topology.h> #include <linux/sort.h> +#include <linux/numa_memblks.h> #include <asm/e820/api.h> #include <asm/proto.h> @@ -22,16 +23,6 @@ #include "numa_internal.h" int numa_off; -nodemask_t numa_nodes_parsed __initdata; - -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -static struct numa_meminfo numa_meminfo __initdata_or_meminfo; -static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; - -static int numa_distance_cnt; -static u8 *numa_distance; static __init int numa_setup(char *opt) { @@ -124,456 +115,27 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } -static int __init numa_add_memblk_to(int nid, u64 start, u64 end, - struct numa_meminfo *mi) -{ - /* ignore zero length blks */ - if (start == end) - return 0; - - /* whine about and ignore invalid blks */ - if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", - nid, start, end - 1); - return 0; - } - - if (mi->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("too many memblk ranges\n"); - return -EINVAL; - } - - mi->blk[mi->nr_blks].start = start; - mi->blk[mi->nr_blks].end = end; - mi->blk[mi->nr_blks].nid = nid; - mi->nr_blks++; - return 0; -} - -/** - * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo - * @idx: Index of memblk to remove - * @mi: numa_meminfo to remove memblk from - * - * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and - * decrementing @mi->nr_blks. - */ -void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) -{ - mi->nr_blks--; - memmove(&mi->blk[idx], &mi->blk[idx + 1], - (mi->nr_blks - idx) * sizeof(mi->blk[0])); -} - -/** - * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another - * @dst: numa_meminfo to append block to - * @idx: Index of memblk to remove - * @src: numa_meminfo to remove memblk from - */ -static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, - struct numa_meminfo *src) -{ - dst->blk[dst->nr_blks++] = src->blk[idx]; - numa_remove_memblk_from(idx, src); -} - -/** - * numa_add_memblk - Add one numa_memblk to numa_meminfo - * @nid: NUMA node ID of the new memblk - * @start: Start address of the new memblk - * @end: End address of the new memblk - * - * Add a new memblk to the default numa_meminfo. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - return numa_add_memblk_to(nid, start, end, &numa_meminfo); -} - -/* Allocate NODE_DATA for a node on the local memory */ -static void __init alloc_node_data(int nid) +static int __init numa_register_nodes(void) { - const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - u64 nd_pa; - void *nd; - int tnid; - - /* - * Allocate node data. Try node-local memory and then any node. - * Never allocate in DMA zone. - */ - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) { - pr_err("Cannot find %zu bytes in any node (initial node: %d)\n", - nd_size, nid); - return; - } - nd = __va(nd_pa); - - /* report and initialize */ - printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, - nd_pa, nd_pa + nd_size - 1); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); - - node_data[nid] = nd; - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); - - node_set_online(nid); -} - -/** - * numa_cleanup_meminfo - Cleanup a numa_meminfo - * @mi: numa_meminfo to clean up - * - * Sanitize @mi by merging and removing unnecessary memblks. Also check for - * conflicts and clear unused memblks. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_cleanup_meminfo(struct numa_meminfo *mi) -{ - const u64 low = 0; - const u64 high = PFN_PHYS(max_pfn); - int i, j, k; - - /* first, trim all entries */ - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi = &mi->blk[i]; - - /* move / save reserved memory ranges */ - if (!memblock_overlaps_region(&memblock.memory, - bi->start, bi->end - bi->start)) { - numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); - continue; - } - - /* make sure all non-reserved blocks are inside the limits */ - bi->start = max(bi->start, low); - - /* preserve info for non-RAM areas above 'max_pfn': */ - if (bi->end > high) { - numa_add_memblk_to(bi->nid, high, bi->end, - &numa_reserved_meminfo); - bi->end = high; - } - - /* and there's no empty block */ - if (bi->start >= bi->end) - numa_remove_memblk_from(i--, mi); - } - - /* merge neighboring / overlapping entries */ - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi = &mi->blk[i]; - - for (j = i + 1; j < mi->nr_blks; j++) { - struct numa_memblk *bj = &mi->blk[j]; - u64 start, end; - - /* - * See whether there are overlapping blocks. Whine - * about but allow overlaps of the same nid. They - * will be merged below. - */ - if (bi->end > bj->start && bi->start < bj->end) { - if (bi->nid != bj->nid) { - pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", - bi->nid, bi->start, bi->end - 1, - bj->nid, bj->start, bj->end - 1); - return -EINVAL; - } - pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", - bi->nid, bi->start, bi->end - 1, - bj->start, bj->end - 1); - } - - /* - * Join together blocks on the same node, holes - * between which don't overlap with memory on other - * nodes. - */ - if (bi->nid != bj->nid) - continue; - start = min(bi->start, bj->start); - end = max(bi->end, bj->end); - for (k = 0; k < mi->nr_blks; k++) { - struct numa_memblk *bk = &mi->blk[k]; - - if (bi->nid == bk->nid) - continue; - if (start < bk->end && end > bk->start) - break; - } - if (k < mi->nr_blks) - continue; - printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", - bi->nid, bi->start, bi->end - 1, bj->start, - bj->end - 1, start, end - 1); - bi->start = start; - bi->end = end; - numa_remove_memblk_from(j--, mi); - } - } - - /* clear unused ones */ - for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { - mi->blk[i].start = mi->blk[i].end = 0; - mi->blk[i].nid = NUMA_NO_NODE; - } - - return 0; -} - -/* - * Set nodes, which have memory in @mi, in *@nodemask. - */ -static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, - const struct numa_meminfo *mi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mi->blk); i++) - if (mi->blk[i].start != mi->blk[i].end && - mi->blk[i].nid != NUMA_NO_NODE) - node_set(mi->blk[i].nid, *nodemask); -} - -/** - * numa_reset_distance - Reset NUMA distance table - * - * The current table is freed. The next numa_set_distance() call will - * create a new one. - */ -void __init numa_reset_distance(void) -{ - size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - - /* numa_distance could be 1LU marking allocation failure, test cnt */ - if (numa_distance_cnt) - memblock_free(numa_distance, size); - numa_distance_cnt = 0; - numa_distance = NULL; /* enable table creation */ -} - -static int __init numa_alloc_distance(void) -{ - nodemask_t nodes_parsed; - size_t size; - int i, j, cnt = 0; - u64 phys; - - /* size the new table and allocate it */ - nodes_parsed = numa_nodes_parsed; - numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); - - for_each_node_mask(i, nodes_parsed) - cnt = i; - cnt++; - size = cnt * cnt * sizeof(numa_distance[0]); - - phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, - PFN_PHYS(max_pfn_mapped)); - if (!phys) { - pr_warn("Warning: can't allocate distance table!\n"); - /* don't retry until explicitly reset */ - numa_distance = (void *)1LU; - return -ENOMEM; - } - - numa_distance = __va(phys); - numa_distance_cnt = cnt; - - /* fill with the default distances */ - for (i = 0; i < cnt; i++) - for (j = 0; j < cnt; j++) - numa_distance[i * cnt + j] = i == j ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); - - return 0; -} - -/** - * numa_set_distance - Set NUMA distance from one NUMA to another - * @from: the 'from' node to set distance - * @to: the 'to' node to set distance - * @distance: NUMA distance - * - * Set the distance from node @from to @to to @distance. If distance table - * doesn't exist, one which is large enough to accommodate all the currently - * known nodes will be created. - * - * If such table cannot be allocated, a warning is printed and further - * calls are ignored until the distance table is reset with - * numa_reset_distance(). - * - * If @from or @to is higher than the highest known node or lower than zero - * at the time of table creation or @distance doesn't make sense, the call - * is ignored. - * This is to allow simplification of specific NUMA config implementations. - */ -void __init numa_set_distance(int from, int to, int distance) -{ - if (!numa_distance && numa_alloc_distance() < 0) - return; - - if (from >= numa_distance_cnt || to >= numa_distance_cnt || - from < 0 || to < 0) { - pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - if ((u8)distance != distance || - (from == to && distance != LOCAL_DISTANCE)) { - pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - numa_distance[from * numa_distance_cnt + to] = distance; -} - -int __node_distance(int from, int to) -{ - if (from >= numa_distance_cnt || to >= numa_distance_cnt) - return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return numa_distance[from * numa_distance_cnt + to]; -} -EXPORT_SYMBOL(__node_distance); - -/* - * Mark all currently memblock-reserved physical memory (which covers the - * kernel's own memory ranges) as hot-unswappable. - */ -static void __init numa_clear_kernel_node_hotplug(void) -{ - nodemask_t reserved_nodemask = NODE_MASK_NONE; - struct memblock_region *mb_region; - int i; - - /* - * We have to do some preprocessing of memblock regions, to - * make them suitable for reservation. - * - * At this time, all memory regions reserved by memblock are - * used by the kernel, but those regions are not split up - * along node boundaries yet, and don't necessarily have their - * node ID set yet either. - * - * So iterate over all memory known to the x86 architecture, - * and use those ranges to set the nid in memblock.reserved. - * This will split up the memblock regions along node - * boundaries and will set the node IDs as well. - */ - for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = numa_meminfo.blk + i; - int ret; - - ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); - WARN_ON_ONCE(ret); - } - - /* - * Now go over all reserved memblock regions, to construct a - * node mask of all kernel reserved memory areas. - * - * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, - * numa_meminfo might not include all memblock.reserved - * memory ranges, because quirks such as trim_snb_memory() - * reserve specific pages for Sandy Bridge graphics. ] - */ - for_each_reserved_mem_region(mb_region) { - int nid = memblock_get_region_node(mb_region); - - if (nid != NUMA_NO_NODE) - node_set(nid, reserved_nodemask); - } - - /* - * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory - * belonging to the reserved node mask. - * - * Note that this will include memory regions that reside - * on nodes that contain kernel memory - entire nodes - * become hot-unpluggable: - */ - for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = numa_meminfo.blk + i; - - if (!node_isset(mb->nid, reserved_nodemask)) - continue; - - memblock_clear_hotplug(mb->start, mb->end - mb->start); - } -} - -static int __init numa_register_memblks(struct numa_meminfo *mi) -{ - int i, nid; - - /* Account for nodes with cpus and no memory */ - node_possible_map = numa_nodes_parsed; - numa_nodemask_from_meminfo(&node_possible_map, mi); - if (WARN_ON(nodes_empty(node_possible_map))) - return -EINVAL; - - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *mb = &mi->blk[i]; - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.memory, mb->nid); - } - - /* - * At very early time, the kernel have to use some memory such as - * loading the kernel image. We cannot prevent this anyway. So any - * node the kernel resides in should be un-hotpluggable. - * - * And when we come here, alloc node data won't fail. - */ - numa_clear_kernel_node_hotplug(); - - /* - * If sections array is gonna be used for pfn -> nid mapping, check - * whether its granularity is fine enough. - */ - if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) { - unsigned long pfn_align = node_map_pfn_alignment(); - - if (pfn_align && pfn_align < PAGES_PER_SECTION) { - pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", - PFN_PHYS(pfn_align) >> 20, - PFN_PHYS(PAGES_PER_SECTION) >> 20); - return -EINVAL; - } - } + int nid; if (!memblock_validate_numa_coverage(SZ_1M)) return -EINVAL; /* Finally register nodes. */ for_each_node_mask(nid, node_possible_map) { - u64 start = PFN_PHYS(max_pfn); - u64 end = 0; - - for (i = 0; i < mi->nr_blks; i++) { - if (nid != mi->blk[i].nid) - continue; - start = min(mi->blk[i].start, start); - end = max(mi->blk[i].end, end); - } + unsigned long start_pfn, end_pfn; - if (start >= end) + /* + * Note, get_pfn_range_for_nid() depends on + * memblock_set_node() having already happened + */ + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + if (start_pfn >= end_pfn) continue; alloc_node_data(nid); + node_set_online(nid); } /* Dump memblock with node info and return. */ @@ -609,39 +171,11 @@ static int __init numa_init(int (*init_func)(void)) for (i = 0; i < MAX_LOCAL_APIC; i++) set_apicid_to_node(i, NUMA_NO_NODE); - nodes_clear(numa_nodes_parsed); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, - NUMA_NO_NODE)); - WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, - NUMA_NO_NODE)); - /* In case that parsing SRAT failed. */ - WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); - numa_reset_distance(); - - ret = init_func(); - if (ret < 0) - return ret; - - /* - * We reset memblock back to the top-down direction - * here because if we configured ACPI_NUMA, we have - * parsed SRAT in init_func(). It is ok to have the - * reset here even if we did't configure ACPI_NUMA - * or acpi numa init fails and fallbacks to dummy - * numa init. - */ - memblock_set_bottom_up(false); - - ret = numa_cleanup_meminfo(&numa_meminfo); + ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true); if (ret < 0) return ret; - numa_emulation(&numa_meminfo, numa_distance_cnt); - - ret = numa_register_memblks(&numa_meminfo); + ret = numa_register_nodes(); if (ret < 0) return ret; @@ -782,12 +316,12 @@ void __init init_cpu_to_node(void) #ifndef CONFIG_DEBUG_PER_CPU_MAPS # ifndef CONFIG_NUMA_EMU -void numa_add_cpu(int cpu) +void numa_add_cpu(unsigned int cpu) { cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } -void numa_remove_cpu(int cpu) +void numa_remove_cpu(unsigned int cpu) { cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } @@ -825,7 +359,7 @@ int early_cpu_to_node(int cpu) return per_cpu(x86_cpu_to_node_map, cpu); } -void debug_cpumask_set_cpu(int cpu, int node, bool enable) +void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable) { struct cpumask *mask; @@ -857,12 +391,12 @@ static void numa_set_cpumask(int cpu, bool enable) debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); } -void numa_add_cpu(int cpu) +void numa_add_cpu(unsigned int cpu) { numa_set_cpumask(cpu, true); } -void numa_remove_cpu(int cpu) +void numa_remove_cpu(unsigned int cpu) { numa_set_cpumask(cpu, false); } @@ -893,113 +427,29 @@ EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#ifdef CONFIG_NUMA_KEEP_MEMINFO -static int meminfo_to_nid(struct numa_meminfo *mi, u64 start) +#ifdef CONFIG_NUMA_EMU +void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, + unsigned int nr_emu_nids) { - int i; - - for (i = 0; i < mi->nr_blks; i++) - if (mi->blk[i].start <= start && mi->blk[i].end > start) - return mi->blk[i].nid; - return NUMA_NO_NODE; -} - -int phys_to_target_node(phys_addr_t start) -{ - int nid = meminfo_to_nid(&numa_meminfo, start); + int i, j; /* - * Prefer online nodes, but if reserved memory might be - * hot-added continue the search with reserved ranges. + * Transform __apicid_to_node table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. */ - if (nid != NUMA_NO_NODE) - return nid; - - return meminfo_to_nid(&numa_reserved_meminfo, start); -} -EXPORT_SYMBOL_GPL(phys_to_target_node); - -int memory_add_physaddr_to_nid(u64 start) -{ - int nid = meminfo_to_nid(&numa_meminfo, start); - - if (nid == NUMA_NO_NODE) - nid = numa_meminfo.blk[0].nid; - return nid; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); - -#endif - -static int __init cmp_memblk(const void *a, const void *b) -{ - const struct numa_memblk *ma = *(const struct numa_memblk **)a; - const struct numa_memblk *mb = *(const struct numa_memblk **)b; - - return (ma->start > mb->start) - (ma->start < mb->start); + for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { + if (__apicid_to_node[i] == NUMA_NO_NODE) + continue; + for (j = 0; j < nr_emu_nids; j++) + if (__apicid_to_node[i] == emu_nid_to_phys[j]) + break; + __apicid_to_node[i] = j < nr_emu_nids ? j : 0; + } } -static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; - -/** - * numa_fill_memblks - Fill gaps in numa_meminfo memblks - * @start: address to begin fill - * @end: address to end fill - * - * Find and extend numa_meminfo memblks to cover the physical - * address range @start-@end - * - * RETURNS: - * 0 : Success - * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end - */ - -int __init numa_fill_memblks(u64 start, u64 end) +u64 __init numa_emu_dma_end(void) { - struct numa_memblk **blk = &numa_memblk_list[0]; - struct numa_meminfo *mi = &numa_meminfo; - int count = 0; - u64 prev_end; - - /* - * Create a list of pointers to numa_meminfo memblks that - * overlap start, end. The list is used to make in-place - * changes that fill out the numa_meminfo memblks. - */ - for (int i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi = &mi->blk[i]; - - if (memblock_addrs_overlap(start, end - start, bi->start, - bi->end - bi->start)) { - blk[count] = &mi->blk[i]; - count++; - } - } - if (!count) - return NUMA_NO_MEMBLK; - - /* Sort the list of pointers in memblk->start order */ - sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL); - - /* Make sure the first/last memblks include start/end */ - blk[0]->start = min(blk[0]->start, start); - blk[count - 1]->end = max(blk[count - 1]->end, end); - - /* - * Fill any gaps by tracking the previous memblks - * end address and backfilling to it if needed. - */ - prev_end = blk[0]->end; - for (int i = 1; i < count; i++) { - struct numa_memblk *curr = blk[i]; - - if (prev_end >= curr->start) { - if (prev_end < curr->end) - prev_end = curr->end; - } else { - curr->start = prev_end; - prev_end = curr->end; - } - } - return 0; + return PFN_PHYS(MAX_DMA32_PFN); } +#endif /* CONFIG_NUMA_EMU */ diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c deleted file mode 100644 index 9a9305367fdd..000000000000 --- a/arch/x86/mm/numa_emulation.c +++ /dev/null @@ -1,585 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA emulation - */ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/topology.h> -#include <linux/memblock.h> -#include <asm/dma.h> - -#include "numa_internal.h" - -static int emu_nid_to_phys[MAX_NUMNODES]; -static char *emu_cmdline __initdata; - -int __init numa_emu_cmdline(char *str) -{ - emu_cmdline = str; - return 0; -} - -static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) -{ - int i; - - for (i = 0; i < mi->nr_blks; i++) - if (mi->blk[i].nid == nid) - return i; - return -ENOENT; -} - -static u64 __init mem_hole_size(u64 start, u64 end) -{ - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = PFN_DOWN(end); - - if (start_pfn < end_pfn) - return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); - return 0; -} - -/* - * Sets up nid to range from @start to @end. The return value is -errno if - * something went wrong, 0 otherwise. - */ -static int __init emu_setup_memblk(struct numa_meminfo *ei, - struct numa_meminfo *pi, - int nid, int phys_blk, u64 size) -{ - struct numa_memblk *eb = &ei->blk[ei->nr_blks]; - struct numa_memblk *pb = &pi->blk[phys_blk]; - - if (ei->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("NUMA: Too many emulated memblks, failing emulation\n"); - return -EINVAL; - } - - ei->nr_blks++; - eb->start = pb->start; - eb->end = pb->start + size; - eb->nid = nid; - - if (emu_nid_to_phys[nid] == NUMA_NO_NODE) - emu_nid_to_phys[nid] = pb->nid; - - pb->start += size; - if (pb->start >= pb->end) { - WARN_ON_ONCE(pb->start > pb->end); - numa_remove_memblk_from(phys_blk, pi); - } - - printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", - nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); - return 0; -} - -/* - * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. - * - * Returns zero on success or negative on error. - */ -static int __init split_nodes_interleave(struct numa_meminfo *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, int nr_nodes) -{ - nodemask_t physnode_mask = numa_nodes_parsed; - u64 size; - int big; - int nid = 0; - int i, ret; - - if (nr_nodes <= 0) - return -1; - if (nr_nodes > MAX_NUMNODES) { - pr_info("numa=fake=%d too large, reducing to %d\n", - nr_nodes, MAX_NUMNODES); - nr_nodes = MAX_NUMNODES; - } - - /* - * Calculate target node size. x86_32 freaks on __udivdi3() so do - * the division in ulong number of pages and convert back. - */ - size = max_addr - addr - mem_hole_size(addr, max_addr); - size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); - - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the remainder. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / - FAKE_NODE_MIN_SIZE; - - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - pr_err("Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - /* - * Continue to fill physical nodes with fake nodes until there is no - * memory left on any of them. - */ - while (!nodes_empty(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); - u64 start, limit, end; - int phys_blk; - - phys_blk = emu_find_memblk_by_nid(i, pi); - if (phys_blk < 0) { - node_clear(i, physnode_mask); - continue; - } - start = pi->blk[phys_blk].start; - limit = pi->blk[phys_blk].end; - end = start + size; - - if (nid < big) - end += FAKE_NODE_MIN_SIZE; - - /* - * Continue to add memory to this fake node if its - * non-reserved memory is less than the per-node size. - */ - while (end - start - mem_hole_size(start, end) < size) { - end += FAKE_NODE_MIN_SIZE; - if (end > limit) { - end = limit; - break; - } - } - - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. - */ - if (end < dma32_end && dma32_end - end - - mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end = dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. - */ - if (limit - end - mem_hole_size(end, limit) < size) - end = limit; - - ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, - phys_blk, - min(end, limit) - start); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * Returns the end address of a node so that there is at least `size' amount of - * non-reserved memory or `max_addr' is reached. - */ -static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) -{ - u64 end = start + size; - - while (end - start - mem_hole_size(start, end) < size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } - return end; -} - -static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) -{ - unsigned long max_pfn = PHYS_PFN(max_addr); - unsigned long base_pfn = PHYS_PFN(base); - unsigned long hole_pfns = PHYS_PFN(hole); - - return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); -} - -/* - * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. - * - * Returns zero on success or negative on error. - */ -static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, u64 size, - int nr_nodes, struct numa_memblk *pblk, - int nid) -{ - nodemask_t physnode_mask = numa_nodes_parsed; - int i, ret, uniform = 0; - u64 min_size; - - if ((!size && !nr_nodes) || (nr_nodes && !pblk)) - return -1; - - /* - * In the 'uniform' case split the passed in physical node by - * nr_nodes, in the non-uniform case, ignore the passed in - * physical block and try to create nodes of at least size - * @size. - * - * In the uniform case, split the nodes strictly by physical - * capacity, i.e. ignore holes. In the non-uniform case account - * for holes and treat @size as a minimum floor. - */ - if (!nr_nodes) - nr_nodes = MAX_NUMNODES; - else { - nodes_clear(physnode_mask); - node_set(pblk->nid, physnode_mask); - uniform = 1; - } - - if (uniform) { - min_size = uniform_size(max_addr, addr, 0, nr_nodes); - size = min_size; - } else { - /* - * The limit on emulated nodes is MAX_NUMNODES, so the - * size per node is increased accordingly if the - * requested size is too small. This creates a uniform - * distribution of node sizes across the entire machine - * (but not necessarily over physical nodes). - */ - min_size = uniform_size(max_addr, addr, - mem_hole_size(addr, max_addr), nr_nodes); - } - min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); - if (size < min_size) { - pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", - size >> 20, min_size >> 20); - size = min_size; - } - size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); - - /* - * Fill physical nodes with fake nodes of size until there is no memory - * left on any of them. - */ - while (!nodes_empty(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); - u64 start, limit, end; - int phys_blk; - - phys_blk = emu_find_memblk_by_nid(i, pi); - if (phys_blk < 0) { - node_clear(i, physnode_mask); - continue; - } - - start = pi->blk[phys_blk].start; - limit = pi->blk[phys_blk].end; - - if (uniform) - end = start + size; - else - end = find_end_of_node(start, limit, size); - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. - */ - if (end < dma32_end && dma32_end - end - - mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end = dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. - */ - if ((limit - end - mem_hole_size(end, limit) < size) - && !uniform) - end = limit; - - ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, - phys_blk, - min(end, limit) - start); - if (ret < 0) - return ret; - } - } - return nid; -} - -static int __init split_nodes_size_interleave(struct numa_meminfo *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, u64 size) -{ - return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, - 0, NULL, 0); -} - -static int __init setup_emu2phys_nid(int *dfl_phys_nid) -{ - int i, max_emu_nid = 0; - - *dfl_phys_nid = NUMA_NO_NODE; - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { - if (emu_nid_to_phys[i] != NUMA_NO_NODE) { - max_emu_nid = i; - if (*dfl_phys_nid == NUMA_NO_NODE) - *dfl_phys_nid = emu_nid_to_phys[i]; - } - } - - return max_emu_nid; -} - -/** - * numa_emulation - Emulate NUMA nodes - * @numa_meminfo: NUMA configuration to massage - * @numa_dist_cnt: The size of the physical NUMA distance table - * - * Emulate NUMA nodes according to the numa=fake kernel parameter. - * @numa_meminfo contains the physical memory configuration and is modified - * to reflect the emulated configuration on success. @numa_dist_cnt is - * used to determine the size of the physical distance table. - * - * On success, the following modifications are made. - * - * - @numa_meminfo is updated to reflect the emulated nodes. - * - * - __apicid_to_node[] is updated such that APIC IDs are mapped to the - * emulated nodes. - * - * - NUMA distance table is rebuilt to represent distances between emulated - * nodes. The distances are determined considering how emulated nodes - * are mapped to physical nodes and match the actual distances. - * - * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical - * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). - * - * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with - * identity mapping and no other modification is made. - */ -void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) -{ - static struct numa_meminfo ei __initdata; - static struct numa_meminfo pi __initdata; - const u64 max_addr = PFN_PHYS(max_pfn); - u8 *phys_dist = NULL; - size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); - int max_emu_nid, dfl_phys_nid; - int i, j, ret; - - if (!emu_cmdline) - goto no_emu; - - memset(&ei, 0, sizeof(ei)); - pi = *numa_meminfo; - - for (i = 0; i < MAX_NUMNODES; i++) - emu_nid_to_phys[i] = NUMA_NO_NODE; - - /* - * If the numa=fake command-line contains a 'M' or 'G', it represents - * the fixed node size. Otherwise, if it is just a single number N, - * split the system RAM into N fake nodes. - */ - if (strchr(emu_cmdline, 'U')) { - nodemask_t physnode_mask = numa_nodes_parsed; - unsigned long n; - int nid = 0; - - n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); - ret = -1; - for_each_node_mask(i, physnode_mask) { - /* - * The reason we pass in blk[0] is due to - * numa_remove_memblk_from() called by - * emu_setup_memblk() will delete entry 0 - * and then move everything else up in the pi.blk - * array. Therefore we should always be looking - * at blk[0]. - */ - ret = split_nodes_size_interleave_uniform(&ei, &pi, - pi.blk[0].start, pi.blk[0].end, 0, - n, &pi.blk[0], nid); - if (ret < 0) - break; - if (ret < n) { - pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", - __func__, i, ret, n); - ret = -1; - break; - } - nid = ret; - } - } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { - u64 size; - - size = memparse(emu_cmdline, &emu_cmdline); - ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); - } else { - unsigned long n; - - n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); - ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); - } - if (*emu_cmdline == ':') - emu_cmdline++; - - if (ret < 0) - goto no_emu; - - if (numa_cleanup_meminfo(&ei) < 0) { - pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); - goto no_emu; - } - - /* copy the physical distance table */ - if (numa_dist_cnt) { - u64 phys; - - phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, - PFN_PHYS(max_pfn_mapped)); - if (!phys) { - pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); - goto no_emu; - } - phys_dist = __va(phys); - - for (i = 0; i < numa_dist_cnt; i++) - for (j = 0; j < numa_dist_cnt; j++) - phys_dist[i * numa_dist_cnt + j] = - node_distance(i, j); - } - - /* - * Determine the max emulated nid and the default phys nid to use - * for unmapped nodes. - */ - max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); - - /* commit */ - *numa_meminfo = ei; - - /* Make sure numa_nodes_parsed only contains emulated nodes */ - nodes_clear(numa_nodes_parsed); - for (i = 0; i < ARRAY_SIZE(ei.blk); i++) - if (ei.blk[i].start != ei.blk[i].end && - ei.blk[i].nid != NUMA_NO_NODE) - node_set(ei.blk[i].nid, numa_nodes_parsed); - - /* - * Transform __apicid_to_node table to use emulated nids by - * reverse-mapping phys_nid. The maps should always exist but fall - * back to zero just in case. - */ - for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { - if (__apicid_to_node[i] == NUMA_NO_NODE) - continue; - for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) - if (__apicid_to_node[i] == emu_nid_to_phys[j]) - break; - __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; - } - - /* make sure all emulated nodes are mapped to a physical node */ - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) - if (emu_nid_to_phys[i] == NUMA_NO_NODE) - emu_nid_to_phys[i] = dfl_phys_nid; - - /* transform distance table */ - numa_reset_distance(); - for (i = 0; i < max_emu_nid + 1; i++) { - for (j = 0; j < max_emu_nid + 1; j++) { - int physi = emu_nid_to_phys[i]; - int physj = emu_nid_to_phys[j]; - int dist; - - if (get_option(&emu_cmdline, &dist) == 2) - ; - else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) - dist = physi == physj ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - else - dist = phys_dist[physi * numa_dist_cnt + physj]; - - numa_set_distance(i, j, dist); - } - } - - /* free the copied physical distance table */ - memblock_free(phys_dist, phys_size); - return; - -no_emu: - /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) - emu_nid_to_phys[i] = i; -} - -#ifndef CONFIG_DEBUG_PER_CPU_MAPS -void numa_add_cpu(int cpu) -{ - int physnid, nid; - - nid = early_cpu_to_node(cpu); - BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); - - physnid = emu_nid_to_phys[nid]; - - /* - * Map the cpu to each emulated node that is allocated on the physical - * node of the cpu's apic id. - */ - for_each_online_node(nid) - if (emu_nid_to_phys[nid] == physnid) - cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); -} - -void numa_remove_cpu(int cpu) -{ - int i; - - for_each_online_node(i) - cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); -} -#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void numa_set_cpumask(int cpu, bool enable) -{ - int nid, physnid; - - nid = early_cpu_to_node(cpu); - if (nid == NUMA_NO_NODE) { - /* early_cpu_to_node() already emits a warning and trace */ - return; - } - - physnid = emu_nid_to_phys[nid]; - - for_each_online_node(nid) { - if (emu_nid_to_phys[nid] != physnid) - continue; - - debug_cpumask_set_cpu(cpu, nid, enable); - } -} - -void numa_add_cpu(int cpu) -{ - numa_set_cpumask(cpu, true); -} - -void numa_remove_cpu(int cpu) -{ - numa_set_cpumask(cpu, false); -} -#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index 86860f279662..11e1ff370c10 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -5,30 +5,6 @@ #include <linux/types.h> #include <asm/numa.h> -struct numa_memblk { - u64 start; - u64 end; - int nid; -}; - -struct numa_meminfo { - int nr_blks; - struct numa_memblk blk[NR_NODE_MEMBLKS]; -}; - -void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); -int __init numa_cleanup_meminfo(struct numa_meminfo *mi); -void __init numa_reset_distance(void); - void __init x86_numa_init(void); -#ifdef CONFIG_NUMA_EMU -void __init numa_emulation(struct numa_meminfo *numa_meminfo, - int numa_dist_cnt); -#else -static inline void numa_emulation(struct numa_meminfo *numa_meminfo, - int numa_dist_cnt) -{ } -#endif - #endif /* __X86_MM_NUMA_INTERNAL_H */ diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index bdc2a240c2aa..f73b5ce270b3 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -104,7 +104,7 @@ __setup("debugpat", pat_debug_setup); #ifdef CONFIG_X86_PAT /* - * X86 PAT uses page flags arch_1 and uncached together to keep track of + * X86 PAT uses page flags arch_1 and arch_2 together to keep track of * memory type of pages that have backing page struct. * * X86 PAT supports 4 different memory types: @@ -118,9 +118,9 @@ __setup("debugpat", pat_debug_setup); #define _PGMT_WB 0 #define _PGMT_WC (1UL << PG_arch_1) -#define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) -#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_UC_MINUS (1UL << PG_arch_2) +#define _PGMT_WT (1UL << PG_arch_2 | 1UL << PG_arch_1) +#define _PGMT_MASK (1UL << PG_arch_2 | 1UL << PG_arch_1) #define _PGMT_CLEAR_MASK (~_PGMT_MASK) static inline enum page_cache_mode get_page_memtype(struct page *pg) @@ -951,23 +951,20 @@ static void free_pfn_range(u64 paddr, unsigned long size) static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, resource_size_t *phys) { - pte_t *ptep, pte; - spinlock_t *ptl; + struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start }; - if (follow_pte(vma, vma->vm_start, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) return -EINVAL; - pte = ptep_get(ptep); - /* Never return PFNs of anon folios in COW mappings. */ - if (vm_normal_folio(vma, vma->vm_start, pte)) { - pte_unmap_unlock(ptep, ptl); + if (!args.special) { + follow_pfnmap_end(&args); return -EINVAL; } - *prot = pgprot_val(pte_pgprot(pte)); - *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - pte_unmap_unlock(ptep, ptl); + *prot = pgprot_val(args.pgprot); + *phys = (resource_size_t)args.pfn << PAGE_SHIFT; + follow_pfnmap_end(&args); return 0; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index f5931499c2d6..5745a354a241 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -641,6 +641,18 @@ pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, } #endif +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + VM_WARN_ON_ONCE(!pud_present(*pudp)); + pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return old; +} +#endif + /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve @@ -926,3 +938,9 @@ void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pmd_shstk(pmd)); } + +void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) +{ + /* See note in arch_check_zapped_pte() */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud)); +} diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 76d9f6ce7a3d..f238f7b33cdd 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -52,8 +52,11 @@ subsys_initcall(init_vdso); int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - int err; + struct vm_area_struct *vma; struct mm_struct *mm = current->mm; + static struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + }; if (!vdso_enabled) return 0; @@ -61,12 +64,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (mmap_write_lock_killable(mm)) return -EINTR; - err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, + vdso_mapping.pages = vdsop; + vma = _install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdsop); + &vdso_mapping); mmap_write_unlock(mm); - return err; + return IS_ERR(vma) ? PTR_ERR(vma) : 0; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 839e6613753d..55a4996d0c04 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -665,7 +665,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; -#if USE_SPLIT_PTE_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) ptl = ptlock_ptr(page_ptdesc(page)); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif @@ -1553,7 +1553,8 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, __set_pfn_prot(pfn, PAGE_KERNEL_RO); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) + if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS) && + !pinned) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); xen_mc_issue(XEN_LAZY_MMU); @@ -1581,7 +1582,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level) if (pinned) { xen_mc_batch(); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS)) __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); __set_pfn_prot(pfn, PAGE_KERNEL); diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index b3c2450d6f23..dc54f854c2f5 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -55,7 +55,8 @@ asmlinkage long xtensa_fadvise64_64(int fd, int advice, #ifdef CONFIG_MMU unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { struct vm_area_struct *vmm; struct vma_iterator vmi; |