Diffstat (limited to 'arch/riscv/mm/init.c')
 -rw-r--r--  arch/riscv/mm/init.c | 1037
 1 file changed, 816 insertions, 221 deletions
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 478d6763a01a..addb8a9305be 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -20,17 +20,25 @@ #include <linux/dma-map-ops.h> #include <linux/crash_dump.h> #include <linux/hugetlb.h> +#include <linux/kfence.h> +#include <linux/execmem.h> +#include <asm/alternative.h> #include <asm/fixmap.h> -#include <asm/tlbflush.h> -#include <asm/sections.h> -#include <asm/soc.h> #include <asm/io.h> -#include <asm/ptdump.h> +#include <asm/kasan.h> +#include <asm/module.h> #include <asm/numa.h> +#include <asm/pgtable.h> +#include <asm/sections.h> +#include <asm/soc.h> +#include <asm/sparsemem.h> +#include <asm/tlbflush.h> #include "../kernel/head.h" +u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1]; + struct kernel_mapping kernel_map __ro_after_init; EXPORT_SYMBOL(kernel_map); #ifdef CONFIG_XIP_KERNEL @@ -44,24 +52,32 @@ u64 satp_mode __ro_after_init = SATP_MODE_32; #endif EXPORT_SYMBOL(satp_mode); -bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL); -bool pgtable_l5_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL); +#ifdef CONFIG_64BIT +bool pgtable_l4_enabled __ro_after_init = !IS_ENABLED(CONFIG_XIP_KERNEL); +bool pgtable_l5_enabled __ro_after_init = !IS_ENABLED(CONFIG_XIP_KERNEL); EXPORT_SYMBOL(pgtable_l4_enabled); EXPORT_SYMBOL(pgtable_l5_enabled); +#endif phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); +#ifdef CONFIG_SPARSEMEM_VMEMMAP +#define VMEMMAP_ADDR_ALIGN (1ULL << SECTION_SIZE_BITS) + +unsigned long vmemmap_start_pfn __ro_after_init; +EXPORT_SYMBOL(vmemmap_start_pfn); +#endif + unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); extern char _start[]; -#define DTB_EARLY_BASE_VA PGDIR_SIZE void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; -static phys_addr_t dma32_phys_limit __initdata; +phys_addr_t dma32_phys_limit __initdata; static void __init zone_sizes_init(void) { @@ -146,7 +162,7 @@ static void __init print_vm_layout(void) print_ml("kasan", KASAN_SHADOW_START, KASAN_SHADOW_END); #endif - print_ml("kernel", (unsigned long)KERNEL_LINK_ADDR, + print_ml("kernel", (unsigned long)kernel_map.virt_addr, (unsigned long)ADDRESS_SPACE_END); } } @@ -154,20 +170,36 @@ static void __init print_vm_layout(void) static void print_vm_layout(void) { } #endif /* CONFIG_DEBUG_VM */ -void __init mem_init(void) +void __init arch_mm_preinit(void) { + bool swiotlb = max_pfn > PFN_DOWN(dma32_phys_limit); #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif /* CONFIG_FLATMEM */ - swiotlb_init(max_pfn > PFN_DOWN(dma32_phys_limit), SWIOTLB_VERBOSE); - memblock_free_all(); + if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb && + dma_cache_alignment != 1) { + /* + * If no bouncing needed for ZONE_DMA, allocate 1MB swiotlb + * buffer per 1GB of RAM for kmalloc() bouncing on + * non-coherent platforms. + */ + unsigned long size = + DIV_ROUND_UP(memblock_phys_mem_size(), 1024); + swiotlb_adjust_size(min(swiotlb_size_or_default(), size)); + swiotlb = true; + } + + swiotlb_init(swiotlb, SWIOTLB_VERBOSE); print_vm_layout(); } /* Limit the memory size via mem. 
*/ static phys_addr_t memory_limit; +#ifdef CONFIG_XIP_KERNEL +#define memory_limit (*(phys_addr_t *)XIP_FIXUP(&memory_limit)) +#endif /* CONFIG_XIP_KERNEL */ static int __init early_mem(char *p) { @@ -210,82 +242,132 @@ static void __init setup_bootmem(void) */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); - phys_ram_end = memblock_end_of_DRAM(); - if (!IS_ENABLED(CONFIG_XIP_KERNEL)) - phys_ram_base = memblock_start_of_DRAM(); /* - * memblock allocator is not aware of the fact that last 4K bytes of - * the addressable memory can not be mapped because of IS_ERR_VALUE - * macro. Make sure that last 4k bytes are not usable by memblock - * if end of dram is equal to maximum addressable memory. For 64-bit - * kernel, this problem can't happen here as the end of the virtual - * address space is occupied by the kernel mapping then this check must - * be done as soon as the kernel mapping base address is determined. + * Make sure we align the start of the memory on a PMD boundary so that + * at worst, we map the linear mapping with PMD mappings. + */ + if (!IS_ENABLED(CONFIG_XIP_KERNEL)) { + phys_ram_base = memblock_start_of_DRAM() & PMD_MASK; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_start_pfn = round_down(phys_ram_base, VMEMMAP_ADDR_ALIGN) >> PAGE_SHIFT; +#endif + } + + /* + * In 64-bit, any use of __va/__pa before this point is wrong as we + * did not know the start of DRAM before. + */ + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU)) + kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base; + + /* + * The size of the linear page mapping may restrict the amount of + * usable RAM. + */ + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU)) { + max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE; + if (memblock_end_of_DRAM() > max_mapped_addr) { + memblock_cap_memory_range(phys_ram_base, + max_mapped_addr - phys_ram_base); + pr_warn("Physical memory overflows the linear mapping size: region above %pa removed", + &max_mapped_addr); + } + } + + /* + * Reserve physical address space that would be mapped to virtual + * addresses greater than (void *)(-PAGE_SIZE) because: + * - This memory would overlap with ERR_PTR + * - This memory belongs to high memory, which is not supported + * + * This is not applicable to 64-bit kernel, because virtual addresses + * after (void *)(-PAGE_SIZE) are not linearly mapped: they are + * occupied by kernel mapping. Also it is unrealistic for high memory + * to exist on 64-bit platforms. */ if (!IS_ENABLED(CONFIG_64BIT)) { - max_mapped_addr = __pa(~(ulong)0); - if (max_mapped_addr == (phys_ram_end - 1)) - memblock_set_current_limit(max_mapped_addr - 4096); + max_mapped_addr = __va_to_pa_nodebug(-PAGE_SIZE); + memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr); } + phys_ram_end = memblock_end_of_DRAM(); min_low_pfn = PFN_UP(phys_ram_base); max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end); - high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); - set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); reserve_initrd_mem(); + + /* + * No allocation should be done before reserving the memory as defined + * in the device tree, otherwise the allocation could end up in a + * reserved region. + */ + early_init_fdt_scan_reserved_mem(); + /* * If DTB is built in, no need to reserve its memblock. 
* Otherwise, do reserve it but avoid using * early_init_fdt_reserve_self() since __pa() does * not work for DTB pointers that are fixmap addresses */ - if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) { - /* - * In case the DTB is not located in a memory region we won't - * be able to locate it later on via the linear mapping and - * get a segfault when accessing it via __va(dtb_early_pa). - * To avoid this situation copy DTB to a memory region. - * Note that memblock_phys_alloc will also reserve DTB region. - */ - if (!memblock_is_memory(dtb_early_pa)) { - size_t fdt_size = fdt_totalsize(dtb_early_va); - phys_addr_t new_dtb_early_pa = memblock_phys_alloc(fdt_size, PAGE_SIZE); - void *new_dtb_early_va = early_memremap(new_dtb_early_pa, fdt_size); - - memcpy(new_dtb_early_va, dtb_early_va, fdt_size); - early_memunmap(new_dtb_early_va, fdt_size); - _dtb_early_pa = new_dtb_early_pa; - } else - memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); - } + if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); dma_contiguous_reserve(dma32_phys_limit); if (IS_ENABLED(CONFIG_64BIT)) hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); - memblock_allow_resize(); } -#ifdef CONFIG_MMU -struct pt_alloc_ops pt_ops __initdata; +#ifdef CONFIG_RELOCATABLE +extern unsigned long __rela_dyn_start, __rela_dyn_end; + +static void __init relocate_kernel(void) +{ + Elf_Rela *rela = (Elf_Rela *)&__rela_dyn_start; + /* + * This holds the offset between the linked virtual address and the + * relocated virtual address. + */ + uintptr_t reloc_offset = kernel_map.virt_addr - KERNEL_LINK_ADDR; + /* + * This holds the offset between kernel linked virtual address and + * physical address. + */ + uintptr_t va_kernel_link_pa_offset = KERNEL_LINK_ADDR - kernel_map.phys_addr; + + for ( ; rela < (Elf_Rela *)&__rela_dyn_end; rela++) { + Elf_Addr addr = (rela->r_offset - va_kernel_link_pa_offset); + Elf_Addr relocated_addr = rela->r_addend; + + if (rela->r_info != R_RISCV_RELATIVE) + continue; + + /* + * Make sure to not relocate vdso symbols like rt_sigreturn + * which are linked from the address 0 in vmlinux since + * vdso symbol addresses are actually used as an offset from + * mm->context.vdso in VDSO_OFFSET macro. 
+ */ + if (relocated_addr >= KERNEL_LINK_ADDR) + relocated_addr += reloc_offset; + + *(Elf_Addr *)addr = relocated_addr; + } +} +#endif /* CONFIG_RELOCATABLE */ -unsigned long riscv_pfn_base __ro_after_init; -EXPORT_SYMBOL(riscv_pfn_base); +#ifdef CONFIG_MMU +struct pt_alloc_ops pt_ops __meminitdata; pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); -static p4d_t __maybe_unused early_dtb_p4d[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); -static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) -#define riscv_pfn_base (*(unsigned long *)XIP_FIXUP(&riscv_pfn_base)) #define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir)) #define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte)) #define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir)) @@ -299,7 +381,7 @@ static const pgprot_t protection_map[16] = { [VM_EXEC] = PAGE_EXEC, [VM_EXEC | VM_READ] = PAGE_READ_EXEC, [VM_EXEC | VM_WRITE] = PAGE_COPY_EXEC, - [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_READ_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_EXEC, [VM_SHARED] = PAGE_NONE, [VM_SHARED | VM_READ] = PAGE_READ, [VM_SHARED | VM_WRITE] = PAGE_SHARED, @@ -338,7 +420,7 @@ static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t pa) return (pte_t *)set_fixmap_offset(FIX_PTE, pa); } -static inline pte_t *__init get_pte_virt_late(phys_addr_t pa) +static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa) { return (pte_t *) __va(pa); } @@ -357,19 +439,21 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t __init alloc_pte_late(uintptr_t va) +static phys_addr_t __meminit alloc_pte_late(uintptr_t va) { - unsigned long vaddr; - - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr))); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - return __pa(vaddr); + /* + * We do not know which mm the PTE page is associated to at this point. + * Passing NULL to the ctor is the safe option, though it may result + * in unnecessary work (e.g. initialising the ptlock for init_mm). 
+ */ + BUG_ON(!ptdesc || !pagetable_pte_ctor(NULL, ptdesc)); + return __pa((pte_t *)ptdesc_address(ptdesc)); } -static void __init create_pte_mapping(pte_t *ptep, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot) { uintptr_t pte_idx = pte_index(va); @@ -423,7 +507,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa) return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); } -static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) +static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa) { return (pmd_t *) __va(pa); } @@ -440,19 +524,18 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t __init alloc_pmd_late(uintptr_t va) +static phys_addr_t __meminit alloc_pmd_late(uintptr_t va) { - unsigned long vaddr; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr || !pgtable_pmd_page_ctor(virt_to_page(vaddr))); - - return __pa(vaddr); + /* See comment in alloc_pte_late() regarding NULL passed the ctor */ + BUG_ON(!ptdesc || !pagetable_pmd_ctor(NULL, ptdesc)); + return __pa((pmd_t *)ptdesc_address(ptdesc)); } -static void __init create_pmd_mapping(pmd_t *pmdp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_pmd_mapping(pmd_t *pmdp, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) { pte_t *ptep; phys_addr_t pte_phys; @@ -488,7 +571,7 @@ static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) return (pud_t *)set_fixmap_offset(FIX_PUD, pa); } -static pud_t *__init get_pud_virt_late(phys_addr_t pa) +static pud_t *__meminit get_pud_virt_late(phys_addr_t pa) { return (pud_t *)__va(pa); } @@ -506,13 +589,13 @@ static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t alloc_pud_late(uintptr_t va) +static phys_addr_t __meminit alloc_pud_late(uintptr_t va) { - unsigned long vaddr; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr); - return __pa(vaddr); + BUG_ON(!ptdesc); + pagetable_pud_ctor(ptdesc); + return __pa((pud_t *)ptdesc_address(ptdesc)); } static p4d_t *__init get_p4d_virt_early(phys_addr_t pa) @@ -526,7 +609,7 @@ static p4d_t *__init get_p4d_virt_fixmap(phys_addr_t pa) return (p4d_t *)set_fixmap_offset(FIX_P4D, pa); } -static p4d_t *__init get_p4d_virt_late(phys_addr_t pa) +static p4d_t *__meminit get_p4d_virt_late(phys_addr_t pa) { return (p4d_t *)__va(pa); } @@ -544,18 +627,17 @@ static phys_addr_t __init alloc_p4d_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t alloc_p4d_late(uintptr_t va) +static phys_addr_t __meminit alloc_p4d_late(uintptr_t va) { - unsigned long vaddr; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr); - return __pa(vaddr); + BUG_ON(!ptdesc); + pagetable_p4d_ctor(ptdesc); + return __pa((p4d_t *)ptdesc_address(ptdesc)); } -static void __init create_pud_mapping(pud_t *pudp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_pud_mapping(pud_t *pudp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot) { pmd_t *nextp; phys_addr_t next_phys; @@ -580,9 +662,8 @@ static void __init create_pud_mapping(pud_t *pudp, create_pmd_mapping(nextp, va, pa, sz, prot); 
} -static void __init create_p4d_mapping(p4d_t *p4dp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_p4d_mapping(p4d_t *p4dp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot) { pud_t *nextp; phys_addr_t next_phys; @@ -626,9 +707,6 @@ static void __init create_p4d_mapping(p4d_t *p4dp, #define trampoline_pgd_next (pgtable_l5_enabled ? \ (uintptr_t)trampoline_p4d : (pgtable_l4_enabled ? \ (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)) -#define early_dtb_pgd_next (pgtable_l5_enabled ? \ - (uintptr_t)early_dtb_p4d : (pgtable_l4_enabled ? \ - (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd)) #else #define pgd_next_t pte_t #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) @@ -636,15 +714,13 @@ static void __init create_p4d_mapping(p4d_t *p4dp, #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) #define fixmap_pgd_next ((uintptr_t)fixmap_pte) -#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) #define create_p4d_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) #define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) #endif /* __PAGETABLE_PMD_FOLDED */ -void __init create_pgd_mapping(pgd_t *pgdp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot) { pgd_next_t *nextp; phys_addr_t next_phys; @@ -669,11 +745,21 @@ void __init create_pgd_mapping(pgd_t *pgdp, create_pgd_next_mapping(nextp, va, pa, sz, prot); } -static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) +static uintptr_t __meminit best_map_size(phys_addr_t pa, uintptr_t va, phys_addr_t size) { - /* Upgrade to PMD_SIZE mappings whenever possible */ - base &= PMD_SIZE - 1; - if (!base && size >= PMD_SIZE) + if (debug_pagealloc_enabled()) + return PAGE_SIZE; + + if (pgtable_l5_enabled && + !(pa & (P4D_SIZE - 1)) && !(va & (P4D_SIZE - 1)) && size >= P4D_SIZE) + return P4D_SIZE; + + if (pgtable_l4_enabled && + !(pa & (PUD_SIZE - 1)) && !(va & (PUD_SIZE - 1)) && size >= PUD_SIZE) + return PUD_SIZE; + + if (IS_ENABLED(CONFIG_64BIT) && + !(pa & (PMD_SIZE - 1)) && !(va & (PMD_SIZE - 1)) && size >= PMD_SIZE) return PMD_SIZE; return PAGE_SIZE; @@ -695,7 +781,7 @@ asmlinkage void __init __copy_data(void) #endif #ifdef CONFIG_STRICT_KERNEL_RWX -static __init pgprot_t pgprot_from_va(uintptr_t va) +static __meminit pgprot_t pgprot_from_va(uintptr_t va) { if (is_va_kernel_text(va)) return PAGE_KERNEL_READ_EXEC; @@ -718,11 +804,9 @@ void mark_rodata_ro(void) if (IS_ENABLED(CONFIG_64BIT)) set_kernel_memory(lm_alias(__start_rodata), lm_alias(_data), set_memory_ro); - - debug_checkwx(); } #else -static __init pgprot_t pgprot_from_va(uintptr_t va) +static __meminit pgprot_t pgprot_from_va(uintptr_t va) { if (IS_ENABLED(CONFIG_64BIT) && !is_kernel_mapping(va)) return PAGE_KERNEL; @@ -732,6 +816,9 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) #endif /* CONFIG_STRICT_KERNEL_RWX */ #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) +u64 __pi_set_satp_mode_from_cmdline(uintptr_t dtb_pa); +u64 __pi_set_satp_mode_from_fdt(uintptr_t dtb_pa); + static void __init disable_pgtable_l5(void) { pgtable_l5_enabled = false; @@ -746,17 +833,50 @@ static void __init disable_pgtable_l4(void) satp_mode = SATP_MODE_39; } +static int __init print_no4lvl(char *p) +{ + 
pr_info("Disabled 4-level and 5-level paging"); + return 0; +} +early_param("no4lvl", print_no4lvl); + +static int __init print_no5lvl(char *p) +{ + pr_info("Disabled 5-level paging"); + return 0; +} +early_param("no5lvl", print_no5lvl); + +static void __init set_mmap_rnd_bits_max(void) +{ + mmap_rnd_bits_max = MMAP_VA_BITS - PAGE_SHIFT - 3; +} + /* * There is a simple way to determine if 4-level is supported by the * underlying hardware: establish 1:1 mapping in 4-level page table mode * then read SATP to see if the configuration was taken into account * meaning sv48 is supported. + * The maximum SATP mode is limited by both the command line and the "mmu-type" + * property in the device tree, since some platforms may hang if an unsupported + * SATP mode is attempted. */ -static __init void set_satp_mode(void) +static __init void set_satp_mode(uintptr_t dtb_pa) { u64 identity_satp, hw_satp; uintptr_t set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; - bool check_l4 = false; + u64 satp_mode_limit = min_not_zero(__pi_set_satp_mode_from_cmdline(dtb_pa), + __pi_set_satp_mode_from_fdt(dtb_pa)); + + kernel_map.page_offset = PAGE_OFFSET_L5; + + if (satp_mode_limit == SATP_MODE_48) { + disable_pgtable_l5(); + } else if (satp_mode_limit == SATP_MODE_39) { + disable_pgtable_l5(); + disable_pgtable_l4(); + return; + } create_p4d_mapping(early_p4d, set_satp_mode_pmd, (uintptr_t)early_pud, @@ -775,7 +895,8 @@ static __init void set_satp_mode(void) retry: create_pgd_mapping(early_pg_dir, set_satp_mode_pmd, - check_l4 ? (uintptr_t)early_pud : (uintptr_t)early_p4d, + pgtable_l5_enabled ? + (uintptr_t)early_p4d : (uintptr_t)early_pud, PGDIR_SIZE, PAGE_TABLE); identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; @@ -786,9 +907,8 @@ retry: local_flush_tlb_all(); if (hw_satp != identity_satp) { - if (!check_l4) { + if (pgtable_l5_enabled) { disable_pgtable_l5(); - check_l4 = true; memset(early_pg_dir, 0, PAGE_SIZE); goto retry; } @@ -824,7 +944,7 @@ retry: static void __init create_kernel_page_table(pgd_t *pgdir, __always_unused bool early) { - uintptr_t va, end_va; + uintptr_t va, start_va, end_va; /* Map the flash resident part */ end_va = kernel_map.virt_addr + kernel_map.xiprom_sz; @@ -834,10 +954,11 @@ static void __init create_kernel_page_table(pgd_t *pgdir, PMD_SIZE, PAGE_KERNEL_EXEC); /* Map the data in RAM */ - end_va = kernel_map.virt_addr + XIP_OFFSET + kernel_map.size; - for (va = kernel_map.virt_addr + XIP_OFFSET; va < end_va; va += PMD_SIZE) + start_va = kernel_map.virt_addr + (uintptr_t)&_sdata - (uintptr_t)&_start; + end_va = kernel_map.virt_addr + kernel_map.size; + for (va = start_va; va < end_va; va += PMD_SIZE) create_pgd_mapping(pgdir, va, - kernel_map.phys_addr + (va - (kernel_map.virt_addr + XIP_OFFSET)), + kernel_map.phys_addr + (va - start_va), PMD_SIZE, PAGE_KERNEL); } #else @@ -860,32 +981,27 @@ static void __init create_kernel_page_table(pgd_t *pgdir, bool early) * this means 2 PMD entries whereas for 32-bit kernel, this is only 1 PGDIR * entry. */ -static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) +static void __init create_fdt_early_page_table(uintptr_t fix_fdt_va, + uintptr_t dtb_pa) { #ifndef CONFIG_BUILTIN_DTB uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); - create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, - IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, - PGDIR_SIZE, - IS_ENABLED(CONFIG_64BIT) ? 
PAGE_TABLE : PAGE_KERNEL); + /* Make sure the fdt fixmap address is always aligned on PMD size */ + BUILD_BUG_ON(FIX_FDT % (PMD_SIZE / PAGE_SIZE)); - if (pgtable_l5_enabled) - create_p4d_mapping(early_dtb_p4d, DTB_EARLY_BASE_VA, - (uintptr_t)early_dtb_pud, P4D_SIZE, PAGE_TABLE); - - if (pgtable_l4_enabled) - create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, - (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); - - if (IS_ENABLED(CONFIG_64BIT)) { - create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, + /* In 32-bit only, the fdt lies in its own PGD */ + if (!IS_ENABLED(CONFIG_64BIT)) { + create_pgd_mapping(early_pg_dir, fix_fdt_va, + pa, MAX_FDT_SIZE, PAGE_KERNEL); + } else { + create_pmd_mapping(fixmap_pmd, fix_fdt_va, pa, PMD_SIZE, PAGE_KERNEL); - create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE, + create_pmd_mapping(fixmap_pmd, fix_fdt_va + PMD_SIZE, pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); } - dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); + dtb_early_va = (void *)fix_fdt_va + (dtb_pa & (PMD_SIZE - 1)); #else /* * For 64-bit kernel, __va can't be used since it would return a linear @@ -893,7 +1009,7 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) * setup_vm_final installs the linear mapping. For 32-bit kernel, as the * kernel is mapped in the linear mapping, that makes no difference. */ - dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa)); + dtb_early_va = kernel_mapping_pa_to_va(dtb_pa); #endif dtb_early_pa = dtb_pa; @@ -957,43 +1073,90 @@ static void __init pt_ops_set_late(void) #endif } +#ifdef CONFIG_RANDOMIZE_BASE +extern bool __init __pi_set_nokaslr_from_cmdline(uintptr_t dtb_pa); +extern u64 __init __pi_get_kaslr_seed(uintptr_t dtb_pa); +extern u64 __init __pi_get_kaslr_seed_zkr(const uintptr_t dtb_pa); + +static int __init print_nokaslr(char *p) +{ + pr_info("Disabled KASLR"); + return 0; +} +early_param("nokaslr", print_nokaslr); + +unsigned long kaslr_offset(void) +{ + return kernel_map.virt_offset; +} +#endif + asmlinkage void __init setup_vm(uintptr_t dtb_pa) { pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; - kernel_map.virt_addr = KERNEL_LINK_ADDR; - kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); +#ifdef CONFIG_RANDOMIZE_BASE + if (!__pi_set_nokaslr_from_cmdline(dtb_pa)) { + u64 kaslr_seed = __pi_get_kaslr_seed_zkr(dtb_pa); + u32 kernel_size = (uintptr_t)(&_end) - (uintptr_t)(&_start); + u32 nr_pos; + + if (kaslr_seed == 0) + kaslr_seed = __pi_get_kaslr_seed(dtb_pa); + /* + * Compute the number of positions available: we are limited + * by the early page table that only has one PUD and we must + * be aligned on PMD_SIZE. 
+ */ + nr_pos = (PUD_SIZE - kernel_size) / PMD_SIZE; + + kernel_map.virt_offset = (kaslr_seed % nr_pos) * PMD_SIZE; + } +#endif + + kernel_map.virt_addr = KERNEL_LINK_ADDR + kernel_map.virt_offset; #ifdef CONFIG_XIP_KERNEL kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; kernel_map.xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom); phys_ram_base = CONFIG_PHYS_RAM_BASE; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_start_pfn = round_down(phys_ram_base, VMEMMAP_ADDR_ALIGN) >> PAGE_SHIFT; +#endif kernel_map.phys_addr = (uintptr_t)CONFIG_PHYS_RAM_BASE; - kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_sdata); + kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_start); - kernel_map.va_kernel_xip_pa_offset = kernel_map.virt_addr - kernel_map.xiprom; + kernel_map.va_kernel_xip_text_pa_offset = kernel_map.virt_addr - kernel_map.xiprom; + kernel_map.va_kernel_xip_data_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr + + (uintptr_t)&_sdata - (uintptr_t)&_start; #else kernel_map.phys_addr = (uintptr_t)(&_start); kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; + kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; #endif #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) - set_satp_mode(); + set_satp_mode(dtb_pa); + set_mmap_rnd_bits_max(); #endif - kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; - kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; - - riscv_pfn_base = PFN_DOWN(kernel_map.phys_addr); - /* - * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit - * kernel, whereas for 64-bit kernel, the end of the virtual address - * space is occupied by the modules/BPF/kernel mappings which reduces - * the available size of the linear mapping. + * In 64-bit, we defer the setup of va_pa_offset to setup_bootmem, + * where we have the system memory layout: this allows us to align + * the physical and virtual mappings and then make use of PUD/P4D/PGD + * for the linear mapping. This is only possible because the kernel + * mapping lies outside the linear mapping. + * In 32-bit however, as the kernel resides in the linear mapping, + * setup_vm_final can not change the mapping established here, + * otherwise the same kernel addresses would get mapped to different + * physical addresses (if the start of dram is different from the + * kernel physical address start). */ - memory_limit = KERN_VIRT_SIZE - (IS_ENABLED(CONFIG_64BIT) ? SZ_4G : 0); + kernel_map.va_pa_offset = IS_ENABLED(CONFIG_64BIT) ? + 0UL : PAGE_OFFSET - kernel_map.phys_addr; + + memory_limit = KERN_VIRT_SIZE; /* Sanity check alignment and size */ BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); @@ -1007,6 +1170,18 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) BUG_ON((kernel_map.virt_addr + kernel_map.size) > ADDRESS_SPACE_END - SZ_4K); #endif +#ifdef CONFIG_RELOCATABLE + /* + * Early page table uses only one PUD, which makes it possible + * to map PUD_SIZE aligned on PUD_SIZE: if the relocation offset + * makes the kernel cross over a PUD_SIZE boundary, raise a bug + * since a part of the kernel would not get mapped. 
+ */ + if (IS_ENABLED(CONFIG_64BIT)) + BUG_ON(PUD_SIZE - (kernel_map.virt_addr & (PUD_SIZE - 1)) < kernel_map.size); + relocate_kernel(); +#endif + apply_early_boot_alternatives(); pt_ops_set_early(); @@ -1055,7 +1230,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_kernel_page_table(early_pg_dir, true); /* Setup early mapping for FDT early scan */ - create_fdt_early_page_table(early_pg_dir, dtb_pa); + create_fdt_early_page_table(__fix_to_virt(FIX_FDT), dtb_pa); /* * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap @@ -1090,16 +1265,51 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pt_ops_set_fixmap(); } -static void __init setup_vm_final(void) +static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t end, + uintptr_t fixed_map_size, const pgprot_t *pgprot) { + phys_addr_t pa; uintptr_t va, map_size; - phys_addr_t pa, start, end; + + for (pa = start; pa < end; pa += map_size) { + va = (uintptr_t)__va(pa); + map_size = fixed_map_size ? fixed_map_size : + best_map_size(pa, va, end - pa); + + create_pgd_mapping(swapper_pg_dir, va, pa, map_size, + pgprot ? *pgprot : pgprot_from_va(va)); + } +} + +static void __init create_linear_mapping_page_table(void) +{ + phys_addr_t start, end; + phys_addr_t kfence_pool __maybe_unused; u64 i; - /* Setup swapper PGD for fixmap */ - create_pgd_mapping(swapper_pg_dir, FIXADDR_START, - __pa_symbol(fixmap_pgd_next), - PGDIR_SIZE, PAGE_TABLE); +#ifdef CONFIG_STRICT_KERNEL_RWX + phys_addr_t ktext_start = __pa_symbol(_start); + phys_addr_t ktext_size = __init_data_begin - _start; + phys_addr_t krodata_start = __pa_symbol(__start_rodata); + phys_addr_t krodata_size = _data - __start_rodata; + + /* Isolate kernel text and rodata so they don't get mapped with a PUD */ + memblock_mark_nomap(ktext_start, ktext_size); + memblock_mark_nomap(krodata_start, krodata_size); +#endif + +#ifdef CONFIG_KFENCE + /* + * kfence pool must be backed by PAGE_SIZE mappings, so allocate it + * before we setup the linear mapping so that we avoid using hugepages + * for this region. 
+ */ + kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + BUG_ON(!kfence_pool); + + memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); + __kfence_pool = __va(kfence_pool); +#endif /* Map all memory banks in the linear mapping */ for_each_mem_range(i, &start, &end) { @@ -1108,18 +1318,45 @@ static void __init setup_vm_final(void) if (start <= __pa(PAGE_OFFSET) && __pa(PAGE_OFFSET) < end) start = __pa(PAGE_OFFSET); - if (end >= __pa(PAGE_OFFSET) + memory_limit) - end = __pa(PAGE_OFFSET) + memory_limit; - - for (pa = start; pa < end; pa += map_size) { - va = (uintptr_t)__va(pa); - map_size = best_map_size(pa, end - pa); - create_pgd_mapping(swapper_pg_dir, va, pa, map_size, - pgprot_from_va(va)); - } + create_linear_mapping_range(start, end, 0, NULL); } +#ifdef CONFIG_STRICT_KERNEL_RWX + create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0, NULL); + create_linear_mapping_range(krodata_start, krodata_start + krodata_size, 0, NULL); + + memblock_clear_nomap(ktext_start, ktext_size); + memblock_clear_nomap(krodata_start, krodata_size); +#endif + +#ifdef CONFIG_KFENCE + create_linear_mapping_range(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, PAGE_SIZE, NULL); + + memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); +#endif +} + +static void __init setup_vm_final(void) +{ + /* Setup swapper PGD for fixmap */ +#if !defined(CONFIG_64BIT) + /* + * In 32-bit, the device tree lies in a pgd entry, so it must be copied + * directly in swapper_pg_dir in addition to the pgd entry that points + * to fixmap_pte. + */ + unsigned long idx = pgd_index(__fix_to_virt(FIX_FDT)); + + set_pgd(&swapper_pg_dir[idx], early_pg_dir[idx]); +#endif + create_pgd_mapping(swapper_pg_dir, FIXADDR_START, + __pa_symbol(fixmap_pgd_next), + PGDIR_SIZE, PAGE_TABLE); + + /* Map the linear mapping */ + create_linear_mapping_page_table(); + /* Map the kernel */ if (IS_ENABLED(CONFIG_64BIT)) create_kernel_page_table(swapper_pg_dir, false); @@ -1145,6 +1382,12 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) { dtb_early_va = (void *)dtb_pa; dtb_early_pa = dtb_pa; + +#ifdef CONFIG_RELOCATABLE + kernel_map.virt_addr = (uintptr_t)_start; + kernel_map.phys_addr = (uintptr_t)_start; + relocate_kernel(); +#endif } static inline void setup_vm_final(void) @@ -1159,71 +1402,32 @@ static inline void setup_vm_final(void) * line parameter. The memory reserved is used by dump capture kernel when * primary kernel is crashing. */ -static void __init reserve_crashkernel(void) +static void __init arch_reserve_crashkernel(void) { - unsigned long long crash_base = 0; - unsigned long long crash_size = 0; - unsigned long search_start = memblock_start_of_DRAM(); - unsigned long search_end = memblock_end_of_DRAM(); - - int ret = 0; + unsigned long long low_size = 0; + unsigned long long crash_base, crash_size; + bool high = false; + int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) - return; - /* - * Don't reserve a region for a crash kernel on a crash kernel - * since it doesn't make much sense and we have limited memory - * resources. 
- */ - if (is_kdump_kernel()) { - pr_info("crashkernel: ignoring reservation request\n"); + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; - } ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &crash_size, &crash_base); - if (ret || !crash_size) + &crash_size, &crash_base, + &low_size, NULL, &high); + if (ret) return; - crash_size = PAGE_ALIGN(crash_size); - - if (crash_base) { - search_start = crash_base; - search_end = crash_base + crash_size; - } - - /* - * Current riscv boot protocol requires 2MB alignment for - * RV64 and 4MB alignment for RV32 (hugepage size) - * - * Try to alloc from 32bit addressible physical memory so that - * swiotlb can work on the crash kernel. - */ - crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, - search_start, - min(search_end, (unsigned long) SZ_4G)); - if (crash_base == 0) { - /* Try again without restricting region to 32bit addressible memory */ - crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, - search_start, search_end); - if (crash_base == 0) { - pr_warn("crashkernel: couldn't allocate %lldKB\n", - crash_size >> 10); - return; - } - } - - pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", - crash_base, crash_base + crash_size, crash_size >> 20); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; + reserve_crashkernel_generic(crash_size, crash_base, low_size, high); } void __init paging_init(void) { setup_bootmem(); setup_vm_final(); + + /* Depend on that Linear Mapping is ready */ + memblock_allow_resize(); } void __init misc_mem_init(void) @@ -1231,15 +1435,406 @@ void __init misc_mem_init(void) early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); arch_numa_init(); sparse_init(); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* The entire VMEMMAP region has been populated. Flush TLB for this region */ + local_flush_tlb_kernel_range(VMEMMAP_START, VMEMMAP_END); +#endif zone_sizes_init(); - reserve_crashkernel(); + arch_reserve_crashkernel(); memblock_dump_all(); } #ifdef CONFIG_SPARSEMEM_VMEMMAP +void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) +{ + pmd_set_huge(pmd, virt_to_phys(p), PAGE_KERNEL); +} + +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + vmemmap_verify((pte_t *)pmdp, node, addr, next); + return 1; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { - return vmemmap_populate_basepages(start, end, node, NULL); + /* + * Note that SPARSEMEM_VMEMMAP is only selected for rv64 and that we + * can't use hugepage mappings for 2-level page table because in case of + * memory hotplug, we are not able to update all the page tables with + * the new PMDs. + */ + return vmemmap_populate_hugepages(start, end, node, altmap); } #endif + +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +/* + * Pre-allocates page-table pages for a specific area in the kernel + * page-table. Only the level which needs to be synchronized between + * all page-tables is allocated because the synchronization can be + * expensive. 
+ */ +static void __init preallocate_pgd_pages_range(unsigned long start, unsigned long end, + const char *area) +{ + unsigned long addr; + const char *lvl; + + for (addr = start; addr < end && addr >= start; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + lvl = "p4d"; + p4d = p4d_alloc(&init_mm, pgd, addr); + if (!p4d) + goto failed; + + if (pgtable_l5_enabled) + continue; + + lvl = "pud"; + pud = pud_alloc(&init_mm, p4d, addr); + if (!pud) + goto failed; + + if (pgtable_l4_enabled) + continue; + + lvl = "pmd"; + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + goto failed; + } + return; + +failed: + /* + * The pages have to be there now or they will be missing in + * process page-tables later. + */ + panic("Failed to pre-allocate %s pages for %s area\n", lvl, area); +} + +#define PAGE_END KASAN_SHADOW_START + +void __init pgtable_cache_init(void) +{ + preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc"); + if (IS_ENABLED(CONFIG_MODULES)) + preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, "bpf/modules"); + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { + preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, "vmemmap"); + preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct map"); + if (IS_ENABLED(CONFIG_KASAN)) + preallocate_pgd_pages_range(KASAN_SHADOW_START, KASAN_SHADOW_END, "kasan"); + } +} +#endif + +#ifdef CONFIG_EXECMEM +#ifdef CONFIG_MMU +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) +{ + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = 1, + }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end = VMALLOC_END, + .pgprot = PAGE_KERNEL_READ_EXEC, + .alignment = 1, + }, + [EXECMEM_BPF] = { + .start = BPF_JIT_REGION_START, + .end = BPF_JIT_REGION_END, + .pgprot = PAGE_KERNEL, + .alignment = PAGE_SIZE, + }, + }, + }; + + return &execmem_info; +} +#endif /* CONFIG_MMU */ +#endif /* CONFIG_EXECMEM */ + +#ifdef CONFIG_MEMORY_HOTPLUG +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + struct page *page = pmd_page(*pmd); + struct ptdesc *ptdesc = page_ptdesc(page); + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pagetable_dtor(ptdesc); + if (PageReserved(page)) + free_reserved_page(page); + else + pagetable_free(ptdesc); + pmd_clear(pmd); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool is_vmemmap) +{ + struct page *page = pud_page(*pud); + struct ptdesc *ptdesc = page_ptdesc(page); + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + if (!is_vmemmap) + pagetable_dtor(ptdesc); + if (PageReserved(page)) + free_reserved_page(page); + else + pagetable_free(ptdesc); + pud_clear(pud); +} + +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + struct page *page = p4d_page(*p4d); + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + if (PageReserved(page)) + free_reserved_page(page); + else + __free_pages(page, 0); + p4d_clear(p4d); +} + +static void __meminit free_vmemmap_storage(struct page *page, size_t size, + struct vmem_altmap *altmap) +{ + int order = get_order(size); + + if (altmap) { + vmem_altmap_free(altmap, size >> 
PAGE_SHIFT); + return; + } + + if (PageReserved(page)) { + unsigned int nr_pages = 1 << order; + + while (nr_pages--) + free_reserved_page(page++); + return; + } + + __free_pages(page, order); +} + +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end, + bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pte_t *ptep, pte; + + for (; addr < end; addr = next) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + ptep = pte_base + pte_index(addr); + pte = ptep_get(ptep); + if (!pte_present(*ptep)) + continue; + + pte_clear(&init_mm, addr, ptep); + if (is_vmemmap) + free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap); + } +} + +static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end, + bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pte_t *pte_base; + pmd_t *pmdp, pmd; + + for (; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + pmdp = pmd_base + pmd_index(addr); + pmd = pmdp_get(pmdp); + if (!pmd_present(pmd)) + continue; + + if (pmd_leaf(pmd)) { + pmd_clear(pmdp); + if (is_vmemmap) + free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap); + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmdp); + remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap); + free_pte_table(pte_base, pmdp); + } +} + +static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end, + bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pud_t *pudp, pud; + pmd_t *pmd_base; + + for (; addr < end; addr = next) { + next = pud_addr_end(addr, end); + pudp = pud_base + pud_index(addr); + pud = pudp_get(pudp); + if (!pud_present(pud)) + continue; + + if (pud_leaf(pud)) { + if (pgtable_l4_enabled) { + pud_clear(pudp); + if (is_vmemmap) + free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap); + } + continue; + } + + pmd_base = pmd_offset(pudp, 0); + remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap); + + if (pgtable_l4_enabled) + free_pmd_table(pmd_base, pudp, is_vmemmap); + } +} + +static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end, + bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + p4d_t *p4dp, p4d; + pud_t *pud_base; + + for (; addr < end; addr = next) { + next = p4d_addr_end(addr, end); + p4dp = p4d_base + p4d_index(addr); + p4d = p4dp_get(p4dp); + if (!p4d_present(p4d)) + continue; + + if (p4d_leaf(p4d)) { + if (pgtable_l5_enabled) { + p4d_clear(p4dp); + if (is_vmemmap) + free_vmemmap_storage(p4d_page(p4d), P4D_SIZE, altmap); + } + continue; + } + + pud_base = pud_offset(p4dp, 0); + remove_pud_mapping(pud_base, addr, next, is_vmemmap, altmap); + + if (pgtable_l5_enabled) + free_pud_table(pud_base, p4dp); + } +} + +static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end, bool is_vmemmap, + struct vmem_altmap *altmap) +{ + unsigned long addr, next; + p4d_t *p4d_base; + pgd_t *pgd; + + for (addr = va; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + pgd = pgd_offset_k(addr); + + if (!pgd_present(*pgd)) + continue; + + if (pgd_leaf(*pgd)) + continue; + + p4d_base = p4d_offset(pgd, 0); + remove_p4d_mapping(p4d_base, addr, next, is_vmemmap, altmap); + } + + flush_tlb_all(); +} + +static void __meminit remove_linear_mapping(phys_addr_t start, u64 size) +{ + unsigned long va = (unsigned long)__va(start); + unsigned long end = (unsigned long)__va(start + size); + + remove_pgd_mapping(va, 
end, false, NULL);
+}
+
+struct range arch_get_mappable_range(void)
+{
+	struct range mhp_range;
+
+	mhp_range.start = __pa(PAGE_OFFSET);
+	mhp_range.end = __pa(PAGE_END - 1);
+	return mhp_range;
+}
+
+int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
+{
+	int ret = 0;
+
+	create_linear_mapping_range(start, start + size, 0, &params->pgprot);
+	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
+	if (ret) {
+		remove_linear_mapping(start, size);
+		goto out;
+	}
+
+	max_pfn = PFN_UP(start + size);
+	max_low_pfn = max_pfn;
+
+out:
+	flush_tlb_all();
+	return ret;
+}
+
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+{
+	__remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
+	remove_linear_mapping(start, size);
+	flush_tlb_all();
+}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
+{
+	remove_pgd_mapping(start, end, true, altmap);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
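
A note for readers working through the best_map_size() hunk above: the function now walks down from P4D to PUD to PMD leaves, taking the largest size for which the physical address, the virtual address and the remaining length all line up. The standalone C sketch below mirrors that selection logic outside the kernel; the SKETCH_* size constants and the pick_map_size()/main() names are illustrative assumptions, not part of the patch.

/*
 * Standalone illustration only -- not kernel code.  The leaf sizes below
 * are assumed values for a 4 KiB-page RISC-V configuration; the kernel
 * derives the real PMD/PUD/P4D sizes from its page-table setup.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE (4ULL << 10)   /* 4 KiB */
#define SKETCH_PMD_SIZE  (2ULL << 20)   /* 2 MiB */
#define SKETCH_PUD_SIZE  (1ULL << 30)   /* 1 GiB */
#define SKETCH_P4D_SIZE  (512ULL << 30) /* 512 GiB */

/*
 * Return the largest leaf size whose alignment both the physical and the
 * virtual address satisfy and that still fits in the remaining length,
 * mirroring the patch's best_map_size().
 */
static uint64_t pick_map_size(uint64_t pa, uint64_t va, uint64_t size,
			      bool l4_enabled, bool l5_enabled)
{
	if (l5_enabled && !(pa & (SKETCH_P4D_SIZE - 1)) &&
	    !(va & (SKETCH_P4D_SIZE - 1)) && size >= SKETCH_P4D_SIZE)
		return SKETCH_P4D_SIZE;

	if (l4_enabled && !(pa & (SKETCH_PUD_SIZE - 1)) &&
	    !(va & (SKETCH_PUD_SIZE - 1)) && size >= SKETCH_PUD_SIZE)
		return SKETCH_PUD_SIZE;

	if (!(pa & (SKETCH_PMD_SIZE - 1)) && !(va & (SKETCH_PMD_SIZE - 1)) &&
	    size >= SKETCH_PMD_SIZE)
		return SKETCH_PMD_SIZE;

	return SKETCH_PAGE_SIZE;
}

int main(void)
{
	/* A 1 GiB-aligned, 3 GiB region can use 1 GiB (PUD) leaves. */
	uint64_t sz = pick_map_size(0x80000000ULL, 0xffffffd800000000ULL,
				    3ULL << 30, true, true);

	printf("map size: %llu bytes\n", (unsigned long long)sz);
	return 0;
}

Checking the virtual address as well as the physical one is what the rest of the patch enables: setup_bootmem() now aligns phys_ram_base on a PMD boundary and setup_vm() defers va_pa_offset until the DRAM base is known, so the linear mapping can actually satisfy these alignment tests.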
