diff options
Diffstat (limited to 'arch/riscv/mm/init.c')
| -rw-r--r-- | arch/riscv/mm/init.c | 889 |
1 files changed, 682 insertions, 207 deletions
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 747e5b1ef02d..addb8a9305be 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -20,20 +20,25 @@ #include <linux/dma-map-ops.h> #include <linux/crash_dump.h> #include <linux/hugetlb.h> -#ifdef CONFIG_RELOCATABLE -#include <linux/elf.h> -#endif +#include <linux/kfence.h> +#include <linux/execmem.h> +#include <asm/alternative.h> #include <asm/fixmap.h> -#include <asm/tlbflush.h> -#include <asm/sections.h> -#include <asm/soc.h> #include <asm/io.h> -#include <asm/ptdump.h> +#include <asm/kasan.h> +#include <asm/module.h> #include <asm/numa.h> +#include <asm/pgtable.h> +#include <asm/sections.h> +#include <asm/soc.h> +#include <asm/sparsemem.h> +#include <asm/tlbflush.h> #include "../kernel/head.h" +u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1]; + struct kernel_mapping kernel_map __ro_after_init; EXPORT_SYMBOL(kernel_map); #ifdef CONFIG_XIP_KERNEL @@ -47,14 +52,23 @@ u64 satp_mode __ro_after_init = SATP_MODE_32; #endif EXPORT_SYMBOL(satp_mode); -bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL); -bool pgtable_l5_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL); +#ifdef CONFIG_64BIT +bool pgtable_l4_enabled __ro_after_init = !IS_ENABLED(CONFIG_XIP_KERNEL); +bool pgtable_l5_enabled __ro_after_init = !IS_ENABLED(CONFIG_XIP_KERNEL); EXPORT_SYMBOL(pgtable_l4_enabled); EXPORT_SYMBOL(pgtable_l5_enabled); +#endif phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); +#ifdef CONFIG_SPARSEMEM_VMEMMAP +#define VMEMMAP_ADDR_ALIGN (1ULL << SECTION_SIZE_BITS) + +unsigned long vmemmap_start_pfn __ro_after_init; +EXPORT_SYMBOL(vmemmap_start_pfn); +#endif + unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -63,7 +77,7 @@ extern char _start[]; void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; -static phys_addr_t dma32_phys_limit __initdata; +phys_addr_t dma32_phys_limit __initdata; static void __init zone_sizes_init(void) { @@ -156,20 +170,36 @@ static void __init print_vm_layout(void) static void print_vm_layout(void) { } #endif /* CONFIG_DEBUG_VM */ -void __init mem_init(void) +void __init arch_mm_preinit(void) { + bool swiotlb = max_pfn > PFN_DOWN(dma32_phys_limit); #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif /* CONFIG_FLATMEM */ - swiotlb_init(max_pfn > PFN_DOWN(dma32_phys_limit), SWIOTLB_VERBOSE); - memblock_free_all(); + if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb && + dma_cache_alignment != 1) { + /* + * If no bouncing needed for ZONE_DMA, allocate 1MB swiotlb + * buffer per 1GB of RAM for kmalloc() bouncing on + * non-coherent platforms. + */ + unsigned long size = + DIV_ROUND_UP(memblock_phys_mem_size(), 1024); + swiotlb_adjust_size(min(swiotlb_size_or_default(), size)); + swiotlb = true; + } + + swiotlb_init(swiotlb, SWIOTLB_VERBOSE); print_vm_layout(); } /* Limit the memory size via mem. */ static phys_addr_t memory_limit; +#ifdef CONFIG_XIP_KERNEL +#define memory_limit (*(phys_addr_t *)XIP_FIXUP(&memory_limit)) +#endif /* CONFIG_XIP_KERNEL */ static int __init early_mem(char *p) { @@ -212,38 +242,59 @@ static void __init setup_bootmem(void) */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); - phys_ram_end = memblock_end_of_DRAM(); - if (!IS_ENABLED(CONFIG_XIP_KERNEL)) - phys_ram_base = memblock_start_of_DRAM(); + /* + * Make sure we align the start of the memory on a PMD boundary so that + * at worst, we map the linear mapping with PMD mappings. + */ + if (!IS_ENABLED(CONFIG_XIP_KERNEL)) { + phys_ram_base = memblock_start_of_DRAM() & PMD_MASK; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_start_pfn = round_down(phys_ram_base, VMEMMAP_ADDR_ALIGN) >> PAGE_SHIFT; +#endif + } /* * In 64-bit, any use of __va/__pa before this point is wrong as we * did not know the start of DRAM before. */ - if (IS_ENABLED(CONFIG_64BIT)) + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU)) kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base; /* - * memblock allocator is not aware of the fact that last 4K bytes of - * the addressable memory can not be mapped because of IS_ERR_VALUE - * macro. Make sure that last 4k bytes are not usable by memblock - * if end of dram is equal to maximum addressable memory. For 64-bit - * kernel, this problem can't happen here as the end of the virtual - * address space is occupied by the kernel mapping then this check must - * be done as soon as the kernel mapping base address is determined. + * The size of the linear page mapping may restrict the amount of + * usable RAM. + */ + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU)) { + max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE; + if (memblock_end_of_DRAM() > max_mapped_addr) { + memblock_cap_memory_range(phys_ram_base, + max_mapped_addr - phys_ram_base); + pr_warn("Physical memory overflows the linear mapping size: region above %pa removed", + &max_mapped_addr); + } + } + + /* + * Reserve physical address space that would be mapped to virtual + * addresses greater than (void *)(-PAGE_SIZE) because: + * - This memory would overlap with ERR_PTR + * - This memory belongs to high memory, which is not supported + * + * This is not applicable to 64-bit kernel, because virtual addresses + * after (void *)(-PAGE_SIZE) are not linearly mapped: they are + * occupied by kernel mapping. Also it is unrealistic for high memory + * to exist on 64-bit platforms. */ if (!IS_ENABLED(CONFIG_64BIT)) { - max_mapped_addr = __pa(~(ulong)0); - if (max_mapped_addr == (phys_ram_end - 1)) - memblock_set_current_limit(max_mapped_addr - 4096); + max_mapped_addr = __va_to_pa_nodebug(-PAGE_SIZE); + memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr); } + phys_ram_end = memblock_end_of_DRAM(); min_low_pfn = PFN_UP(phys_ram_base); max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end); - high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); - set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); reserve_initrd_mem(); @@ -266,11 +317,48 @@ static void __init setup_bootmem(void) dma_contiguous_reserve(dma32_phys_limit); if (IS_ENABLED(CONFIG_64BIT)) hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); - memblock_allow_resize(); } +#ifdef CONFIG_RELOCATABLE +extern unsigned long __rela_dyn_start, __rela_dyn_end; + +static void __init relocate_kernel(void) +{ + Elf_Rela *rela = (Elf_Rela *)&__rela_dyn_start; + /* + * This holds the offset between the linked virtual address and the + * relocated virtual address. + */ + uintptr_t reloc_offset = kernel_map.virt_addr - KERNEL_LINK_ADDR; + /* + * This holds the offset between kernel linked virtual address and + * physical address. + */ + uintptr_t va_kernel_link_pa_offset = KERNEL_LINK_ADDR - kernel_map.phys_addr; + + for ( ; rela < (Elf_Rela *)&__rela_dyn_end; rela++) { + Elf_Addr addr = (rela->r_offset - va_kernel_link_pa_offset); + Elf_Addr relocated_addr = rela->r_addend; + + if (rela->r_info != R_RISCV_RELATIVE) + continue; + + /* + * Make sure to not relocate vdso symbols like rt_sigreturn + * which are linked from the address 0 in vmlinux since + * vdso symbol addresses are actually used as an offset from + * mm->context.vdso in VDSO_OFFSET macro. + */ + if (relocated_addr >= KERNEL_LINK_ADDR) + relocated_addr += reloc_offset; + + *(Elf_Addr *)addr = relocated_addr; + } +} +#endif /* CONFIG_RELOCATABLE */ + #ifdef CONFIG_MMU -struct pt_alloc_ops pt_ops __initdata; +struct pt_alloc_ops pt_ops __meminitdata; pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; @@ -293,7 +381,7 @@ static const pgprot_t protection_map[16] = { [VM_EXEC] = PAGE_EXEC, [VM_EXEC | VM_READ] = PAGE_READ_EXEC, [VM_EXEC | VM_WRITE] = PAGE_COPY_EXEC, - [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_READ_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_EXEC, [VM_SHARED] = PAGE_NONE, [VM_SHARED | VM_READ] = PAGE_READ, [VM_SHARED | VM_WRITE] = PAGE_SHARED, @@ -332,7 +420,7 @@ static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t pa) return (pte_t *)set_fixmap_offset(FIX_PTE, pa); } -static inline pte_t *__init get_pte_virt_late(phys_addr_t pa) +static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa) { return (pte_t *) __va(pa); } @@ -351,19 +439,21 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t __init alloc_pte_late(uintptr_t va) +static phys_addr_t __meminit alloc_pte_late(uintptr_t va) { - unsigned long vaddr; - - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr))); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - return __pa(vaddr); + /* + * We do not know which mm the PTE page is associated to at this point. + * Passing NULL to the ctor is the safe option, though it may result + * in unnecessary work (e.g. initialising the ptlock for init_mm). + */ + BUG_ON(!ptdesc || !pagetable_pte_ctor(NULL, ptdesc)); + return __pa((pte_t *)ptdesc_address(ptdesc)); } -static void __init create_pte_mapping(pte_t *ptep, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot) { uintptr_t pte_idx = pte_index(va); @@ -417,7 +507,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa) return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); } -static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) +static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa) { return (pmd_t *) __va(pa); } @@ -434,19 +524,18 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t __init alloc_pmd_late(uintptr_t va) +static phys_addr_t __meminit alloc_pmd_late(uintptr_t va) { - unsigned long vaddr; - - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr || !pgtable_pmd_page_ctor(virt_to_page(vaddr))); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - return __pa(vaddr); + /* See comment in alloc_pte_late() regarding NULL passed the ctor */ + BUG_ON(!ptdesc || !pagetable_pmd_ctor(NULL, ptdesc)); + return __pa((pmd_t *)ptdesc_address(ptdesc)); } -static void __init create_pmd_mapping(pmd_t *pmdp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_pmd_mapping(pmd_t *pmdp, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) { pte_t *ptep; phys_addr_t pte_phys; @@ -482,7 +571,7 @@ static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) return (pud_t *)set_fixmap_offset(FIX_PUD, pa); } -static pud_t *__init get_pud_virt_late(phys_addr_t pa) +static pud_t *__meminit get_pud_virt_late(phys_addr_t pa) { return (pud_t *)__va(pa); } @@ -500,13 +589,13 @@ static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t alloc_pud_late(uintptr_t va) +static phys_addr_t __meminit alloc_pud_late(uintptr_t va) { - unsigned long vaddr; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr); - return __pa(vaddr); + BUG_ON(!ptdesc); + pagetable_pud_ctor(ptdesc); + return __pa((pud_t *)ptdesc_address(ptdesc)); } static p4d_t *__init get_p4d_virt_early(phys_addr_t pa) @@ -520,7 +609,7 @@ static p4d_t *__init get_p4d_virt_fixmap(phys_addr_t pa) return (p4d_t *)set_fixmap_offset(FIX_P4D, pa); } -static p4d_t *__init get_p4d_virt_late(phys_addr_t pa) +static p4d_t *__meminit get_p4d_virt_late(phys_addr_t pa) { return (p4d_t *)__va(pa); } @@ -538,18 +627,17 @@ static phys_addr_t __init alloc_p4d_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t alloc_p4d_late(uintptr_t va) +static phys_addr_t __meminit alloc_p4d_late(uintptr_t va) { - unsigned long vaddr; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr); - return __pa(vaddr); + BUG_ON(!ptdesc); + pagetable_p4d_ctor(ptdesc); + return __pa((p4d_t *)ptdesc_address(ptdesc)); } -static void __init create_pud_mapping(pud_t *pudp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_pud_mapping(pud_t *pudp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot) { pmd_t *nextp; phys_addr_t next_phys; @@ -574,9 +662,8 @@ static void __init create_pud_mapping(pud_t *pudp, create_pmd_mapping(nextp, va, pa, sz, prot); } -static void __init create_p4d_mapping(p4d_t *p4dp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_p4d_mapping(p4d_t *p4dp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot) { pud_t *nextp; phys_addr_t next_phys; @@ -632,9 +719,8 @@ static void __init create_p4d_mapping(p4d_t *p4dp, #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) #endif /* __PAGETABLE_PMD_FOLDED */ -void __init create_pgd_mapping(pgd_t *pgdp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot) { pgd_next_t *nextp; phys_addr_t next_phys; @@ -659,18 +745,21 @@ void __init create_pgd_mapping(pgd_t *pgdp, create_pgd_next_mapping(nextp, va, pa, sz, prot); } -static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) +static uintptr_t __meminit best_map_size(phys_addr_t pa, uintptr_t va, phys_addr_t size) { - if (!(base & (PGDIR_SIZE - 1)) && size >= PGDIR_SIZE) - return PGDIR_SIZE; + if (debug_pagealloc_enabled()) + return PAGE_SIZE; - if (!(base & (P4D_SIZE - 1)) && size >= P4D_SIZE) + if (pgtable_l5_enabled && + !(pa & (P4D_SIZE - 1)) && !(va & (P4D_SIZE - 1)) && size >= P4D_SIZE) return P4D_SIZE; - if (!(base & (PUD_SIZE - 1)) && size >= PUD_SIZE) + if (pgtable_l4_enabled && + !(pa & (PUD_SIZE - 1)) && !(va & (PUD_SIZE - 1)) && size >= PUD_SIZE) return PUD_SIZE; - if (!(base & (PMD_SIZE - 1)) && size >= PMD_SIZE) + if (IS_ENABLED(CONFIG_64BIT) && + !(pa & (PMD_SIZE - 1)) && !(va & (PMD_SIZE - 1)) && size >= PMD_SIZE) return PMD_SIZE; return PAGE_SIZE; @@ -692,7 +781,7 @@ asmlinkage void __init __copy_data(void) #endif #ifdef CONFIG_STRICT_KERNEL_RWX -static __init pgprot_t pgprot_from_va(uintptr_t va) +static __meminit pgprot_t pgprot_from_va(uintptr_t va) { if (is_va_kernel_text(va)) return PAGE_KERNEL_READ_EXEC; @@ -715,11 +804,9 @@ void mark_rodata_ro(void) if (IS_ENABLED(CONFIG_64BIT)) set_kernel_memory(lm_alias(__start_rodata), lm_alias(_data), set_memory_ro); - - debug_checkwx(); } #else -static __init pgprot_t pgprot_from_va(uintptr_t va) +static __meminit pgprot_t pgprot_from_va(uintptr_t va) { if (IS_ENABLED(CONFIG_64BIT) && !is_kernel_mapping(va)) return PAGE_KERNEL; @@ -730,6 +817,7 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) u64 __pi_set_satp_mode_from_cmdline(uintptr_t dtb_pa); +u64 __pi_set_satp_mode_from_fdt(uintptr_t dtb_pa); static void __init disable_pgtable_l5(void) { @@ -759,21 +847,32 @@ static int __init print_no5lvl(char *p) } early_param("no5lvl", print_no5lvl); +static void __init set_mmap_rnd_bits_max(void) +{ + mmap_rnd_bits_max = MMAP_VA_BITS - PAGE_SHIFT - 3; +} + /* * There is a simple way to determine if 4-level is supported by the * underlying hardware: establish 1:1 mapping in 4-level page table mode * then read SATP to see if the configuration was taken into account * meaning sv48 is supported. + * The maximum SATP mode is limited by both the command line and the "mmu-type" + * property in the device tree, since some platforms may hang if an unsupported + * SATP mode is attempted. */ static __init void set_satp_mode(uintptr_t dtb_pa) { u64 identity_satp, hw_satp; uintptr_t set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; - u64 satp_mode_cmdline = __pi_set_satp_mode_from_cmdline(dtb_pa); + u64 satp_mode_limit = min_not_zero(__pi_set_satp_mode_from_cmdline(dtb_pa), + __pi_set_satp_mode_from_fdt(dtb_pa)); - if (satp_mode_cmdline == SATP_MODE_57) { + kernel_map.page_offset = PAGE_OFFSET_L5; + + if (satp_mode_limit == SATP_MODE_48) { disable_pgtable_l5(); - } else if (satp_mode_cmdline == SATP_MODE_48) { + } else if (satp_mode_limit == SATP_MODE_39) { disable_pgtable_l5(); disable_pgtable_l4(); return; @@ -841,49 +940,11 @@ retry: #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." #endif -#ifdef CONFIG_RELOCATABLE -extern unsigned long __rela_dyn_start, __rela_dyn_end; - -static void __init relocate_kernel(void) -{ - Elf64_Rela *rela = (Elf64_Rela *)&__rela_dyn_start; - /* - * This holds the offset between the linked virtual address and the - * relocated virtual address. - */ - uintptr_t reloc_offset = kernel_map.virt_addr - KERNEL_LINK_ADDR; - /* - * This holds the offset between kernel linked virtual address and - * physical address. - */ - uintptr_t va_kernel_link_pa_offset = KERNEL_LINK_ADDR - kernel_map.phys_addr; - - for ( ; rela < (Elf64_Rela *)&__rela_dyn_end; rela++) { - Elf64_Addr addr = (rela->r_offset - va_kernel_link_pa_offset); - Elf64_Addr relocated_addr = rela->r_addend; - - if (rela->r_info != R_RISCV_RELATIVE) - continue; - - /* - * Make sure to not relocate vdso symbols like rt_sigreturn - * which are linked from the address 0 in vmlinux since - * vdso symbol addresses are actually used as an offset from - * mm->context.vdso in VDSO_OFFSET macro. - */ - if (relocated_addr >= KERNEL_LINK_ADDR) - relocated_addr += reloc_offset; - - *(Elf64_Addr *)addr = relocated_addr; - } -} -#endif /* CONFIG_RELOCATABLE */ - #ifdef CONFIG_XIP_KERNEL static void __init create_kernel_page_table(pgd_t *pgdir, __always_unused bool early) { - uintptr_t va, end_va; + uintptr_t va, start_va, end_va; /* Map the flash resident part */ end_va = kernel_map.virt_addr + kernel_map.xiprom_sz; @@ -893,10 +954,11 @@ static void __init create_kernel_page_table(pgd_t *pgdir, PMD_SIZE, PAGE_KERNEL_EXEC); /* Map the data in RAM */ - end_va = kernel_map.virt_addr + XIP_OFFSET + kernel_map.size; - for (va = kernel_map.virt_addr + XIP_OFFSET; va < end_va; va += PMD_SIZE) + start_va = kernel_map.virt_addr + (uintptr_t)&_sdata - (uintptr_t)&_start; + end_va = kernel_map.virt_addr + kernel_map.size; + for (va = start_va; va < end_va; va += PMD_SIZE) create_pgd_mapping(pgdir, va, - kernel_map.phys_addr + (va - (kernel_map.virt_addr + XIP_OFFSET)), + kernel_map.phys_addr + (va - start_va), PMD_SIZE, PAGE_KERNEL); } #else @@ -922,9 +984,9 @@ static void __init create_kernel_page_table(pgd_t *pgdir, bool early) static void __init create_fdt_early_page_table(uintptr_t fix_fdt_va, uintptr_t dtb_pa) { +#ifndef CONFIG_BUILTIN_DTB uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); -#ifndef CONFIG_BUILTIN_DTB /* Make sure the fdt fixmap address is always aligned on PMD size */ BUILD_BUG_ON(FIX_FDT % (PMD_SIZE / PAGE_SIZE)); @@ -947,7 +1009,7 @@ static void __init create_fdt_early_page_table(uintptr_t fix_fdt_va, * setup_vm_final installs the linear mapping. For 32-bit kernel, as the * kernel is mapped in the linear mapping, that makes no difference. */ - dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa)); + dtb_early_va = kernel_mapping_pa_to_va(dtb_pa); #endif dtb_early_pa = dtb_pa; @@ -1011,29 +1073,72 @@ static void __init pt_ops_set_late(void) #endif } +#ifdef CONFIG_RANDOMIZE_BASE +extern bool __init __pi_set_nokaslr_from_cmdline(uintptr_t dtb_pa); +extern u64 __init __pi_get_kaslr_seed(uintptr_t dtb_pa); +extern u64 __init __pi_get_kaslr_seed_zkr(const uintptr_t dtb_pa); + +static int __init print_nokaslr(char *p) +{ + pr_info("Disabled KASLR"); + return 0; +} +early_param("nokaslr", print_nokaslr); + +unsigned long kaslr_offset(void) +{ + return kernel_map.virt_offset; +} +#endif + asmlinkage void __init setup_vm(uintptr_t dtb_pa) { pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; - kernel_map.virt_addr = KERNEL_LINK_ADDR; - kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); +#ifdef CONFIG_RANDOMIZE_BASE + if (!__pi_set_nokaslr_from_cmdline(dtb_pa)) { + u64 kaslr_seed = __pi_get_kaslr_seed_zkr(dtb_pa); + u32 kernel_size = (uintptr_t)(&_end) - (uintptr_t)(&_start); + u32 nr_pos; + + if (kaslr_seed == 0) + kaslr_seed = __pi_get_kaslr_seed(dtb_pa); + /* + * Compute the number of positions available: we are limited + * by the early page table that only has one PUD and we must + * be aligned on PMD_SIZE. + */ + nr_pos = (PUD_SIZE - kernel_size) / PMD_SIZE; + + kernel_map.virt_offset = (kaslr_seed % nr_pos) * PMD_SIZE; + } +#endif + + kernel_map.virt_addr = KERNEL_LINK_ADDR + kernel_map.virt_offset; #ifdef CONFIG_XIP_KERNEL kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; kernel_map.xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom); phys_ram_base = CONFIG_PHYS_RAM_BASE; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_start_pfn = round_down(phys_ram_base, VMEMMAP_ADDR_ALIGN) >> PAGE_SHIFT; +#endif kernel_map.phys_addr = (uintptr_t)CONFIG_PHYS_RAM_BASE; - kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_sdata); + kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_start); - kernel_map.va_kernel_xip_pa_offset = kernel_map.virt_addr - kernel_map.xiprom; + kernel_map.va_kernel_xip_text_pa_offset = kernel_map.virt_addr - kernel_map.xiprom; + kernel_map.va_kernel_xip_data_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr + + (uintptr_t)&_sdata - (uintptr_t)&_start; #else kernel_map.phys_addr = (uintptr_t)(&_start); kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; + kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; #endif #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) set_satp_mode(dtb_pa); + set_mmap_rnd_bits_max(); #endif /* @@ -1050,15 +1155,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) */ kernel_map.va_pa_offset = IS_ENABLED(CONFIG_64BIT) ? 0UL : PAGE_OFFSET - kernel_map.phys_addr; - kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; - /* - * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit - * kernel, whereas for 64-bit kernel, the end of the virtual address - * space is occupied by the modules/BPF/kernel mappings which reduces - * the available size of the linear mapping. - */ - memory_limit = KERN_VIRT_SIZE - (IS_ENABLED(CONFIG_64BIT) ? SZ_4G : 0); + memory_limit = KERN_VIRT_SIZE; /* Sanity check alignment and size */ BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); @@ -1079,7 +1177,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) * makes the kernel cross over a PUD_SIZE boundary, raise a bug * since a part of the kernel would not get mapped. */ - BUG_ON(PUD_SIZE - (kernel_map.virt_addr & (PUD_SIZE - 1)) < kernel_map.size); + if (IS_ENABLED(CONFIG_64BIT)) + BUG_ON(PUD_SIZE - (kernel_map.virt_addr & (PUD_SIZE - 1)) < kernel_map.size); relocate_kernel(); #endif @@ -1166,24 +1265,26 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pt_ops_set_fixmap(); } -static void __init create_linear_mapping_range(phys_addr_t start, - phys_addr_t end) +static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t end, + uintptr_t fixed_map_size, const pgprot_t *pgprot) { phys_addr_t pa; uintptr_t va, map_size; for (pa = start; pa < end; pa += map_size) { va = (uintptr_t)__va(pa); - map_size = best_map_size(pa, end - pa); + map_size = fixed_map_size ? fixed_map_size : + best_map_size(pa, va, end - pa); create_pgd_mapping(swapper_pg_dir, va, pa, map_size, - pgprot_from_va(va)); + pgprot ? *pgprot : pgprot_from_va(va)); } } static void __init create_linear_mapping_page_table(void) { phys_addr_t start, end; + phys_addr_t kfence_pool __maybe_unused; u64 i; #ifdef CONFIG_STRICT_KERNEL_RWX @@ -1197,6 +1298,19 @@ static void __init create_linear_mapping_page_table(void) memblock_mark_nomap(krodata_start, krodata_size); #endif +#ifdef CONFIG_KFENCE + /* + * kfence pool must be backed by PAGE_SIZE mappings, so allocate it + * before we setup the linear mapping so that we avoid using hugepages + * for this region. + */ + kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + BUG_ON(!kfence_pool); + + memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); + __kfence_pool = __va(kfence_pool); +#endif + /* Map all memory banks in the linear mapping */ for_each_mem_range(i, &start, &end) { if (start >= end) @@ -1204,20 +1318,23 @@ static void __init create_linear_mapping_page_table(void) if (start <= __pa(PAGE_OFFSET) && __pa(PAGE_OFFSET) < end) start = __pa(PAGE_OFFSET); - if (end >= __pa(PAGE_OFFSET) + memory_limit) - end = __pa(PAGE_OFFSET) + memory_limit; - create_linear_mapping_range(start, end); + create_linear_mapping_range(start, end, 0, NULL); } #ifdef CONFIG_STRICT_KERNEL_RWX - create_linear_mapping_range(ktext_start, ktext_start + ktext_size); - create_linear_mapping_range(krodata_start, - krodata_start + krodata_size); + create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0, NULL); + create_linear_mapping_range(krodata_start, krodata_start + krodata_size, 0, NULL); memblock_clear_nomap(ktext_start, ktext_size); memblock_clear_nomap(krodata_start, krodata_size); #endif + +#ifdef CONFIG_KFENCE + create_linear_mapping_range(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, PAGE_SIZE, NULL); + + memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); +#endif } static void __init setup_vm_final(void) @@ -1265,6 +1382,12 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) { dtb_early_va = (void *)dtb_pa; dtb_early_pa = dtb_pa; + +#ifdef CONFIG_RELOCATABLE + kernel_map.virt_addr = (uintptr_t)_start; + kernel_map.phys_addr = (uintptr_t)_start; + relocate_kernel(); +#endif } static inline void setup_vm_final(void) @@ -1279,71 +1402,32 @@ static inline void setup_vm_final(void) * line parameter. The memory reserved is used by dump capture kernel when * primary kernel is crashing. */ -static void __init reserve_crashkernel(void) +static void __init arch_reserve_crashkernel(void) { - unsigned long long crash_base = 0; - unsigned long long crash_size = 0; - unsigned long search_start = memblock_start_of_DRAM(); - unsigned long search_end = memblock_end_of_DRAM(); - - int ret = 0; + unsigned long long low_size = 0; + unsigned long long crash_base, crash_size; + bool high = false; + int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; - /* - * Don't reserve a region for a crash kernel on a crash kernel - * since it doesn't make much sense and we have limited memory - * resources. - */ - if (is_kdump_kernel()) { - pr_info("crashkernel: ignoring reservation request\n"); - return; - } ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &crash_size, &crash_base); - if (ret || !crash_size) + &crash_size, &crash_base, + &low_size, NULL, &high); + if (ret) return; - crash_size = PAGE_ALIGN(crash_size); - - if (crash_base) { - search_start = crash_base; - search_end = crash_base + crash_size; - } - - /* - * Current riscv boot protocol requires 2MB alignment for - * RV64 and 4MB alignment for RV32 (hugepage size) - * - * Try to alloc from 32bit addressible physical memory so that - * swiotlb can work on the crash kernel. - */ - crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, - search_start, - min(search_end, (unsigned long) SZ_4G)); - if (crash_base == 0) { - /* Try again without restricting region to 32bit addressible memory */ - crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, - search_start, search_end); - if (crash_base == 0) { - pr_warn("crashkernel: couldn't allocate %lldKB\n", - crash_size >> 10); - return; - } - } - - pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", - crash_base, crash_base + crash_size, crash_size >> 20); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; + reserve_crashkernel_generic(crash_size, crash_base, low_size, high); } void __init paging_init(void) { setup_bootmem(); setup_vm_final(); + + /* Depend on that Linear Mapping is ready */ + memblock_allow_resize(); } void __init misc_mem_init(void) @@ -1351,15 +1435,406 @@ void __init misc_mem_init(void) early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); arch_numa_init(); sparse_init(); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* The entire VMEMMAP region has been populated. Flush TLB for this region */ + local_flush_tlb_kernel_range(VMEMMAP_START, VMEMMAP_END); +#endif zone_sizes_init(); - reserve_crashkernel(); + arch_reserve_crashkernel(); memblock_dump_all(); } #ifdef CONFIG_SPARSEMEM_VMEMMAP +void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) +{ + pmd_set_huge(pmd, virt_to_phys(p), PAGE_KERNEL); +} + +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + vmemmap_verify((pte_t *)pmdp, node, addr, next); + return 1; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { - return vmemmap_populate_basepages(start, end, node, NULL); + /* + * Note that SPARSEMEM_VMEMMAP is only selected for rv64 and that we + * can't use hugepage mappings for 2-level page table because in case of + * memory hotplug, we are not able to update all the page tables with + * the new PMDs. + */ + return vmemmap_populate_hugepages(start, end, node, altmap); } #endif + +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +/* + * Pre-allocates page-table pages for a specific area in the kernel + * page-table. Only the level which needs to be synchronized between + * all page-tables is allocated because the synchronization can be + * expensive. + */ +static void __init preallocate_pgd_pages_range(unsigned long start, unsigned long end, + const char *area) +{ + unsigned long addr; + const char *lvl; + + for (addr = start; addr < end && addr >= start; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + lvl = "p4d"; + p4d = p4d_alloc(&init_mm, pgd, addr); + if (!p4d) + goto failed; + + if (pgtable_l5_enabled) + continue; + + lvl = "pud"; + pud = pud_alloc(&init_mm, p4d, addr); + if (!pud) + goto failed; + + if (pgtable_l4_enabled) + continue; + + lvl = "pmd"; + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + goto failed; + } + return; + +failed: + /* + * The pages have to be there now or they will be missing in + * process page-tables later. + */ + panic("Failed to pre-allocate %s pages for %s area\n", lvl, area); +} + +#define PAGE_END KASAN_SHADOW_START + +void __init pgtable_cache_init(void) +{ + preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc"); + if (IS_ENABLED(CONFIG_MODULES)) + preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, "bpf/modules"); + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { + preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, "vmemmap"); + preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct map"); + if (IS_ENABLED(CONFIG_KASAN)) + preallocate_pgd_pages_range(KASAN_SHADOW_START, KASAN_SHADOW_END, "kasan"); + } +} +#endif + +#ifdef CONFIG_EXECMEM +#ifdef CONFIG_MMU +static struct execmem_info execmem_info __ro_after_init; + +struct execmem_info __init *execmem_arch_setup(void) +{ + execmem_info = (struct execmem_info){ + .ranges = { + [EXECMEM_DEFAULT] = { + .start = MODULES_VADDR, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = 1, + }, + [EXECMEM_KPROBES] = { + .start = VMALLOC_START, + .end = VMALLOC_END, + .pgprot = PAGE_KERNEL_READ_EXEC, + .alignment = 1, + }, + [EXECMEM_BPF] = { + .start = BPF_JIT_REGION_START, + .end = BPF_JIT_REGION_END, + .pgprot = PAGE_KERNEL, + .alignment = PAGE_SIZE, + }, + }, + }; + + return &execmem_info; +} +#endif /* CONFIG_MMU */ +#endif /* CONFIG_EXECMEM */ + +#ifdef CONFIG_MEMORY_HOTPLUG +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + struct page *page = pmd_page(*pmd); + struct ptdesc *ptdesc = page_ptdesc(page); + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pagetable_dtor(ptdesc); + if (PageReserved(page)) + free_reserved_page(page); + else + pagetable_free(ptdesc); + pmd_clear(pmd); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool is_vmemmap) +{ + struct page *page = pud_page(*pud); + struct ptdesc *ptdesc = page_ptdesc(page); + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + if (!is_vmemmap) + pagetable_dtor(ptdesc); + if (PageReserved(page)) + free_reserved_page(page); + else + pagetable_free(ptdesc); + pud_clear(pud); +} + +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + struct page *page = p4d_page(*p4d); + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + if (PageReserved(page)) + free_reserved_page(page); + else + __free_pages(page, 0); + p4d_clear(p4d); +} + +static void __meminit free_vmemmap_storage(struct page *page, size_t size, + struct vmem_altmap *altmap) +{ + int order = get_order(size); + + if (altmap) { + vmem_altmap_free(altmap, size >> PAGE_SHIFT); + return; + } + + if (PageReserved(page)) { + unsigned int nr_pages = 1 << order; + + while (nr_pages--) + free_reserved_page(page++); + return; + } + + __free_pages(page, order); +} + +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end, + bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pte_t *ptep, pte; + + for (; addr < end; addr = next) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + ptep = pte_base + pte_index(addr); + pte = ptep_get(ptep); + if (!pte_present(*ptep)) + continue; + + pte_clear(&init_mm, addr, ptep); + if (is_vmemmap) + free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap); + } +} + +static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end, + bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pte_t *pte_base; + pmd_t *pmdp, pmd; + + for (; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + pmdp = pmd_base + pmd_index(addr); + pmd = pmdp_get(pmdp); + if (!pmd_present(pmd)) + continue; + + if (pmd_leaf(pmd)) { + pmd_clear(pmdp); + if (is_vmemmap) + free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap); + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmdp); + remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap); + free_pte_table(pte_base, pmdp); + } +} + +static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end, + bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pud_t *pudp, pud; + pmd_t *pmd_base; + + for (; addr < end; addr = next) { + next = pud_addr_end(addr, end); + pudp = pud_base + pud_index(addr); + pud = pudp_get(pudp); + if (!pud_present(pud)) + continue; + + if (pud_leaf(pud)) { + if (pgtable_l4_enabled) { + pud_clear(pudp); + if (is_vmemmap) + free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap); + } + continue; + } + + pmd_base = pmd_offset(pudp, 0); + remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap); + + if (pgtable_l4_enabled) + free_pmd_table(pmd_base, pudp, is_vmemmap); + } +} + +static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end, + bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + p4d_t *p4dp, p4d; + pud_t *pud_base; + + for (; addr < end; addr = next) { + next = p4d_addr_end(addr, end); + p4dp = p4d_base + p4d_index(addr); + p4d = p4dp_get(p4dp); + if (!p4d_present(p4d)) + continue; + + if (p4d_leaf(p4d)) { + if (pgtable_l5_enabled) { + p4d_clear(p4dp); + if (is_vmemmap) + free_vmemmap_storage(p4d_page(p4d), P4D_SIZE, altmap); + } + continue; + } + + pud_base = pud_offset(p4dp, 0); + remove_pud_mapping(pud_base, addr, next, is_vmemmap, altmap); + + if (pgtable_l5_enabled) + free_pud_table(pud_base, p4dp); + } +} + +static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end, bool is_vmemmap, + struct vmem_altmap *altmap) +{ + unsigned long addr, next; + p4d_t *p4d_base; + pgd_t *pgd; + + for (addr = va; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + pgd = pgd_offset_k(addr); + + if (!pgd_present(*pgd)) + continue; + + if (pgd_leaf(*pgd)) + continue; + + p4d_base = p4d_offset(pgd, 0); + remove_p4d_mapping(p4d_base, addr, next, is_vmemmap, altmap); + } + + flush_tlb_all(); +} + +static void __meminit remove_linear_mapping(phys_addr_t start, u64 size) +{ + unsigned long va = (unsigned long)__va(start); + unsigned long end = (unsigned long)__va(start + size); + + remove_pgd_mapping(va, end, false, NULL); +} + +struct range arch_get_mappable_range(void) +{ + struct range mhp_range; + + mhp_range.start = __pa(PAGE_OFFSET); + mhp_range.end = __pa(PAGE_END - 1); + return mhp_range; +} + +int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params) +{ + int ret = 0; + + create_linear_mapping_range(start, start + size, 0, ¶ms->pgprot); + ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params); + if (ret) { + remove_linear_mapping(start, size); + goto out; + } + + max_pfn = PFN_UP(start + size); + max_low_pfn = max_pfn; + + out: + flush_tlb_all(); + return ret; +} + +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +{ + __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap); + remove_linear_mapping(start, size); + flush_tlb_all(); +} + +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) +{ + remove_pgd_mapping(start, end, true, altmap); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ |
