diff options
Diffstat (limited to 'arch/arm64/mm')
-rw-r--r-- | arch/arm64/mm/copypage.c | 9 | ||||
-rw-r--r-- | arch/arm64/mm/fault.c | 147 | ||||
-rw-r--r-- | arch/arm64/mm/init.c | 125 | ||||
-rw-r--r-- | arch/arm64/mm/kasan_init.c | 19 | ||||
-rw-r--r-- | arch/arm64/mm/mmap.c | 21 | ||||
-rw-r--r-- | arch/arm64/mm/mmu.c | 149 | ||||
-rw-r--r-- | arch/arm64/mm/mteswap.c | 9 | ||||
-rw-r--r-- | arch/arm64/mm/pageattr.c | 6 | ||||
-rw-r--r-- | arch/arm64/mm/proc.S | 27 | ||||
-rw-r--r-- | arch/arm64/mm/ptdump.c | 6 |
10 files changed, 327 insertions, 191 deletions
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 70a71f38b6a9..b5447e53cd73 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -23,6 +23,15 @@ void copy_highpage(struct page *to, struct page *from) if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) { set_bit(PG_mte_tagged, &to->flags); + page_kasan_tag_reset(to); + /* + * We need smp_wmb() in between setting the flags and clearing the + * tags because if another thread reads page->flags and builds a + * tagged address out of it, there is an actual dependency to the + * memory access, but on the current thread we do not guarantee that + * the new page->flags are visible before the tags were updated. + */ + smp_wmb(); mte_copy_page_tags(kto, kfrom); } } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 795d224f184f..3c40da479899 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -14,6 +14,7 @@ #include <linux/mm.h> #include <linux/hardirq.h> #include <linux/init.h> +#include <linux/kasan.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/page-flags.h> @@ -33,6 +34,7 @@ #include <asm/debug-monitors.h> #include <asm/esr.h> #include <asm/kprobes.h> +#include <asm/mte.h> #include <asm/processor.h> #include <asm/sysreg.h> #include <asm/system_misc.h> @@ -40,7 +42,7 @@ #include <asm/traps.h> struct fault_info { - int (*fn)(unsigned long addr, unsigned int esr, + int (*fn)(unsigned long far, unsigned int esr, struct pt_regs *regs); int sig; int code; @@ -296,6 +298,57 @@ static void die_kernel_fault(const char *msg, unsigned long addr, do_exit(SIGKILL); } +#ifdef CONFIG_KASAN_HW_TAGS +static void report_tag_fault(unsigned long addr, unsigned int esr, + struct pt_regs *regs) +{ + bool is_write = ((esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT) != 0; + + /* + * SAS bits aren't set for all faults reported in EL1, so we can't + * find out access size. + */ + kasan_report(addr, 0, is_write, regs->pc); +} +#else +/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */ +static inline void report_tag_fault(unsigned long addr, unsigned int esr, + struct pt_regs *regs) { } +#endif + +static void do_tag_recovery(unsigned long addr, unsigned int esr, + struct pt_regs *regs) +{ + static bool reported; + + if (!READ_ONCE(reported)) { + report_tag_fault(addr, esr, regs); + WRITE_ONCE(reported, true); + } + + /* + * Disable MTE Tag Checking on the local CPU for the current EL. + * It will be done lazily on the other CPUs when they will hit a + * tag fault. + */ + sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, SCTLR_ELx_TCF_NONE); + isb(); +} + +static bool is_el1_mte_sync_tag_check_fault(unsigned int esr) +{ + unsigned int ec = ESR_ELx_EC(esr); + unsigned int fsc = esr & ESR_ELx_FSC; + + if (ec != ESR_ELx_EC_DABT_CUR) + return false; + + if (fsc == ESR_ELx_FSC_MTE) + return true; + + return false; +} + static void __do_kernel_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -312,6 +365,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr)) return; + if (is_el1_mte_sync_tag_check_fault(esr)) { + do_tag_recovery(addr, esr, regs); + + return; + } + if (is_el1_permission_fault(addr, esr, regs)) { if (esr & ESR_ELx_WNR) msg = "write to read-only memory"; @@ -385,8 +444,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr) current->thread.fault_code = esr; } -static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs) +static void do_bad_area(unsigned long far, unsigned int esr, + struct pt_regs *regs) { + unsigned long addr = untagged_addr(far); + /* * If we are in kernel mode at this point, we have no context to * handle this fault with. @@ -395,8 +457,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re const struct fault_info *inf = esr_to_fault_info(esr); set_thread_esr(addr, esr); - arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr, - inf->name); + arm64_force_sig_fault(inf->sig, inf->code, far, inf->name); } else { __do_kernel_fault(addr, esr, regs); } @@ -448,7 +509,7 @@ static bool is_write_abort(unsigned int esr) return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); } -static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, +static int __kprobes do_page_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf; @@ -456,6 +517,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, vm_fault_t fault; unsigned long vm_flags = VM_ACCESS_FLAGS; unsigned int mm_flags = FAULT_FLAG_DEFAULT; + unsigned long addr = untagged_addr(far); if (kprobe_page_fault(regs, esr)) return 0; @@ -479,11 +541,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) { - /* regs->orig_addr_limit may be 0 if we entered from EL0 */ - if (regs->orig_addr_limit == KERNEL_DS) - die_kernel_fault("access to user memory with fs=KERNEL_DS", - addr, esr, regs); - if (is_el1_instruction_abort(esr)) die_kernel_fault("execution of user memory", addr, esr, regs); @@ -567,8 +624,7 @@ retry: * We had some memory, but were unable to successfully fix up * this page fault. */ - arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr, - inf->name); + arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name); } else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) { unsigned int lsb; @@ -576,8 +632,7 @@ retry: if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); - arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb, - inf->name); + arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name); } else { /* * Something tried to access memory that isn't in our memory @@ -585,8 +640,7 @@ retry: */ arm64_force_sig_fault(SIGSEGV, fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR, - (void __user *)addr, - inf->name); + far, inf->name); } return 0; @@ -596,33 +650,35 @@ no_context: return 0; } -static int __kprobes do_translation_fault(unsigned long addr, +static int __kprobes do_translation_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { + unsigned long addr = untagged_addr(far); + if (is_ttbr0_addr(addr)) - return do_page_fault(addr, esr, regs); + return do_page_fault(far, esr, regs); - do_bad_area(addr, esr, regs); + do_bad_area(far, esr, regs); return 0; } -static int do_alignment_fault(unsigned long addr, unsigned int esr, +static int do_alignment_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { - do_bad_area(addr, esr, regs); + do_bad_area(far, esr, regs); return 0; } -static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) +static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs) { return 1; /* "fault" */ } -static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) +static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf; - void __user *siaddr; + unsigned long siaddr; inf = esr_to_fault_info(esr); @@ -634,19 +690,30 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 0; } - if (esr & ESR_ELx_FnV) - siaddr = NULL; - else - siaddr = (void __user *)addr; + if (esr & ESR_ELx_FnV) { + siaddr = 0; + } else { + /* + * The architecture specifies that the tag bits of FAR_EL1 are + * UNKNOWN for synchronous external aborts. Mask them out now + * so that userspace doesn't see them. + */ + siaddr = untagged_addr(far); + } arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr); return 0; } -static int do_tag_check_fault(unsigned long addr, unsigned int esr, +static int do_tag_check_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { - do_bad_area(addr, esr, regs); + /* + * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN for tag + * check faults. Mask them out now so that userspace doesn't see them. + */ + far &= (1UL << 60) - 1; + do_bad_area(far, esr, regs); return 0; } @@ -717,11 +784,12 @@ static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "unknown 63" }, }; -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf = esr_to_fault_info(esr); + unsigned long addr = untagged_addr(far); - if (!inf->fn(addr, esr, regs)) + if (!inf->fn(far, esr, regs)) return; if (!user_mode(regs)) { @@ -730,8 +798,12 @@ void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) show_pte(addr); } - arm64_notify_die(inf->name, regs, - inf->sig, inf->code, (void __user *)addr, esr); + /* + * At this point we have an unrecognized fault type whose tag bits may + * have been defined as UNKNOWN. Therefore we only expose the untagged + * address to the signal handler. + */ + arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr); } NOKPROBE_SYMBOL(do_mem_abort); @@ -744,8 +816,8 @@ NOKPROBE_SYMBOL(do_el0_irq_bp_hardening); void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - arm64_notify_die("SP/PC alignment exception", regs, - SIGBUS, BUS_ADRALN, (void __user *)addr, esr); + arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN, + addr, esr); } NOKPROBE_SYMBOL(do_sp_pc_abort); @@ -846,8 +918,7 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, arm64_apply_bp_hardening(); if (inf->fn(addr_if_watchpoint, esr, regs)) { - arm64_notify_die(inf->name, regs, - inf->sig, inf->code, (void __user *)pc, esr); + arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr); } debug_exception_exit(regs); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 095540667f0f..7deddf56f7c3 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -29,6 +29,7 @@ #include <linux/kexec.h> #include <linux/crash_dump.h> #include <linux/hugetlb.h> +#include <linux/acpi_iort.h> #include <asm/boot.h> #include <asm/fixmap.h> @@ -42,8 +43,6 @@ #include <asm/tlb.h> #include <asm/alternative.h> -#define ARM64_ZONE_DMA_BITS 30 - /* * We need to be able to catch inadvertent references to memstart_addr * that occur (potentially in generic code) before arm64_memblock_init() @@ -60,7 +59,7 @@ EXPORT_SYMBOL(memstart_addr); * bit addressable memory area. */ phys_addr_t arm64_dma_phys_limit __ro_after_init; -static phys_addr_t arm64_dma32_phys_limit __ro_after_init; +phys_addr_t arm64_dma32_phys_limit __ro_after_init; #ifdef CONFIG_KEXEC_CORE /* @@ -175,21 +174,34 @@ static void __init reserve_elfcorehdr(void) #endif /* CONFIG_CRASH_DUMP */ /* - * Return the maximum physical address for a zone with a given address size - * limit. It currently assumes that for memory starting above 4G, 32-bit - * devices will use a DMA offset. + * Return the maximum physical address for a zone accessible by the given bits + * limit. If DRAM starts above 32-bit, expand the zone to the maximum + * available memory, otherwise cap it at 32-bit. */ static phys_addr_t __init max_zone_phys(unsigned int zone_bits) { - phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, zone_bits); - return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM()); + phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); + phys_addr_t phys_start = memblock_start_of_DRAM(); + + if (phys_start > U32_MAX) + zone_mask = PHYS_ADDR_MAX; + else if (phys_start > zone_mask) + zone_mask = U32_MAX; + + return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; } static void __init zone_sizes_init(unsigned long min, unsigned long max) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; + unsigned int __maybe_unused acpi_zone_dma_bits; + unsigned int __maybe_unused dt_zone_dma_bits; #ifdef CONFIG_ZONE_DMA + acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); + dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); + zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); + arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif #ifdef CONFIG_ZONE_DMA32 @@ -269,7 +281,7 @@ static void __init fdt_enforce_memory_region(void) void __init arm64_memblock_init(void) { - const s64 linear_region_size = BIT(vabits_actual - 1); + const s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual); /* Handle linux,usable-memory-range property */ fdt_enforce_memory_region(); @@ -283,6 +295,9 @@ void __init arm64_memblock_init(void) memstart_addr = round_down(memblock_start_of_DRAM(), ARM64_MEMSTART_ALIGN); + if ((memblock_end_of_DRAM() - memstart_addr) > linear_region_size) + pr_warn("Memory doesn't fit in the linear mapping, VA_BITS too small\n"); + /* * Remove the memory that we will not be able to cover with the * linear mapping. Take care not to clip the kernel which may be @@ -348,15 +363,18 @@ void __init arm64_memblock_init(void) if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern u16 memstart_offset_seed; - u64 range = linear_region_size - - (memblock_end_of_DRAM() - memblock_start_of_DRAM()); + u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); + int parange = cpuid_feature_extract_unsigned_field( + mmfr0, ID_AA64MMFR0_PARANGE_SHIFT); + s64 range = linear_region_size - + BIT(id_aa64mmfr0_parange_to_phys_shift(parange)); /* * If the size of the linear region exceeds, by a sufficient - * margin, the size of the region that the available physical - * memory spans, randomize the linear region as well. + * margin, the size of the region that the physical memory can + * span, randomize the linear region as well. */ - if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) { + if (memstart_offset_seed > 0 && range >= (s64)ARM64_MEMSTART_ALIGN) { range /= ARM64_MEMSTART_ALIGN; memstart_addr -= ARM64_MEMSTART_ALIGN * ((range * memstart_offset_seed) >> 16); @@ -367,7 +385,7 @@ void __init arm64_memblock_init(void) * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. */ - memblock_reserve(__pa_symbol(_text), _end - _text); + memblock_reserve(__pa_symbol(_stext), _end - _stext); if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { /* the generic initrd code expects virtual addresses */ initrd_start = __phys_to_virt(phys_initrd_start); @@ -376,18 +394,11 @@ void __init arm64_memblock_init(void) early_init_fdt_scan_reserved_mem(); - if (IS_ENABLED(CONFIG_ZONE_DMA)) { - zone_dma_bits = ARM64_ZONE_DMA_BITS; - arm64_dma_phys_limit = max_zone_phys(ARM64_ZONE_DMA_BITS); - } - if (IS_ENABLED(CONFIG_ZONE_DMA32)) arm64_dma32_phys_limit = max_zone_phys(32); else arm64_dma32_phys_limit = PHYS_MASK + 1; - reserve_crashkernel(); - reserve_elfcorehdr(); high_memory = __va(memblock_end_of_DRAM() - 1) + 1; @@ -427,73 +438,14 @@ void __init bootmem_init(void) sparse_init(); zone_sizes_init(min, max); - memblock_dump_all(); -} - -#ifndef CONFIG_SPARSEMEM_VMEMMAP -static inline void free_memmap(unsigned long start_pfn, unsigned long end_pfn) -{ - struct page *start_pg, *end_pg; - unsigned long pg, pgend; - - /* - * Convert start_pfn/end_pfn to a struct page pointer. - */ - start_pg = pfn_to_page(start_pfn - 1) + 1; - end_pg = pfn_to_page(end_pfn - 1) + 1; - /* - * Convert to physical addresses, and round start upwards and end - * downwards. + * request_standard_resources() depends on crashkernel's memory being + * reserved, so do it here. */ - pg = (unsigned long)PAGE_ALIGN(__pa(start_pg)); - pgend = (unsigned long)__pa(end_pg) & PAGE_MASK; - - /* - * If there are free pages between these, free the section of the - * memmap array. - */ - if (pg < pgend) - memblock_free(pg, pgend - pg); -} - -/* - * The mem_map array can get very big. Free the unused area of the memory map. - */ -static void __init free_unused_memmap(void) -{ - unsigned long start, end, prev_end = 0; - int i; - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { -#ifdef CONFIG_SPARSEMEM - /* - * Take care not to free memmap entries that don't exist due - * to SPARSEMEM sections which aren't present. - */ - start = min(start, ALIGN(prev_end, PAGES_PER_SECTION)); -#endif - /* - * If we had a previous bank, and there is a space between the - * current bank and the previous, free it. - */ - if (prev_end && prev_end < start) - free_memmap(prev_end, start); - - /* - * Align up here since the VM subsystem insists that the - * memmap entries are valid from the bank end aligned to - * MAX_ORDER_NR_PAGES. - */ - prev_end = ALIGN(end, MAX_ORDER_NR_PAGES); - } + reserve_crashkernel(); -#ifdef CONFIG_SPARSEMEM - if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) - free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION)); -#endif + memblock_dump_all(); } -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ /* * mem_init() marks the free areas in the mem_map and tells us how much memory @@ -510,9 +462,6 @@ void __init mem_init(void) set_max_mapnr(max_pfn - PHYS_PFN_OFFSET); -#ifndef CONFIG_SPARSEMEM_VMEMMAP - free_unused_memmap(); -#endif /* this will put all unused low memory onto the freelists */ memblock_free_all(); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index b24e43d20667..d8e66c78440e 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -21,6 +21,8 @@ #include <asm/sections.h> #include <asm/tlbflush.h> +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); /* @@ -208,7 +210,7 @@ static void __init clear_pgds(unsigned long start, set_pgd(pgd_offset_k(start), __pgd(0)); } -void __init kasan_init(void) +static void __init kasan_init_shadow(void) { u64 kimg_shadow_start, kimg_shadow_end; u64 mod_shadow_start, mod_shadow_end; @@ -269,8 +271,21 @@ void __init kasan_init(void) memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE); cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); +} - /* At this point kasan is fully initialized. Enable error messages */ +static void __init kasan_init_depth(void) +{ init_task.kasan_depth = 0; +} + +void __init kasan_init(void) +{ + kasan_init_shadow(); + kasan_init_depth(); +#if defined(CONFIG_KASAN_GENERIC) + /* CONFIG_KASAN_SW_TAGS also requires kasan_init_sw_tags(). */ pr_info("KernelAddressSanitizer initialized\n"); +#endif } + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 3028bacbc4e9..07937b49cb88 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -47,24 +47,3 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) { return !(((pfn << PAGE_SHIFT) + size) & ~PHYS_MASK); } - -#ifdef CONFIG_STRICT_DEVMEM - -#include <linux/ioport.h> - -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. We mimic x86 here by - * disallowing access to system RAM as well as device-exclusive MMIO regions. - * This effectively disable read()/write() on /dev/mem. - */ -int devmem_is_allowed(unsigned long pfn) -{ - if (iomem_is_exclusive(pfn << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pfn)) - return 1; - return 0; -} - -#endif diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ca692a815731..ae0c3d023824 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -464,20 +464,35 @@ void __init mark_linear_text_alias_ro(void) /* * Remove the write permissions from the linear alias of .text/.rodata */ - update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), - (unsigned long)__init_begin - (unsigned long)_text, + update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext), + (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_RO); } +static bool crash_mem_map __initdata; + +static int __init enable_crash_mem_map(char *arg) +{ + /* + * Proper parameter parsing is done by reserve_crashkernel(). We only + * need to know if the linear map has to avoid block mappings so that + * the crashkernel reservations can be unmapped later. + */ + crash_mem_map = true; + + return 0; +} +early_param("crashkernel", enable_crash_mem_map); + static void __init map_mem(pgd_t *pgdp) { - phys_addr_t kernel_start = __pa_symbol(_text); + phys_addr_t kernel_start = __pa_symbol(_stext); phys_addr_t kernel_end = __pa_symbol(__init_begin); phys_addr_t start, end; int flags = 0; u64 i; - if (rodata_full || debug_pagealloc_enabled()) + if (rodata_full || crash_mem_map || debug_pagealloc_enabled()) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* @@ -487,11 +502,6 @@ static void __init map_mem(pgd_t *pgdp) * the following for-loop */ memblock_mark_nomap(kernel_start, kernel_end - kernel_start); -#ifdef CONFIG_KEXEC_CORE - if (crashk_res.end) - memblock_mark_nomap(crashk_res.start, - resource_size(&crashk_res)); -#endif /* map all the memory banks */ for_each_mem_range(i, &start, &end) { @@ -506,7 +516,7 @@ static void __init map_mem(pgd_t *pgdp) } /* - * Map the linear alias of the [_text, __init_begin) interval + * Map the linear alias of the [_stext, __init_begin) interval * as non-executable now, and remove the write permission in * mark_linear_text_alias_ro() below (which will be called after * alternative patching has completed). This makes the contents @@ -518,21 +528,6 @@ static void __init map_mem(pgd_t *pgdp) __map_memblock(pgdp, kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS); memblock_clear_nomap(kernel_start, kernel_end - kernel_start); - -#ifdef CONFIG_KEXEC_CORE - /* - * Use page-level mappings here so that we can shrink the region - * in page granularity and put back unused memory to buddy system - * through /sys/kernel/kexec_crash_size interface. - */ - if (crashk_res.end) { - __map_memblock(pgdp, crashk_res.start, crashk_res.end + 1, - PAGE_KERNEL, - NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); - memblock_clear_nomap(crashk_res.start, - resource_size(&crashk_res)); - } -#endif } void mark_rodata_ro(void) @@ -665,7 +660,7 @@ static void __init map_kernel(pgd_t *pgdp) * Only rodata will be remapped with different permissions later on, * all other segments are allowed to use contiguous mappings. */ - map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0, + map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0, VM_NO_GUARD); map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL, &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD); @@ -1132,8 +1127,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, void *p = NULL; p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); - if (!p) - return -ENOMEM; + if (!p) { + if (vmemmap_populate_basepages(addr, next, node, altmap)) + return -ENOMEM; + continue; + } pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL)); } else @@ -1510,13 +1508,43 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb, unsigned long end_pfn = arg->start_pfn + arg->nr_pages; unsigned long pfn = arg->start_pfn; - if (action != MEM_GOING_OFFLINE) + if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE)) return NOTIFY_OK; for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long start = PFN_PHYS(pfn); + unsigned long end = start + (1UL << PA_SECTION_SHIFT); + ms = __pfn_to_section(pfn); - if (early_section(ms)) + if (!early_section(ms)) + continue; + + if (action == MEM_GOING_OFFLINE) { + /* + * Boot memory removal is not supported. Prevent + * it via blocking any attempted offline request + * for the boot memory and just report it. + */ + pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end); return NOTIFY_BAD; + } else if (action == MEM_OFFLINE) { + /* + * This should have never happened. Boot memory + * offlining should have been prevented by this + * very notifier. Probably some memory removal + * procedure might have changed which would then + * require further debug. + */ + pr_err("Boot memory [%lx %lx] offlined\n", start, end); + + /* + * Core memory hotplug does not process a return + * code from the notifier for MEM_OFFLINE events. + * The error condition has been reported. Return + * from here as if ignored. + */ + return NOTIFY_DONE; + } } return NOTIFY_OK; } @@ -1525,9 +1553,66 @@ static struct notifier_block prevent_bootmem_remove_nb = { .notifier_call = prevent_bootmem_remove_notifier, }; +/* + * This ensures that boot memory sections on the platform are online + * from early boot. Memory sections could not be prevented from being + * offlined, unless for some reason they are not online to begin with. + * This helps validate the basic assumption on which the above memory + * event notifier works to prevent boot memory section offlining and + * its possible removal. + */ +static void validate_bootmem_online(void) +{ + phys_addr_t start, end, addr; + struct mem_section *ms; + u64 i; + + /* + * Scanning across all memblock might be expensive + * on some big memory systems. Hence enable this + * validation only with DEBUG_VM. + */ + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + + for_each_mem_range(i, &start, &end) { + for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) { + ms = __pfn_to_section(PHYS_PFN(addr)); + + /* + * All memory ranges in the system at this point + * should have been marked as early sections. + */ + WARN_ON(!early_section(ms)); + + /* + * Memory notifier mechanism here to prevent boot + * memory offlining depends on the fact that each + * early section memory on the system is initially + * online. Otherwise a given memory section which + * is already offline will be overlooked and can + * be removed completely. Call out such sections. + */ + if (!online_section(ms)) + pr_err("Boot memory [%llx %llx] is offline, can be removed\n", + addr, addr + (1UL << PA_SECTION_SHIFT)); + } + } +} + static int __init prevent_bootmem_remove_init(void) { - return register_memory_notifier(&prevent_bootmem_remove_nb); + int ret = 0; + + if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return ret; + + validate_bootmem_online(); + ret = register_memory_notifier(&prevent_bootmem_remove_nb); + if (ret) + pr_err("%s: Notifier registration failed %d\n", __func__, ret); + + return ret; } -device_initcall(prevent_bootmem_remove_init); +early_initcall(prevent_bootmem_remove_init); #endif diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index c52c1847079c..7c4ef56265ee 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -53,6 +53,15 @@ bool mte_restore_tags(swp_entry_t entry, struct page *page) if (!tags) return false; + page_kasan_tag_reset(page); + /* + * We need smp_wmb() in between setting the flags and clearing the + * tags because if another thread reads page->flags and builds a + * tagged address out of it, there is an actual dependency to the + * memory access, but on the current thread we do not guarantee that + * the new page->flags are visible before the tags were updated. + */ + smp_wmb(); mte_restore_page_tags(page_address(page), tags); return true; diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 1b94f5b82654..92eccaf595c8 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -155,7 +155,7 @@ int set_direct_map_invalid_noflush(struct page *page) .clear_mask = __pgprot(PTE_VALID), }; - if (!rodata_full) + if (!debug_pagealloc_enabled() && !rodata_full) return 0; return apply_to_page_range(&init_mm, @@ -170,7 +170,7 @@ int set_direct_map_default_noflush(struct page *page) .clear_mask = __pgprot(PTE_RDONLY), }; - if (!rodata_full) + if (!debug_pagealloc_enabled() && !rodata_full) return 0; return apply_to_page_range(&init_mm, @@ -178,6 +178,7 @@ int set_direct_map_default_noflush(struct page *page) PAGE_SIZE, change_page_range, &data); } +#ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (!debug_pagealloc_enabled() && !rodata_full) @@ -185,6 +186,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) set_memory_valid((unsigned long)page_address(page), numpages, enable); } +#endif /* CONFIG_DEBUG_PAGEALLOC */ /* * This function is used to determine if a linear map page has been marked as diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 23c326a06b2d..1f7ee8c8b7b8 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -40,9 +40,15 @@ #define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA #ifdef CONFIG_KASAN_SW_TAGS -#define TCR_KASAN_FLAGS TCR_TBI1 +#define TCR_KASAN_SW_FLAGS TCR_TBI1 | TCR_TBID1 #else -#define TCR_KASAN_FLAGS 0 +#define TCR_KASAN_SW_FLAGS 0 +#endif + +#ifdef CONFIG_KASAN_HW_TAGS +#define TCR_KASAN_HW_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 | TCR_TBID1 +#else +#define TCR_KASAN_HW_FLAGS 0 #endif /* @@ -168,7 +174,7 @@ SYM_FUNC_END(cpu_do_resume) .pushsection ".idmap.text", "awx" .macro __idmap_cpu_set_reserved_ttbr1, tmp1, tmp2 - adrp \tmp1, empty_zero_page + adrp \tmp1, reserved_pg_dir phys_to_ttbr \tmp2, \tmp1 offset_ttbr1 \tmp2, \tmp1 msr ttbr1_el1, \tmp2 @@ -427,6 +433,10 @@ SYM_FUNC_START(__cpu_setup) */ mov_q x5, MAIR_EL1_SET #ifdef CONFIG_ARM64_MTE + mte_tcr .req x20 + + mov mte_tcr, #0 + /* * Update MAIR_EL1, GCR_EL1 and TFSR*_EL1 if MTE is supported * (ID_AA64PFR1_EL1[11:8] > 1). @@ -447,6 +457,9 @@ SYM_FUNC_START(__cpu_setup) /* clear any pending tag check faults in TFSR*_EL1 */ msr_s SYS_TFSR_EL1, xzr msr_s SYS_TFSRE0_EL1, xzr + + /* set the TCR_EL1 bits */ + mov_q mte_tcr, TCR_KASAN_HW_FLAGS 1: #endif msr mair_el1, x5 @@ -456,7 +469,11 @@ SYM_FUNC_START(__cpu_setup) */ mov_q x10, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \ TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ - TCR_TBI0 | TCR_A1 | TCR_KASAN_FLAGS + TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS +#ifdef CONFIG_ARM64_MTE + orr x10, x10, mte_tcr + .unreq mte_tcr +#endif tcr_clear_errata_bits x10, x9, x5 #ifdef CONFIG_ARM64_VA_BITS_52 @@ -489,6 +506,6 @@ SYM_FUNC_START(__cpu_setup) /* * Prepare SCTLR */ - mov_q x0, SCTLR_EL1_SET + mov_q x0, INIT_SCTLR_EL1_MMU_ON ret // return to head.S SYM_FUNC_END(__cpu_setup) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 807dc634bbd2..04137a8f3d2d 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -29,7 +29,7 @@ enum address_markers_idx { PAGE_OFFSET_NR = 0, PAGE_END_NR, -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) KASAN_START_NR, #endif }; @@ -37,7 +37,7 @@ enum address_markers_idx { static struct addr_marker address_markers[] = { { PAGE_OFFSET, "Linear Mapping start" }, { 0 /* PAGE_END */, "Linear Mapping end" }, -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) { 0 /* KASAN_SHADOW_START */, "Kasan shadow start" }, { KASAN_SHADOW_END, "Kasan shadow end" }, #endif @@ -383,7 +383,7 @@ void ptdump_check_wx(void) static int ptdump_init(void) { address_markers[PAGE_END_NR].start_address = PAGE_END; -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) address_markers[KASAN_START_NR].start_address = KASAN_SHADOW_START; #endif ptdump_initialize(); |