summaryrefslogtreecommitdiff
path: root/arch/arm64/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/mm')
-rw-r--r--arch/arm64/mm/copypage.c9
-rw-r--r--arch/arm64/mm/fault.c147
-rw-r--r--arch/arm64/mm/init.c125
-rw-r--r--arch/arm64/mm/kasan_init.c19
-rw-r--r--arch/arm64/mm/mmap.c21
-rw-r--r--arch/arm64/mm/mmu.c149
-rw-r--r--arch/arm64/mm/mteswap.c9
-rw-r--r--arch/arm64/mm/pageattr.c6
-rw-r--r--arch/arm64/mm/proc.S27
-rw-r--r--arch/arm64/mm/ptdump.c6
10 files changed, 327 insertions, 191 deletions
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index 70a71f38b6a9..b5447e53cd73 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -23,6 +23,15 @@ void copy_highpage(struct page *to, struct page *from)
if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) {
set_bit(PG_mte_tagged, &to->flags);
+ page_kasan_tag_reset(to);
+ /*
+ * We need smp_wmb() in between setting the flags and clearing the
+ * tags because if another thread reads page->flags and builds a
+ * tagged address out of it, there is an actual dependency to the
+ * memory access, but on the current thread we do not guarantee that
+ * the new page->flags are visible before the tags were updated.
+ */
+ smp_wmb();
mte_copy_page_tags(kto, kfrom);
}
}
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 795d224f184f..3c40da479899 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -14,6 +14,7 @@
#include <linux/mm.h>
#include <linux/hardirq.h>
#include <linux/init.h>
+#include <linux/kasan.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
@@ -33,6 +34,7 @@
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kprobes.h>
+#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
@@ -40,7 +42,7 @@
#include <asm/traps.h>
struct fault_info {
- int (*fn)(unsigned long addr, unsigned int esr,
+ int (*fn)(unsigned long far, unsigned int esr,
struct pt_regs *regs);
int sig;
int code;
@@ -296,6 +298,57 @@ static void die_kernel_fault(const char *msg, unsigned long addr,
do_exit(SIGKILL);
}
+#ifdef CONFIG_KASAN_HW_TAGS
+static void report_tag_fault(unsigned long addr, unsigned int esr,
+ struct pt_regs *regs)
+{
+ bool is_write = ((esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT) != 0;
+
+ /*
+ * SAS bits aren't set for all faults reported in EL1, so we can't
+ * find out access size.
+ */
+ kasan_report(addr, 0, is_write, regs->pc);
+}
+#else
+/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
+static inline void report_tag_fault(unsigned long addr, unsigned int esr,
+ struct pt_regs *regs) { }
+#endif
+
+static void do_tag_recovery(unsigned long addr, unsigned int esr,
+ struct pt_regs *regs)
+{
+ static bool reported;
+
+ if (!READ_ONCE(reported)) {
+ report_tag_fault(addr, esr, regs);
+ WRITE_ONCE(reported, true);
+ }
+
+ /*
+ * Disable MTE Tag Checking on the local CPU for the current EL.
+ * It will be done lazily on the other CPUs when they will hit a
+ * tag fault.
+ */
+ sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, SCTLR_ELx_TCF_NONE);
+ isb();
+}
+
+static bool is_el1_mte_sync_tag_check_fault(unsigned int esr)
+{
+ unsigned int ec = ESR_ELx_EC(esr);
+ unsigned int fsc = esr & ESR_ELx_FSC;
+
+ if (ec != ESR_ELx_EC_DABT_CUR)
+ return false;
+
+ if (fsc == ESR_ELx_FSC_MTE)
+ return true;
+
+ return false;
+}
+
static void __do_kernel_fault(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
@@ -312,6 +365,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
"Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
return;
+ if (is_el1_mte_sync_tag_check_fault(esr)) {
+ do_tag_recovery(addr, esr, regs);
+
+ return;
+ }
+
if (is_el1_permission_fault(addr, esr, regs)) {
if (esr & ESR_ELx_WNR)
msg = "write to read-only memory";
@@ -385,8 +444,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
current->thread.fault_code = esr;
}
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+ struct pt_regs *regs)
{
+ unsigned long addr = untagged_addr(far);
+
/*
* If we are in kernel mode at this point, we have no context to
* handle this fault with.
@@ -395,8 +457,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
const struct fault_info *inf = esr_to_fault_info(esr);
set_thread_esr(addr, esr);
- arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
- inf->name);
+ arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
} else {
__do_kernel_fault(addr, esr, regs);
}
@@ -448,7 +509,7 @@ static bool is_write_abort(unsigned int esr)
return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
struct pt_regs *regs)
{
const struct fault_info *inf;
@@ -456,6 +517,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
vm_fault_t fault;
unsigned long vm_flags = VM_ACCESS_FLAGS;
unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+ unsigned long addr = untagged_addr(far);
if (kprobe_page_fault(regs, esr))
return 0;
@@ -479,11 +541,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
}
if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
- /* regs->orig_addr_limit may be 0 if we entered from EL0 */
- if (regs->orig_addr_limit == KERNEL_DS)
- die_kernel_fault("access to user memory with fs=KERNEL_DS",
- addr, esr, regs);
-
if (is_el1_instruction_abort(esr))
die_kernel_fault("execution of user memory",
addr, esr, regs);
@@ -567,8 +624,7 @@ retry:
* We had some memory, but were unable to successfully fix up
* this page fault.
*/
- arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr,
- inf->name);
+ arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
unsigned int lsb;
@@ -576,8 +632,7 @@ retry:
if (fault & VM_FAULT_HWPOISON_LARGE)
lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
- arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb,
- inf->name);
+ arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
} else {
/*
* Something tried to access memory that isn't in our memory
@@ -585,8 +640,7 @@ retry:
*/
arm64_force_sig_fault(SIGSEGV,
fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
- (void __user *)addr,
- inf->name);
+ far, inf->name);
}
return 0;
@@ -596,33 +650,35 @@ no_context:
return 0;
}
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
unsigned int esr,
struct pt_regs *regs)
{
+ unsigned long addr = untagged_addr(far);
+
if (is_ttbr0_addr(addr))
- return do_page_fault(addr, esr, regs);
+ return do_page_fault(far, esr, regs);
- do_bad_area(addr, esr, regs);
+ do_bad_area(far, esr, regs);
return 0;
}
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
struct pt_regs *regs)
{
- do_bad_area(addr, esr, regs);
+ do_bad_area(far, esr, regs);
return 0;
}
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
{
return 1; /* "fault" */
}
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
{
const struct fault_info *inf;
- void __user *siaddr;
+ unsigned long siaddr;
inf = esr_to_fault_info(esr);
@@ -634,19 +690,30 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
return 0;
}
- if (esr & ESR_ELx_FnV)
- siaddr = NULL;
- else
- siaddr = (void __user *)addr;
+ if (esr & ESR_ELx_FnV) {
+ siaddr = 0;
+ } else {
+ /*
+ * The architecture specifies that the tag bits of FAR_EL1 are
+ * UNKNOWN for synchronous external aborts. Mask them out now
+ * so that userspace doesn't see them.
+ */
+ siaddr = untagged_addr(far);
+ }
arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
return 0;
}
-static int do_tag_check_fault(unsigned long addr, unsigned int esr,
+static int do_tag_check_fault(unsigned long far, unsigned int esr,
struct pt_regs *regs)
{
- do_bad_area(addr, esr, regs);
+ /*
+ * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN for tag
+ * check faults. Mask them out now so that userspace doesn't see them.
+ */
+ far &= (1UL << 60) - 1;
+ do_bad_area(far, esr, regs);
return 0;
}
@@ -717,11 +784,12 @@ static const struct fault_info fault_info[] = {
{ do_bad, SIGKILL, SI_KERNEL, "unknown 63" },
};
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
{
const struct fault_info *inf = esr_to_fault_info(esr);
+ unsigned long addr = untagged_addr(far);
- if (!inf->fn(addr, esr, regs))
+ if (!inf->fn(far, esr, regs))
return;
if (!user_mode(regs)) {
@@ -730,8 +798,12 @@ void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
show_pte(addr);
}
- arm64_notify_die(inf->name, regs,
- inf->sig, inf->code, (void __user *)addr, esr);
+ /*
+ * At this point we have an unrecognized fault type whose tag bits may
+ * have been defined as UNKNOWN. Therefore we only expose the untagged
+ * address to the signal handler.
+ */
+ arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
}
NOKPROBE_SYMBOL(do_mem_abort);
@@ -744,8 +816,8 @@ NOKPROBE_SYMBOL(do_el0_irq_bp_hardening);
void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
- arm64_notify_die("SP/PC alignment exception", regs,
- SIGBUS, BUS_ADRALN, (void __user *)addr, esr);
+ arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
+ addr, esr);
}
NOKPROBE_SYMBOL(do_sp_pc_abort);
@@ -846,8 +918,7 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
arm64_apply_bp_hardening();
if (inf->fn(addr_if_watchpoint, esr, regs)) {
- arm64_notify_die(inf->name, regs,
- inf->sig, inf->code, (void __user *)pc, esr);
+ arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
}
debug_exception_exit(regs);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 095540667f0f..7deddf56f7c3 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -29,6 +29,7 @@
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/hugetlb.h>
+#include <linux/acpi_iort.h>
#include <asm/boot.h>
#include <asm/fixmap.h>
@@ -42,8 +43,6 @@
#include <asm/tlb.h>
#include <asm/alternative.h>
-#define ARM64_ZONE_DMA_BITS 30
-
/*
* We need to be able to catch inadvertent references to memstart_addr
* that occur (potentially in generic code) before arm64_memblock_init()
@@ -60,7 +59,7 @@ EXPORT_SYMBOL(memstart_addr);
* bit addressable memory area.
*/
phys_addr_t arm64_dma_phys_limit __ro_after_init;
-static phys_addr_t arm64_dma32_phys_limit __ro_after_init;
+phys_addr_t arm64_dma32_phys_limit __ro_after_init;
#ifdef CONFIG_KEXEC_CORE
/*
@@ -175,21 +174,34 @@ static void __init reserve_elfcorehdr(void)
#endif /* CONFIG_CRASH_DUMP */
/*
- * Return the maximum physical address for a zone with a given address size
- * limit. It currently assumes that for memory starting above 4G, 32-bit
- * devices will use a DMA offset.
+ * Return the maximum physical address for a zone accessible by the given bits
+ * limit. If DRAM starts above 32-bit, expand the zone to the maximum
+ * available memory, otherwise cap it at 32-bit.
*/
static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
{
- phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, zone_bits);
- return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM());
+ phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
+ phys_addr_t phys_start = memblock_start_of_DRAM();
+
+ if (phys_start > U32_MAX)
+ zone_mask = PHYS_ADDR_MAX;
+ else if (phys_start > zone_mask)
+ zone_mask = U32_MAX;
+
+ return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
}
static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
+ unsigned int __maybe_unused acpi_zone_dma_bits;
+ unsigned int __maybe_unused dt_zone_dma_bits;
#ifdef CONFIG_ZONE_DMA
+ acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
+ dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
+ zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
+ arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
@@ -269,7 +281,7 @@ static void __init fdt_enforce_memory_region(void)
void __init arm64_memblock_init(void)
{
- const s64 linear_region_size = BIT(vabits_actual - 1);
+ const s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);
/* Handle linux,usable-memory-range property */
fdt_enforce_memory_region();
@@ -283,6 +295,9 @@ void __init arm64_memblock_init(void)
memstart_addr = round_down(memblock_start_of_DRAM(),
ARM64_MEMSTART_ALIGN);
+ if ((memblock_end_of_DRAM() - memstart_addr) > linear_region_size)
+ pr_warn("Memory doesn't fit in the linear mapping, VA_BITS too small\n");
+
/*
* Remove the memory that we will not be able to cover with the
* linear mapping. Take care not to clip the kernel which may be
@@ -348,15 +363,18 @@ void __init arm64_memblock_init(void)
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
extern u16 memstart_offset_seed;
- u64 range = linear_region_size -
- (memblock_end_of_DRAM() - memblock_start_of_DRAM());
+ u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
+ int parange = cpuid_feature_extract_unsigned_field(
+ mmfr0, ID_AA64MMFR0_PARANGE_SHIFT);
+ s64 range = linear_region_size -
+ BIT(id_aa64mmfr0_parange_to_phys_shift(parange));
/*
* If the size of the linear region exceeds, by a sufficient
- * margin, the size of the region that the available physical
- * memory spans, randomize the linear region as well.
+ * margin, the size of the region that the physical memory can
+ * span, randomize the linear region as well.
*/
- if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
+ if (memstart_offset_seed > 0 && range >= (s64)ARM64_MEMSTART_ALIGN) {
range /= ARM64_MEMSTART_ALIGN;
memstart_addr -= ARM64_MEMSTART_ALIGN *
((range * memstart_offset_seed) >> 16);
@@ -367,7 +385,7 @@ void __init arm64_memblock_init(void)
* Register the kernel text, kernel data, initrd, and initial
* pagetables with memblock.
*/
- memblock_reserve(__pa_symbol(_text), _end - _text);
+ memblock_reserve(__pa_symbol(_stext), _end - _stext);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
/* the generic initrd code expects virtual addresses */
initrd_start = __phys_to_virt(phys_initrd_start);
@@ -376,18 +394,11 @@ void __init arm64_memblock_init(void)
early_init_fdt_scan_reserved_mem();
- if (IS_ENABLED(CONFIG_ZONE_DMA)) {
- zone_dma_bits = ARM64_ZONE_DMA_BITS;
- arm64_dma_phys_limit = max_zone_phys(ARM64_ZONE_DMA_BITS);
- }
-
if (IS_ENABLED(CONFIG_ZONE_DMA32))
arm64_dma32_phys_limit = max_zone_phys(32);
else
arm64_dma32_phys_limit = PHYS_MASK + 1;
- reserve_crashkernel();
-
reserve_elfcorehdr();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
@@ -427,73 +438,14 @@ void __init bootmem_init(void)
sparse_init();
zone_sizes_init(min, max);
- memblock_dump_all();
-}
-
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static inline void free_memmap(unsigned long start_pfn, unsigned long end_pfn)
-{
- struct page *start_pg, *end_pg;
- unsigned long pg, pgend;
-
- /*
- * Convert start_pfn/end_pfn to a struct page pointer.
- */
- start_pg = pfn_to_page(start_pfn - 1) + 1;
- end_pg = pfn_to_page(end_pfn - 1) + 1;
-
/*
- * Convert to physical addresses, and round start upwards and end
- * downwards.
+ * request_standard_resources() depends on crashkernel's memory being
+ * reserved, so do it here.
*/
- pg = (unsigned long)PAGE_ALIGN(__pa(start_pg));
- pgend = (unsigned long)__pa(end_pg) & PAGE_MASK;
-
- /*
- * If there are free pages between these, free the section of the
- * memmap array.
- */
- if (pg < pgend)
- memblock_free(pg, pgend - pg);
-}
-
-/*
- * The mem_map array can get very big. Free the unused area of the memory map.
- */
-static void __init free_unused_memmap(void)
-{
- unsigned long start, end, prev_end = 0;
- int i;
-
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
-#ifdef CONFIG_SPARSEMEM
- /*
- * Take care not to free memmap entries that don't exist due
- * to SPARSEMEM sections which aren't present.
- */
- start = min(start, ALIGN(prev_end, PAGES_PER_SECTION));
-#endif
- /*
- * If we had a previous bank, and there is a space between the
- * current bank and the previous, free it.
- */
- if (prev_end && prev_end < start)
- free_memmap(prev_end, start);
-
- /*
- * Align up here since the VM subsystem insists that the
- * memmap entries are valid from the bank end aligned to
- * MAX_ORDER_NR_PAGES.
- */
- prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
- }
+ reserve_crashkernel();
-#ifdef CONFIG_SPARSEMEM
- if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION))
- free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION));
-#endif
+ memblock_dump_all();
}
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
/*
* mem_init() marks the free areas in the mem_map and tells us how much memory
@@ -510,9 +462,6 @@ void __init mem_init(void)
set_max_mapnr(max_pfn - PHYS_PFN_OFFSET);
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
- free_unused_memmap();
-#endif
/* this will put all unused low memory onto the freelists */
memblock_free_all();
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index b24e43d20667..d8e66c78440e 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -21,6 +21,8 @@
#include <asm/sections.h>
#include <asm/tlbflush.h>
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+
static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
/*
@@ -208,7 +210,7 @@ static void __init clear_pgds(unsigned long start,
set_pgd(pgd_offset_k(start), __pgd(0));
}
-void __init kasan_init(void)
+static void __init kasan_init_shadow(void)
{
u64 kimg_shadow_start, kimg_shadow_end;
u64 mod_shadow_start, mod_shadow_end;
@@ -269,8 +271,21 @@ void __init kasan_init(void)
memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE);
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
+}
- /* At this point kasan is fully initialized. Enable error messages */
+static void __init kasan_init_depth(void)
+{
init_task.kasan_depth = 0;
+}
+
+void __init kasan_init(void)
+{
+ kasan_init_shadow();
+ kasan_init_depth();
+#if defined(CONFIG_KASAN_GENERIC)
+ /* CONFIG_KASAN_SW_TAGS also requires kasan_init_sw_tags(). */
pr_info("KernelAddressSanitizer initialized\n");
+#endif
}
+
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 3028bacbc4e9..07937b49cb88 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -47,24 +47,3 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
{
return !(((pfn << PAGE_SHIFT) + size) & ~PHYS_MASK);
}
-
-#ifdef CONFIG_STRICT_DEVMEM
-
-#include <linux/ioport.h>
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number. We mimic x86 here by
- * disallowing access to system RAM as well as device-exclusive MMIO regions.
- * This effectively disable read()/write() on /dev/mem.
- */
-int devmem_is_allowed(unsigned long pfn)
-{
- if (iomem_is_exclusive(pfn << PAGE_SHIFT))
- return 0;
- if (!page_is_ram(pfn))
- return 1;
- return 0;
-}
-
-#endif
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ca692a815731..ae0c3d023824 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -464,20 +464,35 @@ void __init mark_linear_text_alias_ro(void)
/*
* Remove the write permissions from the linear alias of .text/.rodata
*/
- update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
- (unsigned long)__init_begin - (unsigned long)_text,
+ update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
+ (unsigned long)__init_begin - (unsigned long)_stext,
PAGE_KERNEL_RO);
}
+static bool crash_mem_map __initdata;
+
+static int __init enable_crash_mem_map(char *arg)
+{
+ /*
+ * Proper parameter parsing is done by reserve_crashkernel(). We only
+ * need to know if the linear map has to avoid block mappings so that
+ * the crashkernel reservations can be unmapped later.
+ */
+ crash_mem_map = true;
+
+ return 0;
+}
+early_param("crashkernel", enable_crash_mem_map);
+
static void __init map_mem(pgd_t *pgdp)
{
- phys_addr_t kernel_start = __pa_symbol(_text);
+ phys_addr_t kernel_start = __pa_symbol(_stext);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
phys_addr_t start, end;
int flags = 0;
u64 i;
- if (rodata_full || debug_pagealloc_enabled())
+ if (rodata_full || crash_mem_map || debug_pagealloc_enabled())
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
/*
@@ -487,11 +502,6 @@ static void __init map_mem(pgd_t *pgdp)
* the following for-loop
*/
memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
-#ifdef CONFIG_KEXEC_CORE
- if (crashk_res.end)
- memblock_mark_nomap(crashk_res.start,
- resource_size(&crashk_res));
-#endif
/* map all the memory banks */
for_each_mem_range(i, &start, &end) {
@@ -506,7 +516,7 @@ static void __init map_mem(pgd_t *pgdp)
}
/*
- * Map the linear alias of the [_text, __init_begin) interval
+ * Map the linear alias of the [_stext, __init_begin) interval
* as non-executable now, and remove the write permission in
* mark_linear_text_alias_ro() below (which will be called after
* alternative patching has completed). This makes the contents
@@ -518,21 +528,6 @@ static void __init map_mem(pgd_t *pgdp)
__map_memblock(pgdp, kernel_start, kernel_end,
PAGE_KERNEL, NO_CONT_MAPPINGS);
memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
-
-#ifdef CONFIG_KEXEC_CORE
- /*
- * Use page-level mappings here so that we can shrink the region
- * in page granularity and put back unused memory to buddy system
- * through /sys/kernel/kexec_crash_size interface.
- */
- if (crashk_res.end) {
- __map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
- PAGE_KERNEL,
- NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
- memblock_clear_nomap(crashk_res.start,
- resource_size(&crashk_res));
- }
-#endif
}
void mark_rodata_ro(void)
@@ -665,7 +660,7 @@ static void __init map_kernel(pgd_t *pgdp)
* Only rodata will be remapped with different permissions later on,
* all other segments are allowed to use contiguous mappings.
*/
- map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0,
+ map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0,
VM_NO_GUARD);
map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
&vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
@@ -1132,8 +1127,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
void *p = NULL;
p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
- if (!p)
- return -ENOMEM;
+ if (!p) {
+ if (vmemmap_populate_basepages(addr, next, node, altmap))
+ return -ENOMEM;
+ continue;
+ }
pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
} else
@@ -1510,13 +1508,43 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
unsigned long pfn = arg->start_pfn;
- if (action != MEM_GOING_OFFLINE)
+ if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
return NOTIFY_OK;
for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ unsigned long start = PFN_PHYS(pfn);
+ unsigned long end = start + (1UL << PA_SECTION_SHIFT);
+
ms = __pfn_to_section(pfn);
- if (early_section(ms))
+ if (!early_section(ms))
+ continue;
+
+ if (action == MEM_GOING_OFFLINE) {
+ /*
+ * Boot memory removal is not supported. Prevent
+ * it via blocking any attempted offline request
+ * for the boot memory and just report it.
+ */
+ pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end);
return NOTIFY_BAD;
+ } else if (action == MEM_OFFLINE) {
+ /*
+ * This should have never happened. Boot memory
+ * offlining should have been prevented by this
+ * very notifier. Probably some memory removal
+ * procedure might have changed which would then
+ * require further debug.
+ */
+ pr_err("Boot memory [%lx %lx] offlined\n", start, end);
+
+ /*
+ * Core memory hotplug does not process a return
+ * code from the notifier for MEM_OFFLINE events.
+ * The error condition has been reported. Return
+ * from here as if ignored.
+ */
+ return NOTIFY_DONE;
+ }
}
return NOTIFY_OK;
}
@@ -1525,9 +1553,66 @@ static struct notifier_block prevent_bootmem_remove_nb = {
.notifier_call = prevent_bootmem_remove_notifier,
};
+/*
+ * This ensures that boot memory sections on the platform are online
+ * from early boot. Memory sections could not be prevented from being
+ * offlined, unless for some reason they are not online to begin with.
+ * This helps validate the basic assumption on which the above memory
+ * event notifier works to prevent boot memory section offlining and
+ * its possible removal.
+ */
+static void validate_bootmem_online(void)
+{
+ phys_addr_t start, end, addr;
+ struct mem_section *ms;
+ u64 i;
+
+ /*
+ * Scanning across all memblock might be expensive
+ * on some big memory systems. Hence enable this
+ * validation only with DEBUG_VM.
+ */
+ if (!IS_ENABLED(CONFIG_DEBUG_VM))
+ return;
+
+ for_each_mem_range(i, &start, &end) {
+ for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) {
+ ms = __pfn_to_section(PHYS_PFN(addr));
+
+ /*
+ * All memory ranges in the system at this point
+ * should have been marked as early sections.
+ */
+ WARN_ON(!early_section(ms));
+
+ /*
+ * Memory notifier mechanism here to prevent boot
+ * memory offlining depends on the fact that each
+ * early section memory on the system is initially
+ * online. Otherwise a given memory section which
+ * is already offline will be overlooked and can
+ * be removed completely. Call out such sections.
+ */
+ if (!online_section(ms))
+ pr_err("Boot memory [%llx %llx] is offline, can be removed\n",
+ addr, addr + (1UL << PA_SECTION_SHIFT));
+ }
+ }
+}
+
static int __init prevent_bootmem_remove_init(void)
{
- return register_memory_notifier(&prevent_bootmem_remove_nb);
+ int ret = 0;
+
+ if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return ret;
+
+ validate_bootmem_online();
+ ret = register_memory_notifier(&prevent_bootmem_remove_nb);
+ if (ret)
+ pr_err("%s: Notifier registration failed %d\n", __func__, ret);
+
+ return ret;
}
-device_initcall(prevent_bootmem_remove_init);
+early_initcall(prevent_bootmem_remove_init);
#endif
diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
index c52c1847079c..7c4ef56265ee 100644
--- a/arch/arm64/mm/mteswap.c
+++ b/arch/arm64/mm/mteswap.c
@@ -53,6 +53,15 @@ bool mte_restore_tags(swp_entry_t entry, struct page *page)
if (!tags)
return false;
+ page_kasan_tag_reset(page);
+ /*
+ * We need smp_wmb() in between setting the flags and clearing the
+ * tags because if another thread reads page->flags and builds a
+ * tagged address out of it, there is an actual dependency to the
+ * memory access, but on the current thread we do not guarantee that
+ * the new page->flags are visible before the tags were updated.
+ */
+ smp_wmb();
mte_restore_page_tags(page_address(page), tags);
return true;
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 1b94f5b82654..92eccaf595c8 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -155,7 +155,7 @@ int set_direct_map_invalid_noflush(struct page *page)
.clear_mask = __pgprot(PTE_VALID),
};
- if (!rodata_full)
+ if (!debug_pagealloc_enabled() && !rodata_full)
return 0;
return apply_to_page_range(&init_mm,
@@ -170,7 +170,7 @@ int set_direct_map_default_noflush(struct page *page)
.clear_mask = __pgprot(PTE_RDONLY),
};
- if (!rodata_full)
+ if (!debug_pagealloc_enabled() && !rodata_full)
return 0;
return apply_to_page_range(&init_mm,
@@ -178,6 +178,7 @@ int set_direct_map_default_noflush(struct page *page)
PAGE_SIZE, change_page_range, &data);
}
+#ifdef CONFIG_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
if (!debug_pagealloc_enabled() && !rodata_full)
@@ -185,6 +186,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
set_memory_valid((unsigned long)page_address(page), numpages, enable);
}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
/*
* This function is used to determine if a linear map page has been marked as
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 23c326a06b2d..1f7ee8c8b7b8 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -40,9 +40,15 @@
#define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA
#ifdef CONFIG_KASAN_SW_TAGS
-#define TCR_KASAN_FLAGS TCR_TBI1
+#define TCR_KASAN_SW_FLAGS TCR_TBI1 | TCR_TBID1
#else
-#define TCR_KASAN_FLAGS 0
+#define TCR_KASAN_SW_FLAGS 0
+#endif
+
+#ifdef CONFIG_KASAN_HW_TAGS
+#define TCR_KASAN_HW_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 | TCR_TBID1
+#else
+#define TCR_KASAN_HW_FLAGS 0
#endif
/*
@@ -168,7 +174,7 @@ SYM_FUNC_END(cpu_do_resume)
.pushsection ".idmap.text", "awx"
.macro __idmap_cpu_set_reserved_ttbr1, tmp1, tmp2
- adrp \tmp1, empty_zero_page
+ adrp \tmp1, reserved_pg_dir
phys_to_ttbr \tmp2, \tmp1
offset_ttbr1 \tmp2, \tmp1
msr ttbr1_el1, \tmp2
@@ -427,6 +433,10 @@ SYM_FUNC_START(__cpu_setup)
*/
mov_q x5, MAIR_EL1_SET
#ifdef CONFIG_ARM64_MTE
+ mte_tcr .req x20
+
+ mov mte_tcr, #0
+
/*
* Update MAIR_EL1, GCR_EL1 and TFSR*_EL1 if MTE is supported
* (ID_AA64PFR1_EL1[11:8] > 1).
@@ -447,6 +457,9 @@ SYM_FUNC_START(__cpu_setup)
/* clear any pending tag check faults in TFSR*_EL1 */
msr_s SYS_TFSR_EL1, xzr
msr_s SYS_TFSRE0_EL1, xzr
+
+ /* set the TCR_EL1 bits */
+ mov_q mte_tcr, TCR_KASAN_HW_FLAGS
1:
#endif
msr mair_el1, x5
@@ -456,7 +469,11 @@ SYM_FUNC_START(__cpu_setup)
*/
mov_q x10, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
- TCR_TBI0 | TCR_A1 | TCR_KASAN_FLAGS
+ TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS
+#ifdef CONFIG_ARM64_MTE
+ orr x10, x10, mte_tcr
+ .unreq mte_tcr
+#endif
tcr_clear_errata_bits x10, x9, x5
#ifdef CONFIG_ARM64_VA_BITS_52
@@ -489,6 +506,6 @@ SYM_FUNC_START(__cpu_setup)
/*
* Prepare SCTLR
*/
- mov_q x0, SCTLR_EL1_SET
+ mov_q x0, INIT_SCTLR_EL1_MMU_ON
ret // return to head.S
SYM_FUNC_END(__cpu_setup)
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 807dc634bbd2..04137a8f3d2d 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -29,7 +29,7 @@
enum address_markers_idx {
PAGE_OFFSET_NR = 0,
PAGE_END_NR,
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
KASAN_START_NR,
#endif
};
@@ -37,7 +37,7 @@ enum address_markers_idx {
static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Linear Mapping start" },
{ 0 /* PAGE_END */, "Linear Mapping end" },
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
{ 0 /* KASAN_SHADOW_START */, "Kasan shadow start" },
{ KASAN_SHADOW_END, "Kasan shadow end" },
#endif
@@ -383,7 +383,7 @@ void ptdump_check_wx(void)
static int ptdump_init(void)
{
address_markers[PAGE_END_NR].start_address = PAGE_END;
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
address_markers[KASAN_START_NR].start_address = KASAN_SHADOW_START;
#endif
ptdump_initialize();