diff options
Diffstat (limited to 'arch/arm64/kernel')
110 files changed, 15505 insertions, 10407 deletions
diff --git a/arch/arm64/kernel/.gitignore b/arch/arm64/kernel/.gitignore index c5f676c3c224..bbb90f92d051 100644 --- a/arch/arm64/kernel/.gitignore +++ b/arch/arm64/kernel/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vmlinux.lds diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index fc6488660f64..e5d03a7039b4 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -3,56 +3,63 @@ # Makefile for the linux kernel. # -CPPFLAGS_vmlinux.lds := -DTEXT_OFFSET=$(TEXT_OFFSET) -AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) CFLAGS_armv8_deprecated.o := -I$(src) CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_insn.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) +# Remove stack protector to avoid triggering unneeded stack canary +# checks due to randomize_kstack_offset. +CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong +CFLAGS_syscall.o += -fno-stack-protector + +# When KASAN is enabled, a stack trace is recorded for every alloc/free, which +# can significantly impact performance. Avoid instrumenting the stack trace +# collection code to minimize this impact. +KASAN_SANITIZE_stacktrace.o := n + +# It's not safe to invoke KCOV when portions of the kernel environment aren't +# available or are out-of-sync with HW state. Since `noinstr` doesn't always +# inhibit KCOV instrumentation, disable it for the entire compilation unit. +KCOV_INSTRUMENT_entry-common.o := n +KCOV_INSTRUMENT_idle.o := n + # Object file lists. obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ entry-common.o entry-fpsimd.o process.o ptrace.o \ setup.o signal.o sys.o stacktrace.o time.o traps.o \ - io.o vdso.o hyp-stub.o psci.o cpu_ops.o insn.o \ + io.o vdso.o hyp-stub.o psci.o cpu_ops.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o - -extra-$(CONFIG_EFI) := efi-entry.o - -OBJCOPYFLAGS := --prefix-symbols=__efistub_ -$(obj)/%.stub.o: $(obj)/%.o FORCE - $(call if_changed,objcopy) + syscall.o proton-pack.o idreg-override.o idle.o \ + patching.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o -ifneq ($(CONFIG_COMPAT_VDSO), y) obj-$(CONFIG_COMPAT) += sigreturn32.o -endif +obj-$(CONFIG_COMPAT_ALIGNMENT_FIXUPS) += compat_alignment.o obj-$(CONFIG_KUSER_HELPERS) += kuser32.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o -obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o +obj-$(CONFIG_MODULES) += module.o module-plts.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o -obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_CPU_PM) += sleep.o suspend.o obj-$(CONFIG_CPU_IDLE) += cpuidle.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o \ - efi-rt-wrapper.o +obj-$(CONFIG_EFI) += efi.o efi-rt-wrapper.o obj-$(CONFIG_PCI) += pci.o obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-$(CONFIG_PARAVIRT) += paravirt.o -obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o pi/ obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o +obj-$(CONFIG_ELF_CORE) += elfcore.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ cpu-reset.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o @@ -61,14 +68,30 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o -obj-$(CONFIG_ARM64_SSBD) += ssbd.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_ARM64_MTE) += mte.o +obj-y += vdso-wrap.o +obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o +obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.o + +# We need to prevent the SCS patching code from patching itself. Using +# -mbranch-protection=none here to avoid the patchable PAC opcodes from being +# generated triggers an issue with full LTO on Clang, which stops emitting PAC +# instructions altogether. So instead, omit the unwind tables used by the +# patching code, so it will not be able to locate its own PAC instructions. +CFLAGS_patch-scs.o += -fno-asynchronous-unwind-tables -fno-unwind-tables -obj-y += vdso/ probes/ -obj-$(CONFIG_COMPAT_VDSO) += vdso32/ -head-y := head.o -extra-y += $(head-y) vmlinux.lds +# Force dependency (vdso*-wrap.S includes vdso.so through incbin) +$(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so +$(obj)/vdso32-wrap.o: $(obj)/vdso32/vdso.so + +obj-y += probes/ +obj-y += head.o +extra-y += vmlinux.lds ifeq ($(CONFIG_DEBUG_EFI),y) AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\"" endif + +# for cleaning +subdir- += vdso vdso32 diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 3a58e9db5cfe..dba8fcec7f33 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -13,22 +13,25 @@ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/acpi.h> +#include <linux/arm-smccc.h> #include <linux/cpumask.h> #include <linux/efi.h> #include <linux/efi-bgrt.h> #include <linux/init.h> #include <linux/irq.h> #include <linux/irqdomain.h> +#include <linux/irq_work.h> #include <linux/memblock.h> #include <linux/of_fdt.h> +#include <linux/libfdt.h> #include <linux/smp.h> #include <linux/serial_core.h> +#include <linux/pgtable.h> #include <acpi/ghes.h> #include <asm/cputype.h> #include <asm/cpu_ops.h> #include <asm/daifflags.h> -#include <asm/pgtable.h> #include <asm/smp_plat.h> int acpi_noirq = 1; /* skip ACPI IRQ initialization */ @@ -61,29 +64,22 @@ static int __init parse_acpi(char *arg) } early_param("acpi", parse_acpi); -static int __init dt_scan_depth1_nodes(unsigned long node, - const char *uname, int depth, - void *data) +static bool __init dt_is_stub(void) { - /* - * Ignore anything not directly under the root node; we'll - * catch its parent instead. - */ - if (depth != 1) - return 0; + int node; - if (strcmp(uname, "chosen") == 0) - return 0; + fdt_for_each_subnode(node, initial_boot_params, 0) { + const char *name = fdt_get_name(initial_boot_params, node, NULL); + if (strcmp(name, "chosen") == 0) + continue; + if (strcmp(name, "hypervisor") == 0 && + of_flat_dt_is_compatible(node, "xen,xen")) + continue; - if (strcmp(uname, "hypervisor") == 0 && - of_flat_dt_is_compatible(node, "xen,xen")) - return 0; + return false; + } - /* - * This node at depth 1 is neither a chosen node nor a xen node, - * which we do not expect. - */ - return 1; + return true; } /* @@ -204,8 +200,7 @@ void __init acpi_boot_table_init(void) * and ACPI has not been [force] enabled (acpi=on|force) */ if (param_acpi_off || - (!param_acpi_on && !param_acpi_force && - of_scan_flat_dt(dt_scan_depth1_nodes, NULL))) + (!param_acpi_on && !param_acpi_force && !dt_is_stub())) goto done; /* @@ -238,6 +233,18 @@ done: } } +static pgprot_t __acpi_get_writethrough_mem_attribute(void) +{ + /* + * Although UEFI specifies the use of Normal Write-through for + * EFI_MEMORY_WT, it is seldom used in practice and not implemented + * by most (all?) CPUs. Rather than allocate a MAIR just for this + * purpose, emit a warning and use Normal Non-cacheable instead. + */ + pr_warn_once("No MAIR allocation for EFI_MEMORY_WT; treating as Normal Non-cacheable\n"); + return __pgprot(PROT_NORMAL_NC); +} + pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) { /* @@ -245,7 +252,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) * types" of UEFI 2.5 section 2.3.6.1, each EFI memory type is * mapped to a corresponding MAIR attribute encoding. * The EFI memory attribute advises all possible capabilities - * of a memory region. We use the most efficient capability. + * of a memory region. */ u64 attr; @@ -253,13 +260,101 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) attr = efi_mem_attributes(addr); if (attr & EFI_MEMORY_WB) return PAGE_KERNEL; - if (attr & EFI_MEMORY_WT) - return __pgprot(PROT_NORMAL_WT); if (attr & EFI_MEMORY_WC) return __pgprot(PROT_NORMAL_NC); + if (attr & EFI_MEMORY_WT) + return __acpi_get_writethrough_mem_attribute(); return __pgprot(PROT_DEVICE_nGnRnE); } +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + efi_memory_desc_t *md, *region = NULL; + pgprot_t prot; + + if (WARN_ON_ONCE(!efi_enabled(EFI_MEMMAP))) + return NULL; + + for_each_efi_memory_desc(md) { + u64 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + + if (phys < md->phys_addr || phys >= end) + continue; + + if (phys + size > end) { + pr_warn(FW_BUG "requested region covers multiple EFI memory regions\n"); + return NULL; + } + region = md; + break; + } + + /* + * It is fine for AML to remap regions that are not represented in the + * EFI memory map at all, as it only describes normal memory, and MMIO + * regions that require a virtual mapping to make them accessible to + * the EFI runtime services. + */ + prot = __pgprot(PROT_DEVICE_nGnRnE); + if (region) { + switch (region->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: + if (memblock_is_map_memory(phys) || + !memblock_is_region_memory(phys, size)) { + pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys); + return NULL; + } + /* + * Mapping kernel memory is permitted if the region in + * question is covered by a single memblock with the + * NOMAP attribute set: this enables the use of ACPI + * table overrides passed via initramfs, which are + * reserved in memory using arch_reserve_mem_area() + * below. As this particular use case only requires + * read access, fall through to the R/O mapping case. + */ + fallthrough; + + case EFI_RUNTIME_SERVICES_CODE: + /* + * This would be unusual, but not problematic per se, + * as long as we take care not to create a writable + * mapping for executable code. + */ + prot = PAGE_KERNEL_RO; + break; + + case EFI_ACPI_RECLAIM_MEMORY: + /* + * ACPI reclaim memory is used to pass firmware tables + * and other data that is intended for consumption by + * the OS only, which may decide it wants to reclaim + * that memory and use it for something else. We never + * do that, but we usually add it to the linear map + * anyway, in which case we should use the existing + * mapping. + */ + if (memblock_is_map_memory(phys)) + return (void __iomem *)__phys_to_virt(phys); + fallthrough; + + default: + if (region->attribute & EFI_MEMORY_WB) + prot = PAGE_KERNEL; + else if (region->attribute & EFI_MEMORY_WC) + prot = __pgprot(PROT_NORMAL_NC); + else if (region->attribute & EFI_MEMORY_WT) + prot = __acpi_get_writethrough_mem_attribute(); + } + } + return ioremap_prot(phys, size, pgprot_val(prot)); +} + /* * Claim Synchronous External Aborts as a firmware first notification. * @@ -269,12 +364,19 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) int apei_claim_sea(struct pt_regs *regs) { int err = -ENOENT; + bool return_to_irqs_enabled; unsigned long current_flags; if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES)) return err; - current_flags = arch_local_save_flags(); + current_flags = local_daif_save_flags(); + + /* current_flags isn't useful here as daif doesn't tell us about pNMI */ + return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags()); + + if (regs) + return_to_irqs_enabled = interrupts_enabled(regs); /* * SEA can interrupt SError, mask it and describe this as an NMI so @@ -284,7 +386,134 @@ int apei_claim_sea(struct pt_regs *regs) nmi_enter(); err = ghes_notify_sea(); nmi_exit(); + + /* + * APEI NMI-like notifications are deferred to irq_work. Unless + * we interrupted irqs-masked code, we can do that now. + */ + if (!err) { + if (return_to_irqs_enabled) { + local_daif_restore(DAIF_PROCCTX_NOIRQ); + __irq_enter(); + irq_work_run(); + __irq_exit(); + } else { + pr_warn_ratelimited("APEI work queued but not completed"); + err = -EINPROGRESS; + } + } + local_daif_restore(current_flags); return err; } + +void arch_reserve_mem_area(acpi_physical_address addr, size_t size) +{ + memblock_mark_nomap(addr, size); +} + +#ifdef CONFIG_ACPI_FFH +/* + * Implements ARM64 specific callbacks to support ACPI FFH Operation Region as + * specified in https://developer.arm.com/docs/den0048/latest + */ +struct acpi_ffh_data { + struct acpi_ffh_info info; + void (*invoke_ffh_fn)(unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3, + unsigned long a4, unsigned long a5, + unsigned long a6, unsigned long a7, + struct arm_smccc_res *args, + struct arm_smccc_quirk *res); + void (*invoke_ffh64_fn)(const struct arm_smccc_1_2_regs *args, + struct arm_smccc_1_2_regs *res); +}; + +int acpi_ffh_address_space_arch_setup(void *handler_ctxt, void **region_ctxt) +{ + enum arm_smccc_conduit conduit; + struct acpi_ffh_data *ffh_ctxt; + + if (arm_smccc_get_version() < ARM_SMCCC_VERSION_1_2) + return -EOPNOTSUPP; + + conduit = arm_smccc_1_1_get_conduit(); + if (conduit == SMCCC_CONDUIT_NONE) { + pr_err("%s: invalid SMCCC conduit\n", __func__); + return -EOPNOTSUPP; + } + + ffh_ctxt = kzalloc(sizeof(*ffh_ctxt), GFP_KERNEL); + if (!ffh_ctxt) + return -ENOMEM; + + if (conduit == SMCCC_CONDUIT_SMC) { + ffh_ctxt->invoke_ffh_fn = __arm_smccc_smc; + ffh_ctxt->invoke_ffh64_fn = arm_smccc_1_2_smc; + } else { + ffh_ctxt->invoke_ffh_fn = __arm_smccc_hvc; + ffh_ctxt->invoke_ffh64_fn = arm_smccc_1_2_hvc; + } + + memcpy(ffh_ctxt, handler_ctxt, sizeof(ffh_ctxt->info)); + + *region_ctxt = ffh_ctxt; + return AE_OK; +} + +static bool acpi_ffh_smccc_owner_allowed(u32 fid) +{ + int owner = ARM_SMCCC_OWNER_NUM(fid); + + if (owner == ARM_SMCCC_OWNER_STANDARD || + owner == ARM_SMCCC_OWNER_SIP || owner == ARM_SMCCC_OWNER_OEM) + return true; + + return false; +} + +int acpi_ffh_address_space_arch_handler(acpi_integer *value, void *region_context) +{ + int ret = 0; + struct acpi_ffh_data *ffh_ctxt = region_context; + + if (ffh_ctxt->info.offset == 0) { + /* SMC/HVC 32bit call */ + struct arm_smccc_res res; + u32 a[8] = { 0 }, *ptr = (u32 *)value; + + if (!ARM_SMCCC_IS_FAST_CALL(*ptr) || ARM_SMCCC_IS_64(*ptr) || + !acpi_ffh_smccc_owner_allowed(*ptr) || + ffh_ctxt->info.length > 32) { + ret = AE_ERROR; + } else { + int idx, len = ffh_ctxt->info.length >> 2; + + for (idx = 0; idx < len; idx++) + a[idx] = *(ptr + idx); + + ffh_ctxt->invoke_ffh_fn(a[0], a[1], a[2], a[3], a[4], + a[5], a[6], a[7], &res, NULL); + memcpy(value, &res, sizeof(res)); + } + + } else if (ffh_ctxt->info.offset == 1) { + /* SMC/HVC 64bit call */ + struct arm_smccc_1_2_regs *r = (struct arm_smccc_1_2_regs *)value; + + if (!ARM_SMCCC_IS_FAST_CALL(r->a0) || !ARM_SMCCC_IS_64(r->a0) || + !acpi_ffh_smccc_owner_allowed(r->a0) || + ffh_ctxt->info.length > sizeof(*r)) { + ret = AE_ERROR; + } else { + ffh_ctxt->invoke_ffh64_fn(r, r); + memcpy(value, r, ffh_ctxt->info.length); + } + } else { + ret = AE_ERROR; + } + + return ret; +} +#endif /* CONFIG_ACPI_FFH */ diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index 7ff800045434..e51535a5f939 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -109,7 +109,7 @@ void __init acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) pxm = pa->proximity_domain; node = acpi_map_pxm_to_node(pxm); - if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { + if (node == NUMA_NO_NODE) { pr_err("SRAT: Too many proximity domains %d\n", pxm); bad_srat(); return; @@ -118,15 +118,3 @@ void __init acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) node_set(node, numa_nodes_parsed); } -int __init arm64_acpi_numa_init(void) -{ - int ret; - - ret = acpi_numa_init(); - if (ret) { - pr_info("Failed to initialise from firmware\n"); - return ret; - } - - return srat_disabled() ? -EINVAL : 0; -} diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c index e7c941d8340d..e1be29e608b7 100644 --- a/arch/arm64/kernel/acpi_parking_protocol.c +++ b/arch/arm64/kernel/acpi_parking_protocol.c @@ -99,10 +99,11 @@ static int acpi_parking_protocol_cpu_boot(unsigned int cpu) * that read this address need to convert this address to the * Boot-Loader's endianness before jumping. */ - writeq_relaxed(__pa_symbol(secondary_entry), &mailbox->entry_point); + writeq_relaxed(__pa_symbol(secondary_entry), + &mailbox->entry_point); writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id); - arch_send_wakeup_ipi_mask(cpumask_of(cpu)); + arch_send_wakeup_ipi(cpu); return 0; } diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index d1757ef1b1e7..8ff6610af496 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -10,18 +10,25 @@ #include <linux/init.h> #include <linux/cpu.h> +#include <linux/elf.h> #include <asm/cacheflush.h> #include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/insn.h> +#include <asm/module.h> #include <asm/sections.h> +#include <asm/vdso.h> #include <linux/stop_machine.h> -#define __ALT_PTR(a,f) ((void *)&(a)->f + (a)->f) +#define __ALT_PTR(a, f) ((void *)&(a)->f + (a)->f) #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) #define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) -static int all_alternatives_applied; +#define ALT_CAP(a) ((a)->cpucap & ~ARM64_CB_BIT) +#define ALT_HAS_CB(a) ((a)->cpucap & ARM64_CB_BIT) + +/* Volatile, as we may be patching the guts of READ_ONCE() */ +static volatile int all_alternatives_applied; static DECLARE_BITMAP(applied_alternatives, ARM64_NCAPS); @@ -30,38 +37,26 @@ struct alt_region { struct alt_instr *end; }; -bool alternative_is_applied(u16 cpufeature) +bool alternative_is_applied(u16 cpucap) { - if (WARN_ON(cpufeature >= ARM64_NCAPS)) + if (WARN_ON(cpucap >= ARM64_NCAPS)) return false; - return test_bit(cpufeature, applied_alternatives); + return test_bit(cpucap, applied_alternatives); } /* * Check if the target PC is within an alternative block. */ -static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc) +static __always_inline bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc) { - unsigned long replptr; - - if (kernel_text_address(pc)) - return true; - - replptr = (unsigned long)ALT_REPL_PTR(alt); - if (pc >= replptr && pc <= (replptr + alt->alt_len)) - return false; - - /* - * Branching into *another* alternate sequence is doomed, and - * we're not even trying to fix it up. - */ - BUG(); + unsigned long replptr = (unsigned long)ALT_REPL_PTR(alt); + return !(pc >= replptr && pc <= (replptr + alt->alt_len)); } #define align_down(x, a) ((unsigned long)(x) & ~(((unsigned long)(a)) - 1)) -static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr) +static __always_inline u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr) { u32 insn; @@ -106,7 +101,7 @@ static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnp return insn; } -static void patch_alternative(struct alt_instr *alt, +static noinstr void patch_alternative(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst) { __le32 *replptr; @@ -126,13 +121,13 @@ static void patch_alternative(struct alt_instr *alt, * accidentally call into the cache.S code, which is patched by us at * runtime. */ -static void clean_dcache_range_nopatch(u64 start, u64 end) +static noinstr void clean_dcache_range_nopatch(u64 start, u64 end) { u64 cur, d_size, ctr_el0; - ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); + ctr_el0 = arm64_ftr_reg_ctrel0.sys_val; d_size = 4 << cpuid_feature_extract_unsigned_field(ctr_el0, - CTR_DMINLINE_SHIFT); + CTR_EL0_DminLine_SHIFT); cur = start & ~(d_size - 1); do { /* @@ -144,40 +139,37 @@ static void clean_dcache_range_nopatch(u64 start, u64 end) } while (cur += d_size, cur < end); } -static void __apply_alternatives(void *alt_region, bool is_module, - unsigned long *feature_mask) +static void __apply_alternatives(const struct alt_region *region, + bool is_module, + unsigned long *cpucap_mask) { struct alt_instr *alt; - struct alt_region *region = alt_region; __le32 *origptr, *updptr; alternative_cb_t alt_cb; for (alt = region->begin; alt < region->end; alt++) { int nr_inst; + int cap = ALT_CAP(alt); - if (!test_bit(alt->cpufeature, feature_mask)) + if (!test_bit(cap, cpucap_mask)) continue; - /* Use ARM64_CB_PATCH as an unconditional patch */ - if (alt->cpufeature < ARM64_CB_PATCH && - !cpus_have_cap(alt->cpufeature)) + if (!cpus_have_cap(cap)) continue; - if (alt->cpufeature == ARM64_CB_PATCH) + if (ALT_HAS_CB(alt)) BUG_ON(alt->alt_len != 0); else BUG_ON(alt->alt_len != alt->orig_len); - pr_info_once("patching kernel code\n"); - origptr = ALT_ORIG_PTR(alt); updptr = is_module ? origptr : lm_alias(origptr); nr_inst = alt->orig_len / AARCH64_INSN_SIZE; - if (alt->cpufeature < ARM64_CB_PATCH) - alt_cb = patch_alternative; - else + if (ALT_HAS_CB(alt)) alt_cb = ALT_REPL_PTR(alt); + else + alt_cb = patch_alternative; alt_cb(alt, origptr, updptr, nr_inst); @@ -193,43 +185,67 @@ static void __apply_alternatives(void *alt_region, bool is_module, */ if (!is_module) { dsb(ish); - __flush_icache_all(); + icache_inval_all_pou(); isb(); - /* Ignore ARM64_CB bit from feature mask */ bitmap_or(applied_alternatives, applied_alternatives, - feature_mask, ARM64_NCAPS); + cpucap_mask, ARM64_NCAPS); bitmap_and(applied_alternatives, applied_alternatives, - cpu_hwcaps, ARM64_NCAPS); + system_cpucaps, ARM64_NCAPS); } } +static void __init apply_alternatives_vdso(void) +{ + struct alt_region region; + const struct elf64_hdr *hdr; + const struct elf64_shdr *shdr; + const struct elf64_shdr *alt; + DECLARE_BITMAP(all_capabilities, ARM64_NCAPS); + + bitmap_fill(all_capabilities, ARM64_NCAPS); + + hdr = (struct elf64_hdr *)vdso_start; + shdr = (void *)hdr + hdr->e_shoff; + alt = find_section(hdr, shdr, ".altinstructions"); + if (!alt) + return; + + region = (struct alt_region){ + .begin = (void *)hdr + alt->sh_offset, + .end = (void *)hdr + alt->sh_offset + alt->sh_size, + }; + + __apply_alternatives(®ion, false, &all_capabilities[0]); +} + +static const struct alt_region kernel_alternatives __initconst = { + .begin = (struct alt_instr *)__alt_instructions, + .end = (struct alt_instr *)__alt_instructions_end, +}; + /* * We might be patching the stop_machine state machine, so implement a * really simple polling protocol here. */ -static int __apply_alternatives_multi_stop(void *unused) +static int __init __apply_alternatives_multi_stop(void *unused) { - struct alt_region region = { - .begin = (struct alt_instr *)__alt_instructions, - .end = (struct alt_instr *)__alt_instructions_end, - }; - /* We always have a CPU 0 at this point (__init) */ if (smp_processor_id()) { - while (!READ_ONCE(all_alternatives_applied)) + while (!all_alternatives_applied) cpu_relax(); isb(); } else { - DECLARE_BITMAP(remaining_capabilities, ARM64_NPATCHABLE); + DECLARE_BITMAP(remaining_capabilities, ARM64_NCAPS); - bitmap_complement(remaining_capabilities, boot_capabilities, - ARM64_NPATCHABLE); + bitmap_complement(remaining_capabilities, boot_cpucaps, + ARM64_NCAPS); BUG_ON(all_alternatives_applied); - __apply_alternatives(®ion, false, remaining_capabilities); + __apply_alternatives(&kernel_alternatives, false, + remaining_capabilities); /* Barriers provided by the cache flushing */ - WRITE_ONCE(all_alternatives_applied, 1); + all_alternatives_applied = 1; } return 0; @@ -237,6 +253,9 @@ static int __apply_alternatives_multi_stop(void *unused) void __init apply_alternatives_all(void) { + pr_info("applying system-wide alternatives\n"); + + apply_alternatives_vdso(); /* better not try code patching on a live SMP system */ stop_machine(__apply_alternatives_multi_stop, NULL, cpu_online_mask); } @@ -248,15 +267,13 @@ void __init apply_alternatives_all(void) */ void __init apply_boot_alternatives(void) { - struct alt_region region = { - .begin = (struct alt_instr *)__alt_instructions, - .end = (struct alt_instr *)__alt_instructions_end, - }; - /* If called on non-boot cpu things could go wrong */ WARN_ON(smp_processor_id() != 0); - __apply_alternatives(®ion, false, &boot_capabilities[0]); + pr_info("applying boot alternatives\n"); + + __apply_alternatives(&kernel_alternatives, false, + &boot_cpucaps[0]); } #ifdef CONFIG_MODULES @@ -266,10 +283,18 @@ void apply_alternatives_module(void *start, size_t length) .begin = start, .end = start + length, }; - DECLARE_BITMAP(all_capabilities, ARM64_NPATCHABLE); + DECLARE_BITMAP(all_capabilities, ARM64_NCAPS); - bitmap_fill(all_capabilities, ARM64_NPATCHABLE); + bitmap_fill(all_capabilities, ARM64_NCAPS); __apply_alternatives(®ion, true, &all_capabilities[0]); } #endif + +noinstr void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + for (int i = 0; i < nr_inst; i++) + updptr[i] = cpu_to_le32(aarch64_insn_gen_nop()); +} +EXPORT_SYMBOL(alt_cb_patch_nops); diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index ca158be21f83..dd6ce86d4332 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -17,7 +17,6 @@ #include <asm/sysreg.h> #include <asm/system_misc.h> #include <asm/traps.h> -#include <asm/kprobes.h> #define CREATE_TRACE_POINTS #include "trace-events-emulation.h" @@ -39,225 +38,44 @@ enum insn_emulation_mode { enum legacy_insn_status { INSN_DEPRECATED, INSN_OBSOLETE, -}; - -struct insn_emulation_ops { - const char *name; - enum legacy_insn_status status; - struct undef_hook *hooks; - int (*set_hw_mode)(bool enable); + INSN_UNAVAILABLE, }; struct insn_emulation { - struct list_head node; - struct insn_emulation_ops *ops; + const char *name; + enum legacy_insn_status status; + bool (*try_emulate)(struct pt_regs *regs, + u32 insn); + int (*set_hw_mode)(bool enable); + int current_mode; int min; int max; -}; - -static LIST_HEAD(insn_emulation); -static int nr_insn_emulated __initdata; -static DEFINE_RAW_SPINLOCK(insn_emulation_lock); - -static void register_emulation_hooks(struct insn_emulation_ops *ops) -{ - struct undef_hook *hook; - - BUG_ON(!ops->hooks); - - for (hook = ops->hooks; hook->instr_mask; hook++) - register_undef_hook(hook); - - pr_notice("Registered %s emulation handler\n", ops->name); -} - -static void remove_emulation_hooks(struct insn_emulation_ops *ops) -{ - struct undef_hook *hook; - - BUG_ON(!ops->hooks); - - for (hook = ops->hooks; hook->instr_mask; hook++) - unregister_undef_hook(hook); - - pr_notice("Removed %s emulation handler\n", ops->name); -} - -static void enable_insn_hw_mode(void *data) -{ - struct insn_emulation *insn = (struct insn_emulation *)data; - if (insn->ops->set_hw_mode) - insn->ops->set_hw_mode(true); -} - -static void disable_insn_hw_mode(void *data) -{ - struct insn_emulation *insn = (struct insn_emulation *)data; - if (insn->ops->set_hw_mode) - insn->ops->set_hw_mode(false); -} - -/* Run set_hw_mode(mode) on all active CPUs */ -static int run_all_cpu_set_hw_mode(struct insn_emulation *insn, bool enable) -{ - if (!insn->ops->set_hw_mode) - return -EINVAL; - if (enable) - on_each_cpu(enable_insn_hw_mode, (void *)insn, true); - else - on_each_cpu(disable_insn_hw_mode, (void *)insn, true); - return 0; -} -/* - * Run set_hw_mode for all insns on a starting CPU. - * Returns: - * 0 - If all the hooks ran successfully. - * -EINVAL - At least one hook is not supported by the CPU. - */ -static int run_all_insn_set_hw_mode(unsigned int cpu) -{ - int rc = 0; - unsigned long flags; - struct insn_emulation *insn; - - raw_spin_lock_irqsave(&insn_emulation_lock, flags); - list_for_each_entry(insn, &insn_emulation, node) { - bool enable = (insn->current_mode == INSN_HW); - if (insn->ops->set_hw_mode && insn->ops->set_hw_mode(enable)) { - pr_warn("CPU[%u] cannot support the emulation of %s", - cpu, insn->ops->name); - rc = -EINVAL; - } - } - raw_spin_unlock_irqrestore(&insn_emulation_lock, flags); - return rc; -} - -static int update_insn_emulation_mode(struct insn_emulation *insn, - enum insn_emulation_mode prev) -{ - int ret = 0; - - switch (prev) { - case INSN_UNDEF: /* Nothing to be done */ - break; - case INSN_EMULATE: - remove_emulation_hooks(insn->ops); - break; - case INSN_HW: - if (!run_all_cpu_set_hw_mode(insn, false)) - pr_notice("Disabled %s support\n", insn->ops->name); - break; - } - - switch (insn->current_mode) { - case INSN_UNDEF: - break; - case INSN_EMULATE: - register_emulation_hooks(insn->ops); - break; - case INSN_HW: - ret = run_all_cpu_set_hw_mode(insn, true); - if (!ret) - pr_notice("Enabled %s support\n", insn->ops->name); - break; - } - - return ret; -} - -static void __init register_insn_emulation(struct insn_emulation_ops *ops) -{ - unsigned long flags; - struct insn_emulation *insn; - - insn = kzalloc(sizeof(*insn), GFP_KERNEL); - if (!insn) - return; - - insn->ops = ops; - insn->min = INSN_UNDEF; - - switch (ops->status) { - case INSN_DEPRECATED: - insn->current_mode = INSN_EMULATE; - /* Disable the HW mode if it was turned on at early boot time */ - run_all_cpu_set_hw_mode(insn, false); - insn->max = INSN_HW; - break; - case INSN_OBSOLETE: - insn->current_mode = INSN_UNDEF; - insn->max = INSN_EMULATE; - break; - } - - raw_spin_lock_irqsave(&insn_emulation_lock, flags); - list_add(&insn->node, &insn_emulation); - nr_insn_emulated++; - raw_spin_unlock_irqrestore(&insn_emulation_lock, flags); - - /* Register any handlers if required */ - update_insn_emulation_mode(insn, INSN_UNDEF); -} - -static int emulation_proc_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret = 0; - struct insn_emulation *insn = (struct insn_emulation *) table->data; - enum insn_emulation_mode prev_mode = insn->current_mode; - - table->data = &insn->current_mode; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + /* sysctl for this emulation */ + struct ctl_table sysctl; +}; - if (ret || !write || prev_mode == insn->current_mode) - goto ret; +#define ARM_OPCODE_CONDTEST_FAIL 0 +#define ARM_OPCODE_CONDTEST_PASS 1 +#define ARM_OPCODE_CONDTEST_UNCOND 2 - ret = update_insn_emulation_mode(insn, prev_mode); - if (ret) { - /* Mode change failed, revert to previous mode. */ - insn->current_mode = prev_mode; - update_insn_emulation_mode(insn, INSN_UNDEF); - } -ret: - table->data = insn; - return ret; -} +#define ARM_OPCODE_CONDITION_UNCOND 0xf -static void __init register_insn_emulation_sysctl(void) +static unsigned int __maybe_unused aarch32_check_condition(u32 opcode, u32 psr) { - unsigned long flags; - int i = 0; - struct insn_emulation *insn; - struct ctl_table *insns_sysctl, *sysctl; - - insns_sysctl = kcalloc(nr_insn_emulated + 1, sizeof(*sysctl), - GFP_KERNEL); - if (!insns_sysctl) - return; - - raw_spin_lock_irqsave(&insn_emulation_lock, flags); - list_for_each_entry(insn, &insn_emulation, node) { - sysctl = &insns_sysctl[i]; - - sysctl->mode = 0644; - sysctl->maxlen = sizeof(int); + u32 cc_bits = opcode >> 28; - sysctl->procname = insn->ops->name; - sysctl->data = insn; - sysctl->extra1 = &insn->min; - sysctl->extra2 = &insn->max; - sysctl->proc_handler = emulation_proc_handler; - i++; + if (cc_bits != ARM_OPCODE_CONDITION_UNCOND) { + if ((*aarch32_opcode_cond_checks[cc_bits])(psr)) + return ARM_OPCODE_CONDTEST_PASS; + else + return ARM_OPCODE_CONDTEST_FAIL; } - raw_spin_unlock_irqrestore(&insn_emulation_lock, flags); - - register_sysctl("abi", insns_sysctl); + return ARM_OPCODE_CONDTEST_UNCOND; } +#ifdef CONFIG_SWP_EMULATION /* * Implement emulation of the SWP/SWPB instructions using load-exclusive and * store-exclusive. @@ -277,9 +95,9 @@ static void __init register_insn_emulation_sysctl(void) #define __user_swpX_asm(data, addr, res, temp, temp2, B) \ do { \ - uaccess_enable(); \ + uaccess_enable_privileged(); \ __asm__ __volatile__( \ - " mov %w3, %w7\n" \ + " mov %w3, %w6\n" \ "0: ldxr"B" %w2, [%4]\n" \ "1: stxr"B" %w0, %w1, [%4]\n" \ " cbz %w0, 2f\n" \ @@ -290,19 +108,13 @@ do { \ "2:\n" \ " mov %w1, %w2\n" \ "3:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "4: mov %w0, %w6\n" \ - " b 3b\n" \ - " .popsection" \ - _ASM_EXTABLE(0b, 4b) \ - _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE_UACCESS_ERR(0b, 3b, %w0) \ + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) \ : "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2) \ : "r" ((unsigned long)addr), "i" (-EAGAIN), \ - "i" (-EFAULT), \ "i" (__SWP_LL_SC_LOOPS) \ : "memory"); \ - uaccess_disable(); \ + uaccess_disable_privileged(); \ } while (0) #define __user_swp_asm(data, addr, res, temp, temp2) \ @@ -344,25 +156,6 @@ static int emulate_swpX(unsigned int address, unsigned int *data, return res; } -#define ARM_OPCODE_CONDTEST_FAIL 0 -#define ARM_OPCODE_CONDTEST_PASS 1 -#define ARM_OPCODE_CONDTEST_UNCOND 2 - -#define ARM_OPCODE_CONDITION_UNCOND 0xf - -static unsigned int __kprobes aarch32_check_condition(u32 opcode, u32 psr) -{ - u32 cc_bits = opcode >> 28; - - if (cc_bits != ARM_OPCODE_CONDITION_UNCOND) { - if ((*aarch32_opcode_cond_checks[cc_bits])(psr)) - return ARM_OPCODE_CONDTEST_PASS; - else - return ARM_OPCODE_CONDTEST_FAIL; - } - return ARM_OPCODE_CONDTEST_UNCOND; -} - /* * swp_handler logs the id of calling process, dissects the instruction, sanity * checks the memory location, calls emulate_swpX for the actual operation and @@ -435,28 +228,27 @@ fault: return 0; } -/* - * Only emulate SWP/SWPB executed in ARM state/User mode. - * The kernel must be SWP free and SWP{B} does not exist in Thumb. - */ -static struct undef_hook swp_hooks[] = { - { - .instr_mask = 0x0fb00ff0, - .instr_val = 0x01000090, - .pstate_mask = PSR_AA32_MODE_MASK, - .pstate_val = PSR_AA32_MODE_USR, - .fn = swp_handler - }, - { } -}; +static bool try_emulate_swp(struct pt_regs *regs, u32 insn) +{ + /* SWP{B} only exists in ARM state and does not exist in Thumb */ + if (!compat_user_mode(regs) || compat_thumb_mode(regs)) + return false; + + if ((insn & 0x0fb00ff0) != 0x01000090) + return false; -static struct insn_emulation_ops swp_ops = { + return swp_handler(regs, insn) == 0; +} + +static struct insn_emulation insn_swp = { .name = "swp", .status = INSN_OBSOLETE, - .hooks = swp_hooks, + .try_emulate = try_emulate_swp, .set_hw_mode = NULL, }; +#endif /* CONFIG_SWP_EMULATION */ +#ifdef CONFIG_CP15_BARRIER_EMULATION static int cp15barrier_handler(struct pt_regs *regs, u32 instr) { perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->pc); @@ -519,31 +311,29 @@ static int cp15_barrier_set_hw_mode(bool enable) return 0; } -static struct undef_hook cp15_barrier_hooks[] = { - { - .instr_mask = 0x0fff0fdf, - .instr_val = 0x0e070f9a, - .pstate_mask = PSR_AA32_MODE_MASK, - .pstate_val = PSR_AA32_MODE_USR, - .fn = cp15barrier_handler, - }, - { - .instr_mask = 0x0fff0fff, - .instr_val = 0x0e070f95, - .pstate_mask = PSR_AA32_MODE_MASK, - .pstate_val = PSR_AA32_MODE_USR, - .fn = cp15barrier_handler, - }, - { } -}; +static bool try_emulate_cp15_barrier(struct pt_regs *regs, u32 insn) +{ + if (!compat_user_mode(regs) || compat_thumb_mode(regs)) + return false; + + if ((insn & 0x0fff0fdf) == 0x0e070f9a) + return cp15barrier_handler(regs, insn) == 0; -static struct insn_emulation_ops cp15_barrier_ops = { + if ((insn & 0x0fff0fff) == 0x0e070f95) + return cp15barrier_handler(regs, insn) == 0; + + return false; +} + +static struct insn_emulation insn_cp15_barrier = { .name = "cp15_barrier", .status = INSN_DEPRECATED, - .hooks = cp15_barrier_hooks, + .try_emulate = try_emulate_cp15_barrier, .set_hw_mode = cp15_barrier_set_hw_mode, }; +#endif /* CONFIG_CP15_BARRIER_EMULATION */ +#ifdef CONFIG_SETEND_EMULATION static int setend_set_hw_mode(bool enable) { if (!cpu_supports_mixed_endian_el0()) @@ -591,55 +381,244 @@ static int t16_setend_handler(struct pt_regs *regs, u32 instr) return rc; } -static struct undef_hook setend_hooks[] = { - { - .instr_mask = 0xfffffdff, - .instr_val = 0xf1010000, - .pstate_mask = PSR_AA32_MODE_MASK, - .pstate_val = PSR_AA32_MODE_USR, - .fn = a32_setend_handler, - }, - { - /* Thumb mode */ - .instr_mask = 0x0000fff7, - .instr_val = 0x0000b650, - .pstate_mask = (PSR_AA32_T_BIT | PSR_AA32_MODE_MASK), - .pstate_val = (PSR_AA32_T_BIT | PSR_AA32_MODE_USR), - .fn = t16_setend_handler, - }, - {} -}; +static bool try_emulate_setend(struct pt_regs *regs, u32 insn) +{ + if (compat_thumb_mode(regs) && + (insn & 0xfffffff7) == 0x0000b650) + return t16_setend_handler(regs, insn) == 0; -static struct insn_emulation_ops setend_ops = { + if (compat_user_mode(regs) && + (insn & 0xfffffdff) == 0xf1010000) + return a32_setend_handler(regs, insn) == 0; + + return false; +} + +static struct insn_emulation insn_setend = { .name = "setend", .status = INSN_DEPRECATED, - .hooks = setend_hooks, + .try_emulate = try_emulate_setend, .set_hw_mode = setend_set_hw_mode, }; +#endif /* CONFIG_SETEND_EMULATION */ + +static struct insn_emulation *insn_emulations[] = { +#ifdef CONFIG_SWP_EMULATION + &insn_swp, +#endif +#ifdef CONFIG_CP15_BARRIER_EMULATION + &insn_cp15_barrier, +#endif +#ifdef CONFIG_SETEND_EMULATION + &insn_setend, +#endif +}; + +static DEFINE_MUTEX(insn_emulation_mutex); + +static void enable_insn_hw_mode(void *data) +{ + struct insn_emulation *insn = data; + if (insn->set_hw_mode) + insn->set_hw_mode(true); +} + +static void disable_insn_hw_mode(void *data) +{ + struct insn_emulation *insn = data; + if (insn->set_hw_mode) + insn->set_hw_mode(false); +} + +/* Run set_hw_mode(mode) on all active CPUs */ +static int run_all_cpu_set_hw_mode(struct insn_emulation *insn, bool enable) +{ + if (!insn->set_hw_mode) + return -EINVAL; + if (enable) + on_each_cpu(enable_insn_hw_mode, (void *)insn, true); + else + on_each_cpu(disable_insn_hw_mode, (void *)insn, true); + return 0; +} /* - * Invoked as late_initcall, since not needed before init spawned. + * Run set_hw_mode for all insns on a starting CPU. + * Returns: + * 0 - If all the hooks ran successfully. + * -EINVAL - At least one hook is not supported by the CPU. + */ +static int run_all_insn_set_hw_mode(unsigned int cpu) +{ + int rc = 0; + unsigned long flags; + + /* + * Disable IRQs to serialize against an IPI from + * run_all_cpu_set_hw_mode(), ensuring the HW is programmed to the most + * recent enablement state if the two race with one another. + */ + local_irq_save(flags); + for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { + struct insn_emulation *insn = insn_emulations[i]; + bool enable = READ_ONCE(insn->current_mode) == INSN_HW; + if (insn->set_hw_mode && insn->set_hw_mode(enable)) { + pr_warn("CPU[%u] cannot support the emulation of %s", + cpu, insn->name); + rc = -EINVAL; + } + } + local_irq_restore(flags); + + return rc; +} + +static int update_insn_emulation_mode(struct insn_emulation *insn, + enum insn_emulation_mode prev) +{ + int ret = 0; + + switch (prev) { + case INSN_UNDEF: /* Nothing to be done */ + break; + case INSN_EMULATE: + break; + case INSN_HW: + if (!run_all_cpu_set_hw_mode(insn, false)) + pr_notice("Disabled %s support\n", insn->name); + break; + } + + switch (insn->current_mode) { + case INSN_UNDEF: + break; + case INSN_EMULATE: + break; + case INSN_HW: + ret = run_all_cpu_set_hw_mode(insn, true); + if (!ret) + pr_notice("Enabled %s support\n", insn->name); + break; + } + + return ret; +} + +static int emulation_proc_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = 0; + struct insn_emulation *insn = container_of(table->data, struct insn_emulation, current_mode); + enum insn_emulation_mode prev_mode = insn->current_mode; + + mutex_lock(&insn_emulation_mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write || prev_mode == insn->current_mode) + goto ret; + + ret = update_insn_emulation_mode(insn, prev_mode); + if (ret) { + /* Mode change failed, revert to previous mode. */ + WRITE_ONCE(insn->current_mode, prev_mode); + update_insn_emulation_mode(insn, INSN_UNDEF); + } +ret: + mutex_unlock(&insn_emulation_mutex); + return ret; +} + +static void __init register_insn_emulation(struct insn_emulation *insn) +{ + struct ctl_table *sysctl; + + insn->min = INSN_UNDEF; + + switch (insn->status) { + case INSN_DEPRECATED: + insn->current_mode = INSN_EMULATE; + /* Disable the HW mode if it was turned on at early boot time */ + run_all_cpu_set_hw_mode(insn, false); + insn->max = INSN_HW; + break; + case INSN_OBSOLETE: + insn->current_mode = INSN_UNDEF; + insn->max = INSN_EMULATE; + break; + case INSN_UNAVAILABLE: + insn->current_mode = INSN_UNDEF; + insn->max = INSN_UNDEF; + break; + } + + /* Program the HW if required */ + update_insn_emulation_mode(insn, INSN_UNDEF); + + if (insn->status != INSN_UNAVAILABLE) { + sysctl = &insn->sysctl; + + sysctl->mode = 0644; + sysctl->maxlen = sizeof(int); + + sysctl->procname = insn->name; + sysctl->data = &insn->current_mode; + sysctl->extra1 = &insn->min; + sysctl->extra2 = &insn->max; + sysctl->proc_handler = emulation_proc_handler; + + register_sysctl_sz("abi", sysctl, 1); + } +} + +bool try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn) +{ + for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { + struct insn_emulation *ie = insn_emulations[i]; + + if (ie->status == INSN_UNAVAILABLE) + continue; + + /* + * A trap may race with the mode being changed + * INSN_EMULATE<->INSN_HW. Try to emulate the instruction to + * avoid a spurious UNDEF. + */ + if (READ_ONCE(ie->current_mode) == INSN_UNDEF) + continue; + + if (ie->try_emulate(regs, insn)) + return true; + } + + return false; +} + +/* + * Invoked as core_initcall, which guarantees that the instruction + * emulation is ready for userspace. */ static int __init armv8_deprecated_init(void) { - if (IS_ENABLED(CONFIG_SWP_EMULATION)) - register_insn_emulation(&swp_ops); +#ifdef CONFIG_SETEND_EMULATION + if (!system_supports_mixed_endian_el0()) { + insn_setend.status = INSN_UNAVAILABLE; + pr_info("setend instruction emulation is not supported on this system\n"); + } - if (IS_ENABLED(CONFIG_CP15_BARRIER_EMULATION)) - register_insn_emulation(&cp15_barrier_ops); +#endif + for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { + struct insn_emulation *ie = insn_emulations[i]; - if (IS_ENABLED(CONFIG_SETEND_EMULATION)) { - if(system_supports_mixed_endian_el0()) - register_insn_emulation(&setend_ops); - else - pr_info("setend instruction emulation is not supported on this system\n"); + if (ie->status == INSN_UNAVAILABLE) + continue; + + register_insn_emulation(ie); } cpuhp_setup_state_nocalls(CPUHP_AP_ARM64_ISNDEP_STARTING, "arm64/isndep:starting", run_all_insn_set_hw_mode, NULL); - register_insn_emulation_sysctl(); - return 0; } diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index a5bdce8af65b..5a7dbbe0ce63 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -9,6 +9,8 @@ #include <linux/arm_sdei.h> #include <linux/sched.h> +#include <linux/ftrace.h> +#include <linux/kexec.h> #include <linux/mm.h> #include <linux/dma-mapping.h> #include <linux/kvm_host.h> @@ -28,18 +30,32 @@ int main(void) { DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); BLANK(); + DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); - DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif +#ifdef CONFIG_SHADOW_CALL_STACK + DEFINE(TSK_TI_SCS_BASE, offsetof(struct task_struct, thread_info.scs_base)); + DEFINE(TSK_TI_SCS_SP, offsetof(struct task_struct, thread_info.scs_sp)); +#endif DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); #ifdef CONFIG_STACKPROTECTOR DEFINE(TSK_STACK_CANARY, offsetof(struct task_struct, stack_canary)); #endif BLANK(); DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context)); + DEFINE(THREAD_SCTLR_USER, offsetof(struct task_struct, thread.sctlr_user)); +#ifdef CONFIG_ARM64_PTR_AUTH + DEFINE(THREAD_KEYS_USER, offsetof(struct task_struct, thread.keys_user)); +#endif +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL + DEFINE(THREAD_KEYS_KERNEL, offsetof(struct task_struct, thread.keys_kernel)); +#endif +#ifdef CONFIG_ARM64_MTE + DEFINE(THREAD_MTE_CTRL, offsetof(struct task_struct, thread.mte_ctrl)); +#endif BLANK(); DEFINE(S_X0, offsetof(struct pt_regs, regs[0])); DEFINE(S_X2, offsetof(struct pt_regs, regs[2])); @@ -62,11 +78,27 @@ int main(void) DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_PC, offsetof(struct pt_regs, pc)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); - DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); + DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); - DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); + DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS + DEFINE(FREGS_X0, offsetof(struct ftrace_regs, regs[0])); + DEFINE(FREGS_X2, offsetof(struct ftrace_regs, regs[2])); + DEFINE(FREGS_X4, offsetof(struct ftrace_regs, regs[4])); + DEFINE(FREGS_X6, offsetof(struct ftrace_regs, regs[6])); + DEFINE(FREGS_X8, offsetof(struct ftrace_regs, regs[8])); + DEFINE(FREGS_FP, offsetof(struct ftrace_regs, fp)); + DEFINE(FREGS_LR, offsetof(struct ftrace_regs, lr)); + DEFINE(FREGS_SP, offsetof(struct ftrace_regs, sp)); + DEFINE(FREGS_PC, offsetof(struct ftrace_regs, pc)); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + DEFINE(FREGS_DIRECT_TRAMP, offsetof(struct ftrace_regs, direct_tramp)); +#endif + DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); + BLANK(); +#endif #ifdef CONFIG_COMPAT DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0)); @@ -86,23 +118,33 @@ int main(void) BLANK(); DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); BLANK(); - DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); -#ifdef CONFIG_KVM_ARM_HOST + DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val)); + DEFINE(FTR_OVR_MASK_OFFSET, offsetof(struct arm64_ftr_override, mask)); + BLANK(); +#ifdef CONFIG_KVM DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); - DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags)); DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2)); - DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); + DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_cpu_context, regs)); + DEFINE(CPU_RGSR_EL1, offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1])); + DEFINE(CPU_GCR_EL1, offsetof(struct kvm_cpu_context, sys_regs[GCR_EL1])); DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1])); DEFINE(CPU_APIBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIBKEYLO_EL1])); DEFINE(CPU_APDAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDAKEYLO_EL1])); DEFINE(CPU_APDBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDBKEYLO_EL1])); DEFINE(CPU_APGAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1])); - DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_regs, regs)); DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu)); DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt)); + DEFINE(NVHE_INIT_MAIR_EL2, offsetof(struct kvm_nvhe_init_params, mair_el2)); + DEFINE(NVHE_INIT_TCR_EL2, offsetof(struct kvm_nvhe_init_params, tcr_el2)); + DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2)); + DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, stack_hyp_va)); + DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa)); + DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2)); + DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr)); + DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr)); #endif #ifdef CONFIG_CPU_PM DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); @@ -115,6 +157,15 @@ int main(void) DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); + DEFINE(ARM_SMCCC_1_2_REGS_X0_OFFS, offsetof(struct arm_smccc_1_2_regs, a0)); + DEFINE(ARM_SMCCC_1_2_REGS_X2_OFFS, offsetof(struct arm_smccc_1_2_regs, a2)); + DEFINE(ARM_SMCCC_1_2_REGS_X4_OFFS, offsetof(struct arm_smccc_1_2_regs, a4)); + DEFINE(ARM_SMCCC_1_2_REGS_X6_OFFS, offsetof(struct arm_smccc_1_2_regs, a6)); + DEFINE(ARM_SMCCC_1_2_REGS_X8_OFFS, offsetof(struct arm_smccc_1_2_regs, a8)); + DEFINE(ARM_SMCCC_1_2_REGS_X10_OFFS, offsetof(struct arm_smccc_1_2_regs, a10)); + DEFINE(ARM_SMCCC_1_2_REGS_X12_OFFS, offsetof(struct arm_smccc_1_2_regs, a12)); + DEFINE(ARM_SMCCC_1_2_REGS_X14_OFFS, offsetof(struct arm_smccc_1_2_regs, a14)); + DEFINE(ARM_SMCCC_1_2_REGS_X16_OFFS, offsetof(struct arm_smccc_1_2_regs, a16)); BLANK(); DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); @@ -128,5 +179,41 @@ int main(void) DEFINE(SDEI_EVENT_INTREGS, offsetof(struct sdei_registered_event, interrupted_regs)); DEFINE(SDEI_EVENT_PRIORITY, offsetof(struct sdei_registered_event, priority)); #endif +#ifdef CONFIG_ARM64_PTR_AUTH + DEFINE(PTRAUTH_USER_KEY_APIA, offsetof(struct ptrauth_keys_user, apia)); +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL + DEFINE(PTRAUTH_KERNEL_KEY_APIA, offsetof(struct ptrauth_keys_kernel, apia)); +#endif + BLANK(); +#endif +#ifdef CONFIG_KEXEC_CORE + DEFINE(KIMAGE_ARCH_DTB_MEM, offsetof(struct kimage, arch.dtb_mem)); + DEFINE(KIMAGE_ARCH_EL2_VECTORS, offsetof(struct kimage, arch.el2_vectors)); + DEFINE(KIMAGE_ARCH_ZERO_PAGE, offsetof(struct kimage, arch.zero_page)); + DEFINE(KIMAGE_ARCH_PHYS_OFFSET, offsetof(struct kimage, arch.phys_offset)); + DEFINE(KIMAGE_ARCH_TTBR1, offsetof(struct kimage, arch.ttbr1)); + DEFINE(KIMAGE_HEAD, offsetof(struct kimage, head)); + DEFINE(KIMAGE_START, offsetof(struct kimage, start)); + BLANK(); +#endif +#ifdef CONFIG_FUNCTION_TRACER + DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func)); +#endif + BLANK(); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + DEFINE(FGRET_REGS_X0, offsetof(struct fgraph_ret_regs, regs[0])); + DEFINE(FGRET_REGS_X1, offsetof(struct fgraph_ret_regs, regs[1])); + DEFINE(FGRET_REGS_X2, offsetof(struct fgraph_ret_regs, regs[2])); + DEFINE(FGRET_REGS_X3, offsetof(struct fgraph_ret_regs, regs[3])); + DEFINE(FGRET_REGS_X4, offsetof(struct fgraph_ret_regs, regs[4])); + DEFINE(FGRET_REGS_X5, offsetof(struct fgraph_ret_regs, regs[5])); + DEFINE(FGRET_REGS_X6, offsetof(struct fgraph_ret_regs, regs[6])); + DEFINE(FGRET_REGS_X7, offsetof(struct fgraph_ret_regs, regs[7])); + DEFINE(FGRET_REGS_FP, offsetof(struct fgraph_ret_regs, fp)); + DEFINE(FGRET_REGS_SIZE, sizeof(struct fgraph_ret_regs)); +#endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); +#endif return 0; } diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index 7fa6828bb488..d9c9218fa1fd 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -11,11 +11,6 @@ #include <linux/of.h> #define MAX_CACHE_LEVEL 7 /* Max 7 level supported */ -/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ -#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) -#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) -#define CLIDR_CTYPE(clidr, level) \ - (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) int cache_line_size(void) { @@ -43,10 +38,9 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->type = type; } -static int __init_cache_level(unsigned int cpu) +static void detect_cache_level(unsigned int *level_p, unsigned int *leaves_p) { - unsigned int ctype, level, leaves, fw_level; - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + unsigned int ctype, level, leaves; for (level = 1, leaves = 0; level <= MAX_CACHE_LEVEL; level++) { ctype = get_cache_type(level); @@ -58,10 +52,34 @@ static int __init_cache_level(unsigned int cpu) leaves += (ctype == CACHE_TYPE_SEPARATE) ? 2 : 1; } - if (acpi_disabled) + *level_p = level; + *leaves_p = leaves; +} + +int early_cache_level(unsigned int cpu) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + + detect_cache_level(&this_cpu_ci->num_levels, &this_cpu_ci->num_leaves); + + return 0; +} + +int init_cache_level(unsigned int cpu) +{ + unsigned int level, leaves; + int fw_level, ret; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + + detect_cache_level(&level, &leaves); + + if (acpi_disabled) { fw_level = of_find_last_cache_level(cpu); - else - fw_level = acpi_find_last_cache_level(cpu); + } else { + ret = acpi_get_cache_info(cpu, &fw_level, NULL); + if (ret < 0) + fw_level = 0; + } if (level < fw_level) { /* @@ -78,7 +96,7 @@ static int __init_cache_level(unsigned int cpu) return 0; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { unsigned int level, idx; enum cache_type type; @@ -97,6 +115,3 @@ static int __populate_cache_leaves(unsigned int cpu) } return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c new file mode 100644 index 000000000000..deff21bfa680 --- /dev/null +++ b/arch/arm64/kernel/compat_alignment.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0-only +// based on arch/arm/mm/alignment.c + +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/perf_event.h> +#include <linux/uaccess.h> + +#include <asm/exception.h> +#include <asm/ptrace.h> +#include <asm/traps.h> + +/* + * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998 + * + * Speed optimisations and better fault handling by Russell King. + */ +#define CODING_BITS(i) (i & 0x0e000000) + +#define LDST_P_BIT(i) (i & (1 << 24)) /* Preindex */ +#define LDST_U_BIT(i) (i & (1 << 23)) /* Add offset */ +#define LDST_W_BIT(i) (i & (1 << 21)) /* Writeback */ +#define LDST_L_BIT(i) (i & (1 << 20)) /* Load */ + +#define LDST_P_EQ_U(i) ((((i) ^ ((i) >> 1)) & (1 << 23)) == 0) + +#define LDSTHD_I_BIT(i) (i & (1 << 22)) /* double/half-word immed */ + +#define RN_BITS(i) ((i >> 16) & 15) /* Rn */ +#define RD_BITS(i) ((i >> 12) & 15) /* Rd */ +#define RM_BITS(i) (i & 15) /* Rm */ + +#define REGMASK_BITS(i) (i & 0xffff) + +#define BAD_INSTR 0xdeadc0de + +/* Thumb-2 32 bit format per ARMv7 DDI0406A A6.3, either f800h,e800h,f800h */ +#define IS_T32(hi16) \ + (((hi16) & 0xe000) == 0xe000 && ((hi16) & 0x1800)) + +union offset_union { + unsigned long un; + signed long sn; +}; + +#define TYPE_ERROR 0 +#define TYPE_FAULT 1 +#define TYPE_LDST 2 +#define TYPE_DONE 3 + +static void +do_alignment_finish_ldst(unsigned long addr, u32 instr, struct pt_regs *regs, + union offset_union offset) +{ + if (!LDST_U_BIT(instr)) + offset.un = -offset.un; + + if (!LDST_P_BIT(instr)) + addr += offset.un; + + if (!LDST_P_BIT(instr) || LDST_W_BIT(instr)) + regs->regs[RN_BITS(instr)] = addr; +} + +static int +do_alignment_ldrdstrd(unsigned long addr, u32 instr, struct pt_regs *regs) +{ + unsigned int rd = RD_BITS(instr); + unsigned int rd2; + int load; + + if ((instr & 0xfe000000) == 0xe8000000) { + /* ARMv7 Thumb-2 32-bit LDRD/STRD */ + rd2 = (instr >> 8) & 0xf; + load = !!(LDST_L_BIT(instr)); + } else if (((rd & 1) == 1) || (rd == 14)) { + return TYPE_ERROR; + } else { + load = ((instr & 0xf0) == 0xd0); + rd2 = rd + 1; + } + + if (load) { + unsigned int val, val2; + + if (get_user(val, (u32 __user *)addr) || + get_user(val2, (u32 __user *)(addr + 4))) + return TYPE_FAULT; + regs->regs[rd] = val; + regs->regs[rd2] = val2; + } else { + if (put_user(regs->regs[rd], (u32 __user *)addr) || + put_user(regs->regs[rd2], (u32 __user *)(addr + 4))) + return TYPE_FAULT; + } + return TYPE_LDST; +} + +/* + * LDM/STM alignment handler. + * + * There are 4 variants of this instruction: + * + * B = rn pointer before instruction, A = rn pointer after instruction + * ------ increasing address -----> + * | | r0 | r1 | ... | rx | | + * PU = 01 B A + * PU = 11 B A + * PU = 00 A B + * PU = 10 A B + */ +static int +do_alignment_ldmstm(unsigned long addr, u32 instr, struct pt_regs *regs) +{ + unsigned int rd, rn, nr_regs, regbits; + unsigned long eaddr, newaddr; + unsigned int val; + + /* count the number of registers in the mask to be transferred */ + nr_regs = hweight16(REGMASK_BITS(instr)) * 4; + + rn = RN_BITS(instr); + newaddr = eaddr = regs->regs[rn]; + + if (!LDST_U_BIT(instr)) + nr_regs = -nr_regs; + newaddr += nr_regs; + if (!LDST_U_BIT(instr)) + eaddr = newaddr; + + if (LDST_P_EQ_U(instr)) /* U = P */ + eaddr += 4; + + for (regbits = REGMASK_BITS(instr), rd = 0; regbits; + regbits >>= 1, rd += 1) + if (regbits & 1) { + if (LDST_L_BIT(instr)) { + if (get_user(val, (u32 __user *)eaddr)) + return TYPE_FAULT; + if (rd < 15) + regs->regs[rd] = val; + else + regs->pc = val; + } else { + /* + * The PC register has a bias of +8 in ARM mode + * and +4 in Thumb mode. This means that a read + * of the value of PC should account for this. + * Since Thumb does not permit STM instructions + * to refer to PC, just add 8 here. + */ + val = (rd < 15) ? regs->regs[rd] : regs->pc + 8; + if (put_user(val, (u32 __user *)eaddr)) + return TYPE_FAULT; + } + eaddr += 4; + } + + if (LDST_W_BIT(instr)) + regs->regs[rn] = newaddr; + + return TYPE_DONE; +} + +/* + * Convert Thumb multi-word load/store instruction forms to equivalent ARM + * instructions so we can reuse ARM userland alignment fault fixups for Thumb. + * + * This implementation was initially based on the algorithm found in + * gdb/sim/arm/thumbemu.c. It is basically just a code reduction of same + * to convert only Thumb ld/st instruction forms to equivalent ARM forms. + * + * NOTES: + * 1. Comments below refer to ARM ARM DDI0100E Thumb Instruction sections. + * 2. If for some reason we're passed an non-ld/st Thumb instruction to + * decode, we return 0xdeadc0de. This should never happen under normal + * circumstances but if it does, we've got other problems to deal with + * elsewhere and we obviously can't fix those problems here. + */ + +static unsigned long thumb2arm(u16 tinstr) +{ + u32 L = (tinstr & (1<<11)) >> 11; + + switch ((tinstr & 0xf800) >> 11) { + /* 6.6.1 Format 1: */ + case 0xc000 >> 11: /* 7.1.51 STMIA */ + case 0xc800 >> 11: /* 7.1.25 LDMIA */ + { + u32 Rn = (tinstr & (7<<8)) >> 8; + u32 W = ((L<<Rn) & (tinstr&255)) ? 0 : 1<<21; + + return 0xe8800000 | W | (L<<20) | (Rn<<16) | + (tinstr&255); + } + + /* 6.6.1 Format 2: */ + case 0xb000 >> 11: /* 7.1.48 PUSH */ + case 0xb800 >> 11: /* 7.1.47 POP */ + if ((tinstr & (3 << 9)) == 0x0400) { + static const u32 subset[4] = { + 0xe92d0000, /* STMDB sp!,{registers} */ + 0xe92d4000, /* STMDB sp!,{registers,lr} */ + 0xe8bd0000, /* LDMIA sp!,{registers} */ + 0xe8bd8000 /* LDMIA sp!,{registers,pc} */ + }; + return subset[(L<<1) | ((tinstr & (1<<8)) >> 8)] | + (tinstr & 255); /* register_list */ + } + fallthrough; /* for illegal instruction case */ + + default: + return BAD_INSTR; + } +} + +/* + * Convert Thumb-2 32 bit LDM, STM, LDRD, STRD to equivalent instruction + * handlable by ARM alignment handler, also find the corresponding handler, + * so that we can reuse ARM userland alignment fault fixups for Thumb. + * + * @pinstr: original Thumb-2 instruction; returns new handlable instruction + * @regs: register context. + * @poffset: return offset from faulted addr for later writeback + * + * NOTES: + * 1. Comments below refer to ARMv7 DDI0406A Thumb Instruction sections. + * 2. Register name Rt from ARMv7 is same as Rd from ARMv6 (Rd is Rt) + */ +static void * +do_alignment_t32_to_handler(u32 *pinstr, struct pt_regs *regs, + union offset_union *poffset) +{ + u32 instr = *pinstr; + u16 tinst1 = (instr >> 16) & 0xffff; + u16 tinst2 = instr & 0xffff; + + switch (tinst1 & 0xffe0) { + /* A6.3.5 Load/Store multiple */ + case 0xe880: /* STM/STMIA/STMEA,LDM/LDMIA, PUSH/POP T2 */ + case 0xe8a0: /* ...above writeback version */ + case 0xe900: /* STMDB/STMFD, LDMDB/LDMEA */ + case 0xe920: /* ...above writeback version */ + /* no need offset decision since handler calculates it */ + return do_alignment_ldmstm; + + case 0xf840: /* POP/PUSH T3 (single register) */ + if (RN_BITS(instr) == 13 && (tinst2 & 0x09ff) == 0x0904) { + u32 L = !!(LDST_L_BIT(instr)); + const u32 subset[2] = { + 0xe92d0000, /* STMDB sp!,{registers} */ + 0xe8bd0000, /* LDMIA sp!,{registers} */ + }; + *pinstr = subset[L] | (1<<RD_BITS(instr)); + return do_alignment_ldmstm; + } + /* Else fall through for illegal instruction case */ + break; + + /* A6.3.6 Load/store double, STRD/LDRD(immed, lit, reg) */ + case 0xe860: + case 0xe960: + case 0xe8e0: + case 0xe9e0: + poffset->un = (tinst2 & 0xff) << 2; + fallthrough; + + case 0xe940: + case 0xe9c0: + return do_alignment_ldrdstrd; + + /* + * No need to handle load/store instructions up to word size + * since ARMv6 and later CPUs can perform unaligned accesses. + */ + default: + break; + } + return NULL; +} + +static int alignment_get_arm(struct pt_regs *regs, __le32 __user *ip, u32 *inst) +{ + __le32 instr = 0; + int fault; + + fault = get_user(instr, ip); + if (fault) + return fault; + + *inst = __le32_to_cpu(instr); + return 0; +} + +static int alignment_get_thumb(struct pt_regs *regs, __le16 __user *ip, u16 *inst) +{ + __le16 instr = 0; + int fault; + + fault = get_user(instr, ip); + if (fault) + return fault; + + *inst = __le16_to_cpu(instr); + return 0; +} + +int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) +{ + union offset_union offset; + unsigned long instrptr; + int (*handler)(unsigned long addr, u32 instr, struct pt_regs *regs); + unsigned int type; + u32 instr = 0; + int isize = 4; + int thumb2_32b = 0; + + instrptr = instruction_pointer(regs); + + if (compat_thumb_mode(regs)) { + __le16 __user *ptr = (__le16 __user *)(instrptr & ~1); + u16 tinstr, tinst2; + + if (alignment_get_thumb(regs, ptr, &tinstr)) + return 1; + + if (IS_T32(tinstr)) { /* Thumb-2 32-bit */ + if (alignment_get_thumb(regs, ptr + 1, &tinst2)) + return 1; + instr = ((u32)tinstr << 16) | tinst2; + thumb2_32b = 1; + } else { + isize = 2; + instr = thumb2arm(tinstr); + } + } else { + if (alignment_get_arm(regs, (__le32 __user *)instrptr, &instr)) + return 1; + } + + switch (CODING_BITS(instr)) { + case 0x00000000: /* 3.13.4 load/store instruction extensions */ + if (LDSTHD_I_BIT(instr)) + offset.un = (instr & 0xf00) >> 4 | (instr & 15); + else + offset.un = regs->regs[RM_BITS(instr)]; + + if ((instr & 0x001000f0) == 0x000000d0 || /* LDRD */ + (instr & 0x001000f0) == 0x000000f0) /* STRD */ + handler = do_alignment_ldrdstrd; + else + return 1; + break; + + case 0x08000000: /* ldm or stm, or thumb-2 32bit instruction */ + if (thumb2_32b) { + offset.un = 0; + handler = do_alignment_t32_to_handler(&instr, regs, &offset); + } else { + offset.un = 0; + handler = do_alignment_ldmstm; + } + break; + + default: + return 1; + } + + type = handler(addr, instr, regs); + + if (type == TYPE_ERROR || type == TYPE_FAULT) + return 1; + + if (type == TYPE_LDST) + do_alignment_finish_ldst(addr, instr, regs, offset); + + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, regs->pc); + arm64_skip_faulting_instruction(regs, isize); + + return 0; +} diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 6ea337d464c4..c87445dde674 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -8,16 +8,16 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/assembler.h> #include <asm/sysreg.h> #include <asm/virt.h> .text -.pushsection .idmap.text, "awx" +.pushsection .idmap.text, "a" /* - * __cpu_soft_restart(el2_switch, entry, arg0, arg1, arg2) - Helper for - * cpu_soft_restart. + * cpu_soft_restart(el2_switch, entry, arg0, arg1, arg2) * * @el2_switch: Flag to indicate a switch to EL2 is needed. * @entry: Location to jump to for soft reset. @@ -29,12 +29,13 @@ * branch to what would be the reset vector. It must be executed with the * flat identity mapping. */ -ENTRY(__cpu_soft_restart) - /* Clear sctlr_el1 flags. */ - mrs x12, sctlr_el1 - ldr x13, =SCTLR_ELx_FLAGS - bic x12, x12, x13 +SYM_TYPED_FUNC_START(cpu_soft_restart) + mov_q x12, INIT_SCTLR_EL1_MMU_OFF pre_disable_mmu_workaround + /* + * either disable EL1&0 translation regime or disable EL2&0 translation + * regime if HCR_EL2.E2H == 1 + */ msr sctlr_el1, x12 isb @@ -42,11 +43,11 @@ ENTRY(__cpu_soft_restart) mov x0, #HVC_SOFT_RESTART hvc #0 // no return -1: mov x18, x1 // entry +1: mov x8, x1 // entry mov x0, x2 // arg0 mov x1, x3 // arg1 mov x2, x4 // arg2 - br x18 -ENDPROC(__cpu_soft_restart) + br x8 +SYM_FUNC_END(cpu_soft_restart) .popsection diff --git a/arch/arm64/kernel/cpu-reset.h b/arch/arm64/kernel/cpu-reset.h deleted file mode 100644 index ed50e9587ad8..000000000000 --- a/arch/arm64/kernel/cpu-reset.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * CPU reset routines - * - * Copyright (C) 2015 Huawei Futurewei Technologies. - */ - -#ifndef _ARM64_CPU_RESET_H -#define _ARM64_CPU_RESET_H - -#include <asm/virt.h> - -void __cpu_soft_restart(unsigned long el2_switch, unsigned long entry, - unsigned long arg0, unsigned long arg1, unsigned long arg2); - -static inline void __noreturn cpu_soft_restart(unsigned long entry, - unsigned long arg0, - unsigned long arg1, - unsigned long arg2) -{ - typeof(__cpu_soft_restart) *restart; - - unsigned long el2_switch = !is_kernel_in_hyp_mode() && - is_hyp_mode_available(); - restart = (void *)__pa_symbol(__cpu_soft_restart); - - cpu_install_idmap(); - restart(el2_switch, entry, arg0, arg1, arg2); - unreachable(); -} - -#endif diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 6a09ca7644ea..967c7c7a4e7d 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -11,6 +11,7 @@ #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/cpufeature.h> +#include <asm/kvm_asm.h> #include <asm/smp_plat.h> static bool __maybe_unused @@ -105,379 +106,12 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap) sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } -atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1); - -#include <asm/mmu_context.h> -#include <asm/cacheflush.h> - -DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); - -#ifdef CONFIG_KVM_INDIRECT_VECTORS -extern char __smccc_workaround_1_smc_start[]; -extern char __smccc_workaround_1_smc_end[]; - -static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - void *dst = lm_alias(__bp_harden_hyp_vecs_start + slot * SZ_2K); - int i; - - for (i = 0; i < SZ_2K; i += 0x80) - memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start); - - __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); -} - -static void install_bp_hardening_cb(bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - static DEFINE_RAW_SPINLOCK(bp_lock); - int cpu, slot = -1; - - /* - * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if - * we're a guest. Skip the hyp-vectors work. - */ - if (!hyp_vecs_start) { - __this_cpu_write(bp_hardening_data.fn, fn); - return; - } - - raw_spin_lock(&bp_lock); - for_each_possible_cpu(cpu) { - if (per_cpu(bp_hardening_data.fn, cpu) == fn) { - slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); - break; - } - } - - if (slot == -1) { - slot = atomic_inc_return(&arm64_el2_vector_last_slot); - BUG_ON(slot >= BP_HARDEN_EL2_SLOTS); - __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); - } - - __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); - __this_cpu_write(bp_hardening_data.fn, fn); - raw_spin_unlock(&bp_lock); -} -#else -#define __smccc_workaround_1_smc_start NULL -#define __smccc_workaround_1_smc_end NULL - -static void install_bp_hardening_cb(bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - __this_cpu_write(bp_hardening_data.fn, fn); -} -#endif /* CONFIG_KVM_INDIRECT_VECTORS */ - -#include <linux/arm-smccc.h> - -static void call_smc_arch_workaround_1(void) -{ - arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); -} - -static void call_hvc_arch_workaround_1(void) -{ - arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); -} - -static void qcom_link_stack_sanitization(void) -{ - u64 tmp; - - asm volatile("mov %0, x30 \n" - ".rept 16 \n" - "bl . + 4 \n" - ".endr \n" - "mov x30, %0 \n" - : "=&r" (tmp)); -} - -static bool __nospectre_v2; -static int __init parse_nospectre_v2(char *str) -{ - __nospectre_v2 = true; - return 0; -} -early_param("nospectre_v2", parse_nospectre_v2); - -/* - * -1: No workaround - * 0: No workaround required - * 1: Workaround installed - */ -static int detect_harden_bp_fw(void) -{ - bp_hardening_cb_t cb; - void *smccc_start, *smccc_end; - struct arm_smccc_res res; - u32 midr = read_cpuid_id(); - - arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, - ARM_SMCCC_ARCH_WORKAROUND_1, &res); - - switch ((int)res.a0) { - case 1: - /* Firmware says we're just fine */ - return 0; - case 0: - break; - default: - return -1; - } - - switch (arm_smccc_1_1_get_conduit()) { - case SMCCC_CONDUIT_HVC: - cb = call_hvc_arch_workaround_1; - /* This is a guest, no need to patch KVM vectors */ - smccc_start = NULL; - smccc_end = NULL; - break; - - case SMCCC_CONDUIT_SMC: - cb = call_smc_arch_workaround_1; - smccc_start = __smccc_workaround_1_smc_start; - smccc_end = __smccc_workaround_1_smc_end; - break; - - default: - return -1; - } - - if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) || - ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)) - cb = qcom_link_stack_sanitization; - - if (IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) - install_bp_hardening_cb(cb, smccc_start, smccc_end); - - return 1; -} - -DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); - -int ssbd_state __read_mostly = ARM64_SSBD_KERNEL; -static bool __ssb_safe = true; - -static const struct ssbd_options { - const char *str; - int state; -} ssbd_options[] = { - { "force-on", ARM64_SSBD_FORCE_ENABLE, }, - { "force-off", ARM64_SSBD_FORCE_DISABLE, }, - { "kernel", ARM64_SSBD_KERNEL, }, -}; - -static int __init ssbd_cfg(char *buf) -{ - int i; - - if (!buf || !buf[0]) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(ssbd_options); i++) { - int len = strlen(ssbd_options[i].str); - - if (strncmp(buf, ssbd_options[i].str, len)) - continue; - - ssbd_state = ssbd_options[i].state; - return 0; - } - - return -EINVAL; -} -early_param("ssbd", ssbd_cfg); - -void __init arm64_update_smccc_conduit(struct alt_instr *alt, - __le32 *origptr, __le32 *updptr, - int nr_inst) -{ - u32 insn; - - BUG_ON(nr_inst != 1); - - switch (arm_smccc_1_1_get_conduit()) { - case SMCCC_CONDUIT_HVC: - insn = aarch64_insn_get_hvc_value(); - break; - case SMCCC_CONDUIT_SMC: - insn = aarch64_insn_get_smc_value(); - break; - default: - return; - } - - *updptr = cpu_to_le32(insn); -} - -void __init arm64_enable_wa2_handling(struct alt_instr *alt, - __le32 *origptr, __le32 *updptr, - int nr_inst) -{ - BUG_ON(nr_inst != 1); - /* - * Only allow mitigation on EL1 entry/exit and guest - * ARCH_WORKAROUND_2 handling if the SSBD state allows it to - * be flipped. - */ - if (arm64_get_ssbd_state() == ARM64_SSBD_KERNEL) - *updptr = cpu_to_le32(aarch64_insn_gen_nop()); -} - -void arm64_set_ssbd_mitigation(bool state) -{ - int conduit; - - if (!IS_ENABLED(CONFIG_ARM64_SSBD)) { - pr_info_once("SSBD disabled by kernel configuration\n"); - return; - } - - if (this_cpu_has_cap(ARM64_SSBS)) { - if (state) - asm volatile(SET_PSTATE_SSBS(0)); - else - asm volatile(SET_PSTATE_SSBS(1)); - return; - } - - conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, state, - NULL); - - WARN_ON_ONCE(conduit == SMCCC_CONDUIT_NONE); -} - -static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, - int scope) -{ - struct arm_smccc_res res; - bool required = true; - s32 val; - bool this_cpu_safe = false; - int conduit; - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - - if (cpu_mitigations_off()) - ssbd_state = ARM64_SSBD_FORCE_DISABLE; - - /* delay setting __ssb_safe until we get a firmware response */ - if (is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list)) - this_cpu_safe = true; - - if (this_cpu_has_cap(ARM64_SSBS)) { - if (!this_cpu_safe) - __ssb_safe = false; - required = false; - goto out_printmsg; - } - - conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, - ARM_SMCCC_ARCH_WORKAROUND_2, &res); - - if (conduit == SMCCC_CONDUIT_NONE) { - ssbd_state = ARM64_SSBD_UNKNOWN; - if (!this_cpu_safe) - __ssb_safe = false; - return false; - } - - val = (s32)res.a0; - - switch (val) { - case SMCCC_RET_NOT_SUPPORTED: - ssbd_state = ARM64_SSBD_UNKNOWN; - if (!this_cpu_safe) - __ssb_safe = false; - return false; - - /* machines with mixed mitigation requirements must not return this */ - case SMCCC_RET_NOT_REQUIRED: - pr_info_once("%s mitigation not required\n", entry->desc); - ssbd_state = ARM64_SSBD_MITIGATED; - return false; - - case SMCCC_RET_SUCCESS: - __ssb_safe = false; - required = true; - break; - - case 1: /* Mitigation not required on this CPU */ - required = false; - break; - - default: - WARN_ON(1); - if (!this_cpu_safe) - __ssb_safe = false; - return false; - } - - switch (ssbd_state) { - case ARM64_SSBD_FORCE_DISABLE: - arm64_set_ssbd_mitigation(false); - required = false; - break; - - case ARM64_SSBD_KERNEL: - if (required) { - __this_cpu_write(arm64_ssbd_callback_required, 1); - arm64_set_ssbd_mitigation(true); - } - break; - - case ARM64_SSBD_FORCE_ENABLE: - arm64_set_ssbd_mitigation(true); - required = true; - break; - - default: - WARN_ON(1); - break; - } - -out_printmsg: - switch (ssbd_state) { - case ARM64_SSBD_FORCE_DISABLE: - pr_info_once("%s disabled from command-line\n", entry->desc); - break; - - case ARM64_SSBD_FORCE_ENABLE: - pr_info_once("%s forced from command-line\n", entry->desc); - break; - } - - return required; -} - -/* known invulnerable cores */ -static const struct midr_range arm64_ssb_cpus[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), - MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), - {}, -}; - #ifdef CONFIG_ARM64_ERRATUM_1463225 -DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); - static bool has_cortex_a76_erratum_1463225(const struct arm64_cpu_capabilities *entry, int scope) { - u32 midr = read_cpuid_id(); - /* Cortex-A76 r0p0 - r3p1 */ - struct midr_range range = MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 1); - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range(midr, &range) && is_kernel_in_hyp_mode(); + return is_affected_midr_range_list(entry, scope) && is_kernel_in_hyp_mode(); } #endif @@ -524,80 +158,6 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \ CAP_MIDR_RANGE_LIST(midr_list) -/* Track overall mitigation state. We are only mitigated if all cores are ok */ -static bool __hardenbp_enab = true; -static bool __spectrev2_safe = true; - -int get_spectre_v2_workaround_state(void) -{ - if (__spectrev2_safe) - return ARM64_BP_HARDEN_NOT_REQUIRED; - - if (!__hardenbp_enab) - return ARM64_BP_HARDEN_UNKNOWN; - - return ARM64_BP_HARDEN_WA_NEEDED; -} - -/* - * List of CPUs that do not need any Spectre-v2 mitigation at all. - */ -static const struct midr_range spectre_v2_safe_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), - MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), - { /* sentinel */ } -}; - -/* - * Track overall bp hardening for all heterogeneous cores in the machine. - * We are only considered "safe" if all booted cores are known safe. - */ -static bool __maybe_unused -check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope) -{ - int need_wa; - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - - /* If the CPU has CSV2 set, we're safe */ - if (cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64PFR0_EL1), - ID_AA64PFR0_CSV2_SHIFT)) - return false; - - /* Alternatively, we have a list of unaffected CPUs */ - if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) - return false; - - /* Fallback to firmware detection */ - need_wa = detect_harden_bp_fw(); - if (!need_wa) - return false; - - __spectrev2_safe = false; - - if (!IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) { - pr_warn_once("spectrev2 mitigation disabled by kernel configuration\n"); - __hardenbp_enab = false; - return false; - } - - /* forced off */ - if (__nospectre_v2 || cpu_mitigations_off()) { - pr_info_once("spectrev2 mitigation disabled by command line option\n"); - __hardenbp_enab = false; - return false; - } - - if (need_wa < 0) { - pr_warn_once("ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware\n"); - __hardenbp_enab = false; - } - - return (need_wa > 0); -} - static const __maybe_unused struct midr_range tx2_family_cpus[] = { MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), @@ -627,23 +187,13 @@ has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, int scope) { u32 midr = read_cpuid_id(); - bool has_dic = read_cpuid_cachetype() & BIT(CTR_DIC_SHIFT); + bool has_dic = read_cpuid_cachetype() & BIT(CTR_EL0_DIC_SHIFT); const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1); WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); return is_midr_in_range(midr, &range) && has_dic; } -#if defined(CONFIG_HARDEN_EL2_VECTORS) || defined(CONFIG_ARM64_ERRATUM_1319367) - -static const struct midr_range ca57_a72[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), - {}, -}; - -#endif - #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009 @@ -659,11 +209,41 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { { ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 0), }, + { + /* Kryo4xx Gold (rcpe to rfpe) => (r0p0 to r3p0) */ + ERRATA_MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xe), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2441007 + { + ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2441009 + { + /* Cortex-A510 r0p0 -> r1p1. Fixed in r1p2 */ + ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1), + }, #endif {}, }; #endif +#ifdef CONFIG_CAVIUM_ERRATUM_23154 +static const struct midr_range cavium_erratum_23154_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_THUNDERX), + MIDR_ALL_VERSIONS(MIDR_THUNDERX_81XX), + MIDR_ALL_VERSIONS(MIDR_THUNDERX_83XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_98XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_96XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XXN), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XXMM), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XXO), + {}, +}; +#endif + #ifdef CONFIG_CAVIUM_ERRATUM_27456 const struct midr_range cavium_erratum_27456_cpus[] = { /* Cavium ThunderX, T88 pass 1.x - 2.1 */ @@ -725,6 +305,8 @@ static const struct midr_range erratum_1418040_list[] = { MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 1), /* Neoverse-N1 r0p0 to r3p1 */ MIDR_RANGE(MIDR_NEOVERSE_N1, 0, 0, 3, 1), + /* Kryo4xx Gold (rcpe to rfpf) => (r0p0 to r3p1) */ + MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xf), {}, }; #endif @@ -735,6 +317,8 @@ static const struct midr_range erratum_845719_list[] = { MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 4), /* Brahma-B53 r0p[0] */ MIDR_REV(MIDR_BRAHMA_B53, 0, 0), + /* Kryo2XX Silver rAp4 */ + MIDR_REV(MIDR_QCOM_KRYO_2XX_SILVER, 0xa, 0x4), {}, }; #endif @@ -756,10 +340,99 @@ static const struct arm64_cpu_capabilities erratum_843419_list[] = { }; #endif +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT +static const struct midr_range erratum_speculative_at_list[] = { +#ifdef CONFIG_ARM64_ERRATUM_1165522 + /* Cortex A76 r0p0 to r2p0 */ + MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), +#endif +#ifdef CONFIG_ARM64_ERRATUM_1319367 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), +#endif +#ifdef CONFIG_ARM64_ERRATUM_1530923 + /* Cortex A55 r0p0 to r2p0 */ + MIDR_RANGE(MIDR_CORTEX_A55, 0, 0, 2, 0), + /* Kryo4xx Silver (rdpe => r1p0) */ + MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe), +#endif + {}, +}; +#endif + +#ifdef CONFIG_ARM64_ERRATUM_1463225 +static const struct midr_range erratum_1463225[] = { + /* Cortex-A76 r0p0 - r3p1 */ + MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 1), + /* Kryo4xx Gold (rcpe to rfpf) => (r0p0 to r3p1) */ + MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xf), + {}, +}; +#endif + +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE +static const struct midr_range trbe_overwrite_fill_mode_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2139208 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2119858 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_RANGE(MIDR_CORTEX_X2, 0, 0, 2, 0), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE */ + +#ifdef CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE +static const struct midr_range tsb_flush_fail_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2067961 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2054223 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE */ + +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE +static struct midr_range trbe_write_out_of_range_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2253138 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2224489 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_RANGE(MIDR_CORTEX_X2, 0, 0, 2, 0), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */ + +#ifdef CONFIG_ARM64_ERRATUM_1742098 +static struct midr_range broken_aarch32_aes[] = { + MIDR_RANGE(MIDR_CORTEX_A57, 0, 1, 0xf, 0xf), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */ + +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD +static const struct midr_range erratum_spec_unpriv_load_list[] = { +#ifdef CONFIG_ARM64_ERRATUM_3117295 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2966298 + /* Cortex-A520 r0p0 to r0p1 */ + MIDR_REV_RANGE(MIDR_CORTEX_A520, 0, 0, 1), +#endif + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { - .desc = "ARM errata 826319, 827319, 824069, 819472", + .desc = "ARM errata 826319, 827319, 824069, or 819472", .capability = ARM64_WORKAROUND_CLEAN_CACHE, ERRATA_MIDR_RANGE_LIST(workaround_clean_cache), .cpu_enable = cpu_enable_cache_maint_trap, @@ -803,10 +476,10 @@ const struct arm64_cpu_capabilities arm64_errata[] = { #endif #ifdef CONFIG_CAVIUM_ERRATUM_23154 { - /* Cavium ThunderX, pass 1.x */ - .desc = "Cavium erratum 23154", + .desc = "Cavium errata 23154 and 38545", .capability = ARM64_WORKAROUND_CAVIUM_23154, - ERRATA_MIDR_REV_RANGE(MIDR_THUNDERX, 0, 0, 1), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + ERRATA_MIDR_RANGE_LIST(cavium_erratum_23154_cpus), }, #endif #ifdef CONFIG_CAVIUM_ERRATUM_27456 @@ -841,7 +514,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = { #endif #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI { - .desc = "Qualcomm erratum 1009, ARM erratum 1286807", + .desc = "Qualcomm erratum 1009, or ARM erratum 1286807, 2441009", .capability = ARM64_WORKAROUND_REPEAT_TLBI, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = cpucap_multi_entry_cap_matches, @@ -857,37 +530,54 @@ const struct arm64_cpu_capabilities arm64_errata[] = { }, #endif { - .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + .desc = "Spectre-v2", + .capability = ARM64_SPECTRE_V2, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, - .matches = check_branch_predictor, + .matches = has_spectre_v2, + .cpu_enable = spectre_v2_enable_mitigation, }, -#ifdef CONFIG_HARDEN_EL2_VECTORS +#ifdef CONFIG_RANDOMIZE_BASE { - .desc = "EL2 vector hardening", - .capability = ARM64_HARDEN_EL2_VECTORS, - ERRATA_MIDR_RANGE_LIST(ca57_a72), + /* Must come after the Spectre-v2 entry */ + .desc = "Spectre-v3a", + .capability = ARM64_SPECTRE_V3A, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_spectre_v3a, + .cpu_enable = spectre_v3a_enable_mitigation, }, #endif { - .desc = "Speculative Store Bypass Disable", - .capability = ARM64_SSBD, + .desc = "Spectre-v4", + .capability = ARM64_SPECTRE_V4, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, - .matches = has_ssbd_mitigation, - .midr_range_list = arm64_ssb_cpus, + .matches = has_spectre_v4, + .cpu_enable = spectre_v4_enable_mitigation, + }, + { + .desc = "Spectre-BHB", + .capability = ARM64_SPECTRE_BHB, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = is_spectre_bhb_affected, + .cpu_enable = spectre_bhb_enable_mitigation, }, #ifdef CONFIG_ARM64_ERRATUM_1418040 { .desc = "ARM erratum 1418040", .capability = ARM64_WORKAROUND_1418040, ERRATA_MIDR_RANGE_LIST(erratum_1418040_list), + /* + * We need to allow affected CPUs to come in late, but + * also need the non-affected CPUs to be able to come + * in at any point in time. Wonderful. + */ + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, }, #endif -#ifdef CONFIG_ARM64_ERRATUM_1165522 +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT { - /* Cortex-A76 r0p0 to r2p0 */ - .desc = "ARM erratum 1165522", - .capability = ARM64_WORKAROUND_1165522, - ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), + .desc = "ARM errata 1165522, 1319367, or 1530923", + .capability = ARM64_WORKAROUND_SPECULATIVE_AT, + ERRATA_MIDR_RANGE_LIST(erratum_speculative_at_list), }, #endif #ifdef CONFIG_ARM64_ERRATUM_1463225 @@ -896,6 +586,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .capability = ARM64_WORKAROUND_1463225, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = has_cortex_a76_erratum_1463225, + .midr_range_list = erratum_1463225, }, #endif #ifdef CONFIG_CAVIUM_TX2_ERRATUM_219 @@ -921,50 +612,135 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .cpu_enable = cpu_enable_trap_ctr_access, }, #endif -#ifdef CONFIG_ARM64_ERRATUM_1319367 +#ifdef CONFIG_ARM64_ERRATUM_1508412 { - .desc = "ARM erratum 1319367", - .capability = ARM64_WORKAROUND_1319367, - ERRATA_MIDR_RANGE_LIST(ca57_a72), + /* we depend on the firmware portion for correctness */ + .desc = "ARM erratum 1508412 (kernel portion)", + .capability = ARM64_WORKAROUND_1508412, + ERRATA_MIDR_RANGE(MIDR_CORTEX_A77, + 0, 0, + 1, 0), }, #endif +#ifdef CONFIG_NVIDIA_CARMEL_CNP_ERRATUM { - } -}; + /* NVIDIA Carmel */ + .desc = "NVIDIA Carmel CNP erratum", + .capability = ARM64_WORKAROUND_NVIDIA_CARMEL_CNP, + ERRATA_MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE + { + /* + * The erratum work around is handled within the TRBE + * driver and can be applied per-cpu. So, we can allow + * a late CPU to come online with this erratum. + */ + .desc = "ARM erratum 2119858 or 2139208", + .capability = ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + CAP_MIDR_RANGE_LIST(trbe_overwrite_fill_mode_cpus), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE + { + .desc = "ARM erratum 2067961 or 2054223", + .capability = ARM64_WORKAROUND_TSB_FLUSH_FAILURE, + ERRATA_MIDR_RANGE_LIST(tsb_flush_fail_cpus), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE + { + .desc = "ARM erratum 2253138 or 2224489", + .capability = ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2645198 + { + .desc = "ARM erratum 2645198", + .capability = ARM64_WORKAROUND_2645198, + ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A715) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2077057 + { + .desc = "ARM erratum 2077057", + .capability = ARM64_WORKAROUND_2077057, + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2064142 + { + .desc = "ARM erratum 2064142", + .capability = ARM64_WORKAROUND_2064142, -ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); -} + /* Cortex-A510 r0p0 - r0p2 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2457168 + { + .desc = "ARM erratum 2457168", + .capability = ARM64_WORKAROUND_2457168, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, -ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, - char *buf) -{ - switch (get_spectre_v2_workaround_state()) { - case ARM64_BP_HARDEN_NOT_REQUIRED: - return sprintf(buf, "Not affected\n"); - case ARM64_BP_HARDEN_WA_NEEDED: - return sprintf(buf, "Mitigation: Branch predictor hardening\n"); - case ARM64_BP_HARDEN_UNKNOWN: - default: - return sprintf(buf, "Vulnerable\n"); - } -} + /* Cortex-A510 r0p0-r1p1 */ + CAP_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2038923 + { + .desc = "ARM erratum 2038923", + .capability = ARM64_WORKAROUND_2038923, -ssize_t cpu_show_spec_store_bypass(struct device *dev, - struct device_attribute *attr, char *buf) -{ - if (__ssb_safe) - return sprintf(buf, "Not affected\n"); - - switch (ssbd_state) { - case ARM64_SSBD_KERNEL: - case ARM64_SSBD_FORCE_ENABLE: - if (IS_ENABLED(CONFIG_ARM64_SSBD)) - return sprintf(buf, - "Mitigation: Speculative Store Bypass disabled via prctl\n"); - } + /* Cortex-A510 r0p0 - r0p2 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1902691 + { + .desc = "ARM erratum 1902691", + .capability = ARM64_WORKAROUND_1902691, - return sprintf(buf, "Vulnerable\n"); -} + /* Cortex-A510 r0p0 - r0p1 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 1) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1742098 + { + .desc = "ARM erratum 1742098", + .capability = ARM64_WORKAROUND_1742098, + CAP_MIDR_RANGE_LIST(broken_aarch32_aes), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2658417 + { + .desc = "ARM erratum 2658417", + .capability = ARM64_WORKAROUND_2658417, + /* Cortex-A510 r0p0 - r1p1 */ + ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1), + MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD + { + .desc = "ARM errata 2966298, 3117295", + .capability = ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD, + /* Cortex-A520 r0p0 - r0p1 */ + ERRATA_MIDR_RANGE_LIST(erratum_spec_unpriv_load_list), + }, +#endif +#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 + { + .desc = "AmpereOne erratum AC03_CPU_38", + .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38, + ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1), + }, +#endif + { + } +}; diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index 7e07072757af..e133011f64b5 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -15,10 +15,12 @@ #include <asm/smp_plat.h> extern const struct cpu_operations smp_spin_table_ops; +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL extern const struct cpu_operations acpi_parking_protocol_ops; +#endif extern const struct cpu_operations cpu_psci_ops; -const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; +static const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; static const struct cpu_operations *const dt_supported_cpu_ops[] __initconst = { &smp_spin_table_ops, @@ -94,7 +96,7 @@ static const char *__init cpu_read_enable_method(int cpu) /* * Read a cpu's enable method and record it in cpu_ops. */ -int __init cpu_read_ops(int cpu) +int __init init_cpu_ops(int cpu) { const char *enable_method = cpu_read_enable_method(cpu); @@ -109,3 +111,8 @@ int __init cpu_read_ops(int cpu) return 0; } + +const struct cpu_operations *get_cpu_ops(int cpu) +{ + return cpu_ops[cpu]; +} diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 04cf64e9f0c9..8d1a634a403e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3,6 +3,61 @@ * Contains CPU feature definitions * * Copyright (C) 2015 ARM Ltd. + * + * A note for the weary kernel hacker: the code here is confusing and hard to + * follow! That's partly because it's solving a nasty problem, but also because + * there's a little bit of over-abstraction that tends to obscure what's going + * on behind a maze of helper functions and macros. + * + * The basic problem is that hardware folks have started gluing together CPUs + * with distinct architectural features; in some cases even creating SoCs where + * user-visible instructions are available only on a subset of the available + * cores. We try to address this by snapshotting the feature registers of the + * boot CPU and comparing these with the feature registers of each secondary + * CPU when bringing them up. If there is a mismatch, then we update the + * snapshot state to indicate the lowest-common denominator of the feature, + * known as the "safe" value. This snapshot state can be queried to view the + * "sanitised" value of a feature register. + * + * The sanitised register values are used to decide which capabilities we + * have in the system. These may be in the form of traditional "hwcaps" + * advertised to userspace or internal "cpucaps" which are used to configure + * things like alternative patching and static keys. While a feature mismatch + * may result in a TAINT_CPU_OUT_OF_SPEC kernel taint, a capability mismatch + * may prevent a CPU from being onlined at all. + * + * Some implementation details worth remembering: + * + * - Mismatched features are *always* sanitised to a "safe" value, which + * usually indicates that the feature is not supported. + * + * - A mismatched feature marked with FTR_STRICT will cause a "SANITY CHECK" + * warning when onlining an offending CPU and the kernel will be tainted + * with TAINT_CPU_OUT_OF_SPEC. + * + * - Features marked as FTR_VISIBLE have their sanitised value visible to + * userspace. FTR_VISIBLE features in registers that are only visible + * to EL0 by trapping *must* have a corresponding HWCAP so that late + * onlining of CPUs cannot lead to features disappearing at runtime. + * + * - A "feature" is typically a 4-bit register field. A "capability" is the + * high-level description derived from the sanitised field value. + * + * - Read the Arm ARM (DDI 0487F.a) section D13.1.3 ("Principles of the ID + * scheme for fields in ID registers") to understand when feature fields + * may be signed or unsigned (FTR_SIGNED and FTR_UNSIGNED accordingly). + * + * - KVM exposes its own view of the feature registers to guest operating + * systems regardless of FTR_VISIBLE. This is typically driven from the + * sanitised register values to allow virtual CPUs to be migrated between + * arbitrary physical CPUs, but some features not present on the host are + * also advertised and emulated. Look at sys_reg_descs[] for the gory + * details. + * + * - If the arm64_ftr_bits[] for a register has a missing field, then this + * field is treated as STRICT RES0, including for read_sanitised_ftr_reg(). + * This is stronger than FTR_HIDDEN and can be used to hide features from + * KVM guests. */ #define pr_fmt(fmt) "CPU features: " fmt @@ -10,79 +65,87 @@ #include <linux/bsearch.h> #include <linux/cpumask.h> #include <linux/crash_dump.h> +#include <linux/kstrtox.h> #include <linux/sort.h> #include <linux/stop_machine.h> +#include <linux/sysfs.h> #include <linux/types.h> +#include <linux/minmax.h> #include <linux/mm.h> #include <linux/cpu.h> +#include <linux/kasan.h> +#include <linux/percpu.h> + #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/cpu_ops.h> #include <asm/fpsimd.h> +#include <asm/hwcap.h> +#include <asm/insn.h> +#include <asm/kvm_host.h> #include <asm/mmu_context.h> +#include <asm/mte.h> #include <asm/processor.h> +#include <asm/smp.h> #include <asm/sysreg.h> #include <asm/traps.h> +#include <asm/vectors.h> #include <asm/virt.h> /* Kernel representation of AT_HWCAP and AT_HWCAP2 */ -static unsigned long elf_hwcap __read_mostly; +static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP_DEFAULT \ (COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\ COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\ - COMPAT_HWCAP_TLS|COMPAT_HWCAP_VFP|\ - COMPAT_HWCAP_VFPv3|COMPAT_HWCAP_VFPv4|\ - COMPAT_HWCAP_NEON|COMPAT_HWCAP_IDIV|\ + COMPAT_HWCAP_TLS|COMPAT_HWCAP_IDIV|\ COMPAT_HWCAP_LPAE) unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; unsigned int compat_elf_hwcap2 __read_mostly; #endif -DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); -EXPORT_SYMBOL(cpu_hwcaps); -static struct arm64_cpu_capabilities const __ro_after_init *cpu_hwcaps_ptrs[ARM64_NCAPS]; +DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); +EXPORT_SYMBOL(system_cpucaps); +static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NCAPS]; + +DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS); -/* Need also bit for ARM64_CB_PATCH */ -DECLARE_BITMAP(boot_capabilities, ARM64_NPATCHABLE); +bool arm64_use_ng_mappings = false; +EXPORT_SYMBOL(arm64_use_ng_mappings); + +DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; /* - * Flag to indicate if we have computed the system wide - * capabilities based on the boot time active CPUs. This - * will be used to determine if a new booting CPU should - * go through the verification process to make sure that it - * supports the system capabilities, without using a hotplug - * notifier. + * Permit PER_LINUX32 and execve() of 32-bit binaries even if not all CPUs + * support it? */ -static bool sys_caps_initialised; - -static inline void set_sys_caps_initialised(void) -{ - sys_caps_initialised = true; -} +static bool __read_mostly allow_mismatched_32bit_el0; -static int dump_cpu_hwcaps(struct notifier_block *self, unsigned long v, void *p) -{ - /* file-wide pr_fmt adds "CPU features: " prefix */ - pr_emerg("0x%*pb\n", ARM64_NCAPS, &cpu_hwcaps); - return 0; -} +/* + * Static branch enabled only if allow_mismatched_32bit_el0 is set and we have + * seen at least one CPU capable of 32-bit EL0. + */ +DEFINE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); -static struct notifier_block cpu_hwcaps_notifier = { - .notifier_call = dump_cpu_hwcaps -}; +/* + * Mask of CPUs supporting 32-bit EL0. + * Only valid if arm64_mismatched_32bit_el0 is enabled. + */ +static cpumask_var_t cpu_32bit_el0_mask __cpumask_var_read_mostly; -static int __init register_cpu_hwcaps_dumper(void) +void dump_cpu_features(void) { - atomic_notifier_chain_register(&panic_notifier_list, - &cpu_hwcaps_notifier); - return 0; + /* file-wide pr_fmt adds "CPU features: " prefix */ + pr_emerg("0x%*pb\n", ARM64_NCAPS, &system_cpucaps); } -__initcall(register_cpu_hwcaps_dumper); -DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS); -EXPORT_SYMBOL(cpu_hwcap_keys); +#define ARM64_CPUID_FIELDS(reg, field, min_value) \ + .sys_reg = SYS_##reg, \ + .field_pos = reg##_##field##_SHIFT, \ + .field_width = reg##_##field##_WIDTH, \ + .sign = reg##_##field##_SIGNED, \ + .min_field_value = reg##_##field##_##min_value, #define __ARM64_FTR_BITS(SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ @@ -108,88 +171,176 @@ EXPORT_SYMBOL(cpu_hwcap_keys); .width = 0, \ } -/* meta feature for alternatives */ -static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused); - static void cpu_enable_cnp(struct arm64_cpu_capabilities const *cap); +static bool __system_matches_cap(unsigned int n); + /* * NOTE: Any changes to the visibility of features should be kept in * sync with the documentation of the CPU feature register ABI. */ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM4_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RDM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_ATOMICS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_CRC32_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA1_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RNDR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TLB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_FHM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_DP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM4_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RDM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_ATOMIC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_CRC32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_SB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FRINTTS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SPECRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FRINTTS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_GPI_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPI_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_GPA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_LRCPC_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FCMA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FCMA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_JSCVT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_API_SHIFT, 4, 0), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_API_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_APA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_DPB_SHIFT, 4, 0), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_APA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CLRBHB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_MOPS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_GPA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AMU_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_MPAM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SEL2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_RAS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), - S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), - S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), - /* Linux doesn't care about the EL3 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SVE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_RAS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_GIC_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AdvSIMD_SHIFT, 4, ID_AA64PFR0_EL1_AdvSIMD_NI), + S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_FP_SHIFT, 4, ID_AA64PFR0_EL1_FP_NI), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL1_SHIFT, 4, ID_AA64PFR0_EL1_ELx_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL0_SHIFT, 4, ID_AA64PFR0_EL1_ELx_64BIT_ONLY), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_RAS_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_SHIFT, 4, ID_AA64PFR1_EL1_MTE_NI), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SSBS_SHIFT, 4, ID_AA64PFR1_EL1_SSBS_NI), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_BT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SM4_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SHA3_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_BITPERM_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_AES_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_B16B16_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SVEVER_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMEver_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F64F64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16B16_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F16_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_BI32I32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ECV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_FGT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_EXS_SHIFT, 4, 0), + /* + * Page size not being supported at Stage-2 is not fatal. You + * just give up KVM if PAGE_SIZE isn't supported there. Go fix + * your favourite nesting hypervisor. + * + * There is a small corner case where the hypervisor explicitly + * advertises a given granule size at Stage-2 (value 2) on some + * vCPUs, and uses the fallback to Stage-1 (value 0) for other + * vCPUs. Although this is not forbidden by the architecture, it + * indicates that the hypervisor is being silly (or buggy). + * + * We make no effort to cope with this and pretend that if these + * fields are inconsistent across vCPUs, then it isn't worth + * trying to bring KVM up. + */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN64_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT, 4, 1), /* * We already refuse to boot CPUs that don't support our configured * page size, so we can only detect mismatches for a page size other @@ -197,145 +348,268 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { * exist in the wild so, even though we don't like it, we'll have to go * along with it and treat them as non-strict. */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN4_SHIFT, 4, ID_AA64MMFR0_TGRAN4_NI), - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN64_SHIFT, 4, ID_AA64MMFR0_TGRAN64_NI), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN16_SHIFT, 4, ID_AA64MMFR0_TGRAN16_NI), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN4_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN4_NI), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN64_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN64_NI), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN16_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN16_NI), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_BIGENDEL0_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT, 4, 0), /* Linux shouldn't care about secure memory */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_SNSMEM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_BIGENDEL_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ASID_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_SNSMEM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGEND_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ASIDBITS_SHIFT, 4, 0), /* * Differing PARange is fine as long as all peripherals and memory are mapped * within the minimum PARange of all CPUs */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_PARANGE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_PARANGE_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_PAN_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_LOR_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_HPD_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_VHE_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_VMIDBITS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_HADBS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TIDCP1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HCX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ETS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TWED_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_XNX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64MMFR1_EL1_SpecSEI_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_PAN_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_LO_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HPDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VH_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VMIDBits_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LSM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_UAO_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_CNP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_E0PD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_EVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_BBM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_TTL_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_FWB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_AT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_ST_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_NV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CCIDX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_VARange_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IESB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_LSM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_UAO_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CnP_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_ctr[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 1), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IDC_SHIFT, 1, 1), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_CWG_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_ERG_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DMINLINE_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IDC_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_CWG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_ERG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DminLine_SHIFT, 4, 1), /* * Linux can handle differing I-cache policies. Userspace JITs will * make use of *minLine. * If we have differing I-cache policies, report it as the weakest - VIPT. */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, 14, 2, ICACHE_POLICY_VIPT), /* L1Ip */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IMINLINE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_EL0_L1Ip_SHIFT, 2, CTR_EL0_L1Ip_VIPT), /* L1Ip */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IminLine_SHIFT, 4, 0), ARM64_FTR_END, }; +static struct arm64_ftr_override __ro_after_init no_override = { }; + struct arm64_ftr_reg arm64_ftr_reg_ctrel0 = { .name = "SYS_CTR_EL0", - .ftr_bits = ftr_ctr + .ftr_bits = ftr_ctr, + .override = &no_override, }; static const struct arm64_ftr_bits ftr_id_mmfr0[] = { - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0xf), /* InnerShr */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0), /* FCSE */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 20, 4, 0), /* AuxReg */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), /* TCM */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* ShareLvl */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0xf), /* OuterShr */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* PMSA */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* VMSA */ + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_InnerShr_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_FCSE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_AuxReg_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_TCM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_ShareLvl_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_OuterShr_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_PMSA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_VMSA_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 36, 28, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_PMSVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_BRPS_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_DoubleLock_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_PMSVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_CTX_CMPs_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_WRPs_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_BRPs_SHIFT, 4, 0), /* * We can instantiate multiple PMU instances with different levels * of support. */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_PMUVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_TRACEVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_DEBUGVER_SHIFT, 4, 0x6), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_EL1_DebugVer_SHIFT, 4, 0x6), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_mvfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPRound_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPShVec_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPSqrt_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPDivide_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPTrap_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPDP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPSP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_SIMDReg_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_mvfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDFMAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPHP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDHP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDSP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDInt_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDLS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPDNaN_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPFtZ_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_mvfr2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* FPMisc */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* SIMDMisc */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_EL1_FPMisc_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_EL1_SIMDMisc_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_dczid[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 4, 1, 1), /* DZP */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* BS */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_EL0_DZP_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_EL0_BS_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_gmid[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, GMID_EL1_BS_SHIFT, 4, 0), ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_isar0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Divide_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Debug_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Coproc_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_CmpBranch_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_BitField_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_BitCount_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Swap_SHIFT, 4, 0), + ARM64_FTR_END, +}; static const struct arm64_ftr_bits ftr_id_isar5[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_RDM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_CRC32_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA1_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_AES_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SEVL_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_RDM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_CRC32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SHA2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SHA1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SEVL_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_mmfr4[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* ac2 */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_EVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_CCIDX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_LSM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_HPDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_CnP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_XNX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_AC2_SHIFT, 4, 0), + + /* + * SpecSEI = 1 indicates that the PE might generate an SError on an + * external abort on speculative read. It is safe to assume that an + * SError might be generated than it will not be. Hence it has been + * classified as FTR_HIGHER_SAFE. + */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_MMFR4_EL1_SpecSEI_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_isar4[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SWP_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_PSR_M_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SynchPrim_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Barrier_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SMC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Writeback_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_WithShifts_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Unpriv_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_mmfr5[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR5_EL1_ETS_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_isar6[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_I8MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_BF16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_SPECRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_SB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_FHM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_DP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_JSCVT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_pfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* State3 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), /* State2 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* State1 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* State0 */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_CSV2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State0_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_pfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_GIC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Virt_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Sec_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_GenTimer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Virtualization_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_MProgMod_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Security_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_ProgMod_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_pfr2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_EL1_SSBS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_EL1_CSV3_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0xf), /* PerfMon */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), + /* [31:28] TraceFilt */ + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MProfDbg_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapTrc_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopTrc_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapDbg_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopSDbg_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopDbg_SHIFT, 4, 0), ARM64_FTR_END, }; -static const struct arm64_ftr_bits ftr_zcr[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, - ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_SIZE, 0), /* LEN */ +static const struct arm64_ftr_bits ftr_id_dfr1[] = { + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR1_EL1_MTPMU_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -343,7 +617,7 @@ static const struct arm64_ftr_bits ftr_zcr[] = { * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of * 0. Covers the following 32bit registers: - * id_isar[0-4], id_mmfr[1-3], id_pfr1, mvfr[0-1] + * id_isar[1-3], id_mmfr[1-3] */ static const struct arm64_ftr_bits ftr_generic_32bits[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), @@ -367,13 +641,30 @@ static const struct arm64_ftr_bits ftr_raz[] = { ARM64_FTR_END, }; -#define ARM64_FTR_REG(id, table) { \ - .sys_id = id, \ - .reg = &(struct arm64_ftr_reg){ \ - .name = #id, \ - .ftr_bits = &((table)[0]), \ +#define __ARM64_FTR_REG_OVERRIDE(id_str, id, table, ovr) { \ + .sys_id = id, \ + .reg = &(struct arm64_ftr_reg){ \ + .name = id_str, \ + .override = (ovr), \ + .ftr_bits = &((table)[0]), \ }} +#define ARM64_FTR_REG_OVERRIDE(id, table, ovr) \ + __ARM64_FTR_REG_OVERRIDE(#id, id, table, ovr) + +#define ARM64_FTR_REG(id, table) \ + __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) + +struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; +struct arm64_ftr_override __ro_after_init id_aa64pfr0_override; +struct arm64_ftr_override __ro_after_init id_aa64pfr1_override; +struct arm64_ftr_override __ro_after_init id_aa64zfr0_override; +struct arm64_ftr_override __ro_after_init id_aa64smfr0_override; +struct arm64_ftr_override __ro_after_init id_aa64isar1_override; +struct arm64_ftr_override __ro_after_init id_aa64isar2_override; + +struct arm64_ftr_override arm64_sw_feature_override; + static const struct __ftr_reg_entry { u32 sys_id; struct arm64_ftr_reg *reg; @@ -381,7 +672,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 1 */ ARM64_FTR_REG(SYS_ID_PFR0_EL1, ftr_id_pfr0), - ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_id_pfr1), ARM64_FTR_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0), ARM64_FTR_REG(SYS_ID_MMFR0_EL1, ftr_id_mmfr0), ARM64_FTR_REG(SYS_ID_MMFR1_EL1, ftr_generic_32bits), @@ -389,23 +680,32 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_MMFR3_EL1, ftr_generic_32bits), /* Op1 = 0, CRn = 0, CRm = 2 */ - ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_id_isar0), ARM64_FTR_REG(SYS_ID_ISAR1_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR2_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR3_EL1, ftr_generic_32bits), - ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_id_isar4), ARM64_FTR_REG(SYS_ID_ISAR5_EL1, ftr_id_isar5), ARM64_FTR_REG(SYS_ID_MMFR4_EL1, ftr_id_mmfr4), + ARM64_FTR_REG(SYS_ID_ISAR6_EL1, ftr_id_isar6), /* Op1 = 0, CRn = 0, CRm = 3 */ - ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_generic_32bits), - ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_mvfr0), + ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_mvfr1), ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2), + ARM64_FTR_REG(SYS_ID_PFR2_EL1, ftr_id_pfr2), + ARM64_FTR_REG(SYS_ID_DFR1_EL1, ftr_id_dfr1), + ARM64_FTR_REG(SYS_ID_MMFR5_EL1, ftr_id_mmfr5), /* Op1 = 0, CRn = 0, CRm = 4 */ - ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0), - ARM64_FTR_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1), - ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0, + &id_aa64pfr0_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, + &id_aa64pfr1_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0, + &id_aa64zfr0_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0, + &id_aa64smfr0_override), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), @@ -413,15 +713,20 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 6 */ ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0), - ARM64_FTR_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1, + &id_aa64isar1_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2, + &id_aa64isar2_override), /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), - ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1, + &id_aa64mmfr1_override), ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), + ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), - /* Op1 = 0, CRn = 1, CRm = 2 */ - ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr), + /* Op1 = 1, CRn = 0, CRm = 0 */ + ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), /* Op1 = 3, CRn = 0, CRm = 0 */ { SYS_CTR_EL0, &arm64_ftr_reg_ctrel0 }, @@ -437,16 +742,16 @@ static int search_cmp_ftr_reg(const void *id, const void *regp) } /* - * get_arm64_ftr_reg - Lookup a feature register entry using its - * sys_reg() encoding. With the array arm64_ftr_regs sorted in the - * ascending order of sys_id , we use binary search to find a matching + * get_arm64_ftr_reg_nowarn - Looks up a feature register entry using + * its sys_reg() encoding. With the array arm64_ftr_regs sorted in the + * ascending order of sys_id, we use binary search to find a matching * entry. * * returns - Upon success, matching ftr_reg entry for id. * - NULL on failure. It is upto the caller to decide * the impact of a failure. */ -static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +static struct arm64_ftr_reg *get_arm64_ftr_reg_nowarn(u32 sys_id) { const struct __ftr_reg_entry *ret; @@ -460,6 +765,27 @@ static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) return NULL; } +/* + * get_arm64_ftr_reg - Looks up a feature register entry using + * its sys_reg() encoding. This calls get_arm64_ftr_reg_nowarn(). + * + * returns - Upon success, matching ftr_reg entry for id. + * - NULL on failure but with an WARN_ON(). + */ +struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +{ + struct arm64_ftr_reg *reg; + + reg = get_arm64_ftr_reg_nowarn(sys_id); + + /* + * Requesting a non-existent register search is an error. Warn + * and let the caller handle it. + */ + WARN_ON(!reg); + return reg; +} + static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, s64 ftr_val) { @@ -470,7 +796,7 @@ static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, return reg; } -static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, +s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { s64 ret = 0; @@ -480,14 +806,14 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, ret = ftrp->safe_val; break; case FTR_LOWER_SAFE: - ret = new < cur ? new : cur; + ret = min(new, cur); break; case FTR_HIGHER_OR_ZERO_SAFE: if (!cur || !new) break; - /* Fallthrough */ + fallthrough; case FTR_HIGHER_SAFE: - ret = new > cur ? new : cur; + ret = max(new, cur); break; default: BUG(); @@ -498,11 +824,52 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, static void __init sort_ftr_regs(void) { - int i; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(arm64_ftr_regs); i++) { + const struct arm64_ftr_reg *ftr_reg = arm64_ftr_regs[i].reg; + const struct arm64_ftr_bits *ftr_bits = ftr_reg->ftr_bits; + unsigned int j = 0; + + /* + * Features here must be sorted in descending order with respect + * to their shift values and should not overlap with each other. + */ + for (; ftr_bits->width != 0; ftr_bits++, j++) { + unsigned int width = ftr_reg->ftr_bits[j].width; + unsigned int shift = ftr_reg->ftr_bits[j].shift; + unsigned int prev_shift; + + WARN((shift + width) > 64, + "%s has invalid feature at shift %d\n", + ftr_reg->name, shift); + + /* + * Skip the first feature. There is nothing to + * compare against for now. + */ + if (j == 0) + continue; - /* Check that the array is sorted so that we can do the binary search */ - for (i = 1; i < ARRAY_SIZE(arm64_ftr_regs); i++) - BUG_ON(arm64_ftr_regs[i].sys_id < arm64_ftr_regs[i - 1].sys_id); + prev_shift = ftr_reg->ftr_bits[j - 1].shift; + WARN((shift + width) > prev_shift, + "%s has feature overlap at shift %d\n", + ftr_reg->name, shift); + } + + /* + * Skip the first register. There is nothing to + * compare against for now. + */ + if (i == 0) + continue; + /* + * Registers here must be sorted in ascending order with respect + * to sys_id for subsequent binary search in get_arm64_ftr_reg() + * to work correctly. + */ + BUG_ON(arm64_ftr_regs[i].sys_id <= arm64_ftr_regs[i - 1].sys_id); + } } /* @@ -511,7 +878,7 @@ static void __init sort_ftr_regs(void) * Any bits that are not covered by an arm64_ftr_bits entry are considered * RES0 for the system-wide value, and must strictly match. */ -static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) +static void init_cpu_ftr_reg(u32 sys_reg, u64 new) { u64 val = 0; u64 strict_mask = ~0x0ULL; @@ -521,11 +888,45 @@ static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) const struct arm64_ftr_bits *ftrp; struct arm64_ftr_reg *reg = get_arm64_ftr_reg(sys_reg); - BUG_ON(!reg); + if (!reg) + return; - for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { + for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { u64 ftr_mask = arm64_ftr_mask(ftrp); s64 ftr_new = arm64_ftr_value(ftrp, new); + s64 ftr_ovr = arm64_ftr_value(ftrp, reg->override->val); + + if ((ftr_mask & reg->override->mask) == ftr_mask) { + s64 tmp = arm64_ftr_safe_value(ftrp, ftr_ovr, ftr_new); + char *str = NULL; + + if (ftr_ovr != tmp) { + /* Unsafe, remove the override */ + reg->override->mask &= ~ftr_mask; + reg->override->val &= ~ftr_mask; + tmp = ftr_ovr; + str = "ignoring override"; + } else if (ftr_new != tmp) { + /* Override was valid */ + ftr_new = tmp; + str = "forced"; + } else if (ftr_ovr == tmp) { + /* Override was the safe value */ + str = "already set"; + } + + if (str) + pr_warn("%s[%d:%d]: %s to %llx\n", + reg->name, + ftrp->shift + ftrp->width - 1, + ftrp->shift, str, tmp); + } else if ((ftr_mask & reg->override->val) == ftr_mask) { + reg->override->val &= ~ftr_mask; + pr_warn("%s[%d:%d]: impossible override, ignored\n", + reg->name, + ftrp->shift + ftrp->width - 1, + ftrp->shift); + } val = arm64_ftr_set_value(ftrp, val, ftr_new); @@ -551,28 +952,84 @@ extern const struct arm64_cpu_capabilities arm64_errata[]; static const struct arm64_cpu_capabilities arm64_features[]; static void __init -init_cpu_hwcaps_indirect_list_from_array(const struct arm64_cpu_capabilities *caps) +init_cpucap_indirect_list_from_array(const struct arm64_cpu_capabilities *caps) { for (; caps->matches; caps++) { if (WARN(caps->capability >= ARM64_NCAPS, "Invalid capability %d\n", caps->capability)) continue; - if (WARN(cpu_hwcaps_ptrs[caps->capability], + if (WARN(cpucap_ptrs[caps->capability], "Duplicate entry for capability %d\n", caps->capability)) continue; - cpu_hwcaps_ptrs[caps->capability] = caps; + cpucap_ptrs[caps->capability] = caps; } } -static void __init init_cpu_hwcaps_indirect_list(void) +static void __init init_cpucap_indirect_list(void) { - init_cpu_hwcaps_indirect_list_from_array(arm64_features); - init_cpu_hwcaps_indirect_list_from_array(arm64_errata); + init_cpucap_indirect_list_from_array(arm64_features); + init_cpucap_indirect_list_from_array(arm64_errata); } static void __init setup_boot_cpu_capabilities(void); +static void init_32bit_cpu_features(struct cpuinfo_32bit *info) +{ + init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); + init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1); + init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); + init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); + init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); + init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); + init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); + init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); + init_cpu_ftr_reg(SYS_ID_ISAR6_EL1, info->reg_id_isar6); + init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); + init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); + init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); + init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); + init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4); + init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5); + init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); + init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); + init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2); + init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); + init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); + init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); +} + +#ifdef CONFIG_ARM64_PSEUDO_NMI +static bool enable_pseudo_nmi; + +static int __init early_enable_pseudo_nmi(char *p) +{ + return kstrtobool(p, &enable_pseudo_nmi); +} +early_param("irqchip.gicv3_pseudo_nmi", early_enable_pseudo_nmi); + +static __init void detect_system_supports_pseudo_nmi(void) +{ + struct device_node *np; + + if (!enable_pseudo_nmi) + return; + + /* + * Detect broken MediaTek firmware that doesn't properly save and + * restore GIC priorities. + */ + np = of_find_compatible_node(NULL, NULL, "arm,gic-v3"); + if (np && of_property_read_bool(np, "mediatek,broken-save-restore-fw")) { + pr_info("Pseudo-NMI disabled due to MediaTek Chromebook GICR save problem\n"); + enable_pseudo_nmi = false; + } + of_node_put(np); +} +#else /* CONFIG_ARM64_PSEUDO_NMI */ +static inline void detect_system_supports_pseudo_nmi(void) { } +#endif + void __init init_cpu_features(struct cpuinfo_arm64 *info) { /* Before we start using the tables, make sure it is sorted */ @@ -585,48 +1042,45 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64DFR1_EL1, info->reg_id_aa64dfr1); init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0); init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); + init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); + init_cpu_ftr_reg(SYS_ID_AA64MMFR3_EL1, info->reg_id_aa64mmfr3); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); + init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0); - if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); - init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); - init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); - init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); - init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); - init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); - init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); - init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); - init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); - init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); - init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); - init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); - init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); - init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); - init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); - init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); - } + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) + init_32bit_cpu_features(&info->aarch32); + + if (IS_ENABLED(CONFIG_ARM64_SVE) && + id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + unsigned long cpacr = cpacr_save_enable_kernel_sve(); - if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) { - init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr); - sve_init_vq_map(); + vec_init_vq_map(ARM64_VEC_SVE); + + cpacr_restore(cpacr); } - /* - * Initialize the indirect array of CPU hwcaps capabilities pointers - * before we handle the boot CPU below. - */ - init_cpu_hwcaps_indirect_list(); + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { + unsigned long cpacr = cpacr_save_enable_kernel_sme(); - /* - * Detect and enable early CPU capabilities based on the boot CPU, - * after we have initialised the CPU feature infrastructure. - */ - setup_boot_cpu_capabilities(); + /* + * We mask out SMPS since even if the hardware + * supports priorities the kernel does not at present + * and we block access to them. + */ + info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; + vec_init_vq_map(ARM64_VEC_SME); + + cpacr_restore(cpacr); + } + + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) + init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); } static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new) @@ -650,7 +1104,9 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); - BUG_ON(!regp); + if (!regp) + return 0; + update_cpu_ftr_reg(regp, val); if ((boot & regp->strict_mask) == (val & regp->strict_mask)) return 0; @@ -659,6 +1115,112 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) return 1; } +static void relax_cpu_ftr_reg(u32 sys_id, int field) +{ + const struct arm64_ftr_bits *ftrp; + struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); + + if (!regp) + return; + + for (ftrp = regp->ftr_bits; ftrp->width; ftrp++) { + if (ftrp->shift == field) { + regp->strict_mask &= ~arm64_ftr_mask(ftrp); + break; + } + } + + /* Bogus field? */ + WARN_ON(!ftrp->width); +} + +static void lazy_init_32bit_cpu_features(struct cpuinfo_arm64 *info, + struct cpuinfo_arm64 *boot) +{ + static bool boot_cpu_32bit_regs_overridden = false; + + if (!allow_mismatched_32bit_el0 || boot_cpu_32bit_regs_overridden) + return; + + if (id_aa64pfr0_32bit_el0(boot->reg_id_aa64pfr0)) + return; + + boot->aarch32 = info->aarch32; + init_32bit_cpu_features(&boot->aarch32); + boot_cpu_32bit_regs_overridden = true; +} + +static int update_32bit_cpu_features(int cpu, struct cpuinfo_32bit *info, + struct cpuinfo_32bit *boot) +{ + int taint = 0; + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + /* + * If we don't have AArch32 at EL1, then relax the strictness of + * EL1-dependent register fields to avoid spurious sanity check fails. + */ + if (!id_aa64pfr0_32bit_el1(pfr0)) { + relax_cpu_ftr_reg(SYS_ID_ISAR4_EL1, ID_ISAR4_EL1_SMC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Virt_frac_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Sec_frac_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Virtualization_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Security_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_ProgMod_SHIFT); + } + + taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, + info->reg_id_dfr0, boot->reg_id_dfr0); + taint |= check_update_ftr_reg(SYS_ID_DFR1_EL1, cpu, + info->reg_id_dfr1, boot->reg_id_dfr1); + taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, + info->reg_id_isar0, boot->reg_id_isar0); + taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, + info->reg_id_isar1, boot->reg_id_isar1); + taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, + info->reg_id_isar2, boot->reg_id_isar2); + taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, + info->reg_id_isar3, boot->reg_id_isar3); + taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, + info->reg_id_isar4, boot->reg_id_isar4); + taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, + info->reg_id_isar5, boot->reg_id_isar5); + taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu, + info->reg_id_isar6, boot->reg_id_isar6); + + /* + * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and + * ACTLR formats could differ across CPUs and therefore would have to + * be trapped for virtualization anyway. + */ + taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, + info->reg_id_mmfr0, boot->reg_id_mmfr0); + taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, + info->reg_id_mmfr1, boot->reg_id_mmfr1); + taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, + info->reg_id_mmfr2, boot->reg_id_mmfr2); + taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, + info->reg_id_mmfr3, boot->reg_id_mmfr3); + taint |= check_update_ftr_reg(SYS_ID_MMFR4_EL1, cpu, + info->reg_id_mmfr4, boot->reg_id_mmfr4); + taint |= check_update_ftr_reg(SYS_ID_MMFR5_EL1, cpu, + info->reg_id_mmfr5, boot->reg_id_mmfr5); + taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, + info->reg_id_pfr0, boot->reg_id_pfr0); + taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, + info->reg_id_pfr1, boot->reg_id_pfr1); + taint |= check_update_ftr_reg(SYS_ID_PFR2_EL1, cpu, + info->reg_id_pfr2, boot->reg_id_pfr2); + taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, + info->reg_mvfr0, boot->reg_mvfr0); + taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, + info->reg_mvfr1, boot->reg_mvfr1); + taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, + info->reg_mvfr2, boot->reg_mvfr2); + + return taint; +} + /* * Update system wide CPU feature registers with the values from a * non-boot CPU. Also performs SANITY checks to make sure that there @@ -708,6 +1270,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64isar0, boot->reg_id_aa64isar0); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR1_EL1, cpu, info->reg_id_aa64isar1, boot->reg_id_aa64isar1); + taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu, + info->reg_id_aa64isar2, boot->reg_id_aa64isar2); /* * Differing PARange support is fine as long as all peripherals and @@ -720,10 +1284,9 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR3_EL1, cpu, + info->reg_id_aa64mmfr3, boot->reg_id_aa64mmfr3); - /* - * EL3 is not our concern. - */ taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu, @@ -732,61 +1295,62 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); - /* - * If we have AArch32, we care about 32-bit features for compat. - * If the system doesn't support AArch32, don't update them. - */ - if (id_aa64pfr0_32bit_el0(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1)) && - id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - - taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, - info->reg_id_dfr0, boot->reg_id_dfr0); - taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, - info->reg_id_isar0, boot->reg_id_isar0); - taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, - info->reg_id_isar1, boot->reg_id_isar1); - taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, - info->reg_id_isar2, boot->reg_id_isar2); - taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, - info->reg_id_isar3, boot->reg_id_isar3); - taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, - info->reg_id_isar4, boot->reg_id_isar4); - taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, - info->reg_id_isar5, boot->reg_id_isar5); + taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu, + info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0); + + /* Probe vector lengths */ + if (IS_ENABLED(CONFIG_ARM64_SVE) && + id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + if (!system_capabilities_finalized()) { + unsigned long cpacr = cpacr_save_enable_kernel_sve(); + + vec_update_vq_map(ARM64_VEC_SVE); + + cpacr_restore(cpacr); + } + } + + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { + unsigned long cpacr = cpacr_save_enable_kernel_sme(); /* - * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and - * ACTLR formats could differ across CPUs and therefore would have to - * be trapped for virtualization anyway. + * We mask out SMPS since even if the hardware + * supports priorities the kernel does not at present + * and we block access to them. */ - taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, - info->reg_id_mmfr0, boot->reg_id_mmfr0); - taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, - info->reg_id_mmfr1, boot->reg_id_mmfr1); - taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, - info->reg_id_mmfr2, boot->reg_id_mmfr2); - taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, - info->reg_id_mmfr3, boot->reg_id_mmfr3); - taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, - info->reg_id_pfr0, boot->reg_id_pfr0); - taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, - info->reg_id_pfr1, boot->reg_id_pfr1); - taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, - info->reg_mvfr0, boot->reg_mvfr0); - taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, - info->reg_mvfr1, boot->reg_mvfr1); - taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, - info->reg_mvfr2, boot->reg_mvfr2); + info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; + + /* Probe vector lengths */ + if (!system_capabilities_finalized()) + vec_update_vq_map(ARM64_VEC_SME); + + cpacr_restore(cpacr); } - if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) { - taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu, - info->reg_zcr, boot->reg_zcr); + /* + * The kernel uses the LDGM/STGM instructions and the number of tags + * they read/write depends on the GMID_EL1.BS field. Check that the + * value is the same on all CPUs. + */ + if (IS_ENABLED(CONFIG_ARM64_MTE) && + id_aa64pfr1_mte(info->reg_id_aa64pfr1)) { + taint |= check_update_ftr_reg(SYS_GMID_EL1, cpu, + info->reg_gmid, boot->reg_gmid); + } - /* Probe vector lengths, unless we already gave up on SVE */ - if (id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1)) && - !sys_caps_initialised) - sve_update_vq_map(); + /* + * If we don't have AArch32 at all then skip the checks entirely + * as the register values may be UNKNOWN and we're not going to be + * using them for anything. + * + * This relies on a sanitised view of the AArch64 ID registers + * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last. + */ + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + lazy_init_32bit_cpu_features(info, boot); + taint |= update_32bit_cpu_features(cpu, &info->aarch32, + &boot->aarch32); } /* @@ -803,34 +1367,43 @@ u64 read_sanitised_ftr_reg(u32 id) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(id); - /* We shouldn't get a request for an unsupported register */ - BUG_ON(!regp); + if (!regp) + return 0; return regp->sys_val; } +EXPORT_SYMBOL_GPL(read_sanitised_ftr_reg); #define read_sysreg_case(r) \ - case r: return read_sysreg_s(r) + case r: val = read_sysreg_s(r); break; /* * __read_sysreg_by_encoding() - Used by a STARTING cpu before cpuinfo is populated. * Read the system register on the current CPU */ -static u64 __read_sysreg_by_encoding(u32 sys_id) +u64 __read_sysreg_by_encoding(u32 sys_id) { + struct arm64_ftr_reg *regp; + u64 val; + switch (sys_id) { read_sysreg_case(SYS_ID_PFR0_EL1); read_sysreg_case(SYS_ID_PFR1_EL1); + read_sysreg_case(SYS_ID_PFR2_EL1); read_sysreg_case(SYS_ID_DFR0_EL1); + read_sysreg_case(SYS_ID_DFR1_EL1); read_sysreg_case(SYS_ID_MMFR0_EL1); read_sysreg_case(SYS_ID_MMFR1_EL1); read_sysreg_case(SYS_ID_MMFR2_EL1); read_sysreg_case(SYS_ID_MMFR3_EL1); + read_sysreg_case(SYS_ID_MMFR4_EL1); + read_sysreg_case(SYS_ID_MMFR5_EL1); read_sysreg_case(SYS_ID_ISAR0_EL1); read_sysreg_case(SYS_ID_ISAR1_EL1); read_sysreg_case(SYS_ID_ISAR2_EL1); read_sysreg_case(SYS_ID_ISAR3_EL1); read_sysreg_case(SYS_ID_ISAR4_EL1); read_sysreg_case(SYS_ID_ISAR5_EL1); + read_sysreg_case(SYS_ID_ISAR6_EL1); read_sysreg_case(SYS_MVFR0_EL1); read_sysreg_case(SYS_MVFR1_EL1); read_sysreg_case(SYS_MVFR2_EL1); @@ -838,13 +1411,16 @@ static u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64PFR0_EL1); read_sysreg_case(SYS_ID_AA64PFR1_EL1); read_sysreg_case(SYS_ID_AA64ZFR0_EL1); + read_sysreg_case(SYS_ID_AA64SMFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR0_EL1); read_sysreg_case(SYS_ID_AA64MMFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR2_EL1); + read_sysreg_case(SYS_ID_AA64MMFR3_EL1); read_sysreg_case(SYS_ID_AA64ISAR0_EL1); read_sysreg_case(SYS_ID_AA64ISAR1_EL1); + read_sysreg_case(SYS_ID_AA64ISAR2_EL1); read_sysreg_case(SYS_CNTFRQ_EL0); read_sysreg_case(SYS_CTR_EL0); @@ -854,32 +1430,126 @@ static u64 __read_sysreg_by_encoding(u32 sys_id) BUG(); return 0; } + + regp = get_arm64_ftr_reg(sys_id); + if (regp) { + val &= ~regp->override->mask; + val |= (regp->override->val & regp->override->mask); + } + + return val; } #include <linux/irqchip/arm-gic-v3.h> static bool +has_always(const struct arm64_cpu_capabilities *entry, int scope) +{ + return true; +} + +static bool feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) { - int val = cpuid_feature_extract_field(reg, entry->field_pos, entry->sign); + int val = cpuid_feature_extract_field_width(reg, entry->field_pos, + entry->field_width, + entry->sign); return val >= entry->min_field_value; } -static bool -has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) +static u64 +read_scoped_sysreg(const struct arm64_cpu_capabilities *entry, int scope) { - u64 val; - WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); if (scope == SCOPE_SYSTEM) - val = read_sanitised_ftr_reg(entry->sys_reg); + return read_sanitised_ftr_reg(entry->sys_reg); else - val = __read_sysreg_by_encoding(entry->sys_reg); + return __read_sysreg_by_encoding(entry->sys_reg); +} + +static bool +has_user_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) +{ + int mask; + struct arm64_ftr_reg *regp; + u64 val = read_scoped_sysreg(entry, scope); + + regp = get_arm64_ftr_reg(entry->sys_reg); + if (!regp) + return false; + + mask = cpuid_feature_extract_unsigned_field_width(regp->user_mask, + entry->field_pos, + entry->field_width); + if (!mask) + return false; return feature_matches(val, entry); } +static bool +has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 val = read_scoped_sysreg(entry, scope); + return feature_matches(val, entry); +} + +const struct cpumask *system_32bit_el0_cpumask(void) +{ + if (!system_supports_32bit_el0()) + return cpu_none_mask; + + if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) + return cpu_32bit_el0_mask; + + return cpu_possible_mask; +} + +static int __init parse_32bit_el0_param(char *str) +{ + allow_mismatched_32bit_el0 = true; + return 0; +} +early_param("allow_mismatched_32bit_el0", parse_32bit_el0_param); + +static ssize_t aarch32_el0_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct cpumask *mask = system_32bit_el0_cpumask(); + + return sysfs_emit(buf, "%*pbl\n", cpumask_pr_args(mask)); +} +static const DEVICE_ATTR_RO(aarch32_el0); + +static int __init aarch32_el0_sysfs_init(void) +{ + struct device *dev_root; + int ret = 0; + + if (!allow_mismatched_32bit_el0) + return 0; + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + ret = device_create_file(dev_root, &dev_attr_aarch32_el0); + put_device(dev_root); + } + return ret; +} +device_initcall(aarch32_el0_sysfs_init); + +static bool has_32bit_el0(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return allow_mismatched_32bit_el0; + + if (scope == SCOPE_SYSTEM) + pr_info("detected: 32-bit EL0 Support\n"); + + return true; +} + static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, int scope) { bool has_sre; @@ -895,24 +1565,6 @@ static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, return has_sre; } -static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry, int __unused) -{ - u32 midr = read_cpuid_id(); - - /* Cavium ThunderX pass 1.x and 2.x */ - return midr_is_cpu_model_range(midr, MIDR_THUNDERX, - MIDR_CPU_VAR_REV(0, 0), - MIDR_CPU_VAR_REV(1, MIDR_REVISION_MASK)); -} - -static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused) -{ - u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - - return cpuid_feature_extract_signed_field(pfr0, - ID_AA64PFR0_FP_SHIFT) < 0; -} - static bool has_cache_idc(const struct arm64_cpu_capabilities *entry, int scope) { @@ -923,7 +1575,7 @@ static bool has_cache_idc(const struct arm64_cpu_capabilities *entry, else ctr = read_cpuid_effective_cachetype(); - return ctr & BIT(CTR_IDC_SHIFT); + return ctr & BIT(CTR_EL0_IDC_SHIFT); } static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unused) @@ -934,7 +1586,7 @@ static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unu * to the CTR_EL0 on this CPU and emulate it with the real/safe * value. */ - if (!(read_cpuid_cachetype() & BIT(CTR_IDC_SHIFT))) + if (!(read_cpuid_cachetype() & BIT(CTR_EL0_IDC_SHIFT))) sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } @@ -948,7 +1600,7 @@ static bool has_cache_dic(const struct arm64_cpu_capabilities *entry, else ctr = read_cpuid_cachetype(); - return ctr & BIT(CTR_DIC_SHIFT); + return ctr & BIT(CTR_EL0_DIC_SHIFT); } static bool __maybe_unused @@ -959,12 +1611,55 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) * may share TLB entries with a CPU stuck in the crashed * kernel. */ - if (is_kdump_kernel()) + if (is_kdump_kernel()) + return false; + + if (cpus_have_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP)) return false; return has_cpuid_feature(entry, scope); } +/* + * This check is triggered during the early boot before the cpufeature + * is initialised. Checking the status on the local CPU allows the boot + * CPU to detect the need for non-global mappings and thus avoiding a + * pagetable re-write after all the CPUs are booted. This check will be + * anyway run on individual CPUs, allowing us to get the consistent + * state once the SMP CPUs are up and thus make the switch to non-global + * mappings if required. + */ +bool kaslr_requires_kpti(void) +{ + if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return false; + + /* + * E0PD does a similar job to KPTI so can be used instead + * where available. + */ + if (IS_ENABLED(CONFIG_ARM64_E0PD)) { + u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1); + if (cpuid_feature_extract_unsigned_field(mmfr2, + ID_AA64MMFR2_EL1_E0PD_SHIFT)) + return false; + } + + /* + * Systems affected by Cavium erratum 24756 are incompatible + * with KPTI. + */ + if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) { + extern const struct midr_range cavium_erratum_27456_cpus[]; + + if (is_midr_in_range_list(read_cpuid_id(), + cavium_erratum_27456_cpus)) + return false; + } + + return kaslr_enabled(); +} + static bool __meltdown_safe = true; static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ @@ -975,6 +1670,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, static const struct midr_range kpti_safe_list[] = { MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), @@ -983,6 +1679,10 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_GOLD), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), { /* sentinel */ } }; char const *str = "kpti command line option"; @@ -1000,15 +1700,19 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, /* * For reasons that aren't entirely clear, enabling KPTI on Cavium * ThunderX leads to apparent I-cache corruption of kernel text, which - * ends as well as you might imagine. Don't even try. + * ends as well as you might imagine. Don't even try. We cannot rely + * on the cpus_have_*cap() helpers here to detect the CPU erratum + * because cpucap detection order may change. However, since we know + * affected CPUs are always in a homogeneous configuration, it is + * safe to rely on this_cpu_has_cap() here. */ - if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_27456)) { + if (this_cpu_has_cap(ARM64_WORKAROUND_CAVIUM_27456)) { str = "ARM64_WORKAROUND_CAVIUM_27456"; __kpti_forced = -1; } /* Useful for KASLR robustness */ - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset() > 0) { + if (kaslr_requires_kpti()) { if (!__kpti_forced) { str = "KASLR"; __kpti_forced = 1; @@ -1035,47 +1739,145 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, return !meltdown_safe; } +#if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) +static bool has_lpa2_at_stage1(u64 mmfr0) +{ + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_LPA2; +} + +static bool has_lpa2_at_stage2(u64 mmfr0) +{ + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_2_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2; +} + +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 mmfr0; + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + return has_lpa2_at_stage1(mmfr0) && has_lpa2_at_stage2(mmfr0); +} +#else +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + return false; +} +#endif + #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -static void -kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) +#define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) + +extern +void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, + phys_addr_t size, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), int flags); + +static phys_addr_t __initdata kpti_ng_temp_alloc; + +static phys_addr_t __init kpti_ng_pgd_alloc(int shift) +{ + kpti_ng_temp_alloc -= PAGE_SIZE; + return kpti_ng_temp_alloc; +} + +static int __init __kpti_install_ng_mappings(void *__unused) { - typedef void (kpti_remap_fn)(int, int, phys_addr_t); + typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long); extern kpti_remap_fn idmap_kpti_install_ng_mappings; kpti_remap_fn *remap_fn; - static bool kpti_applied = false; int cpu = smp_processor_id(); + int levels = CONFIG_PGTABLE_LEVELS; + int order = order_base_2(levels); + u64 kpti_ng_temp_pgd_pa = 0; + pgd_t *kpti_ng_temp_pgd; + u64 alloc = 0; + + remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); + + if (!cpu) { + alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); + kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE); + kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd); + + // + // Create a minimal page table hierarchy that permits us to map + // the swapper page tables temporarily as we traverse them. + // + // The physical pages are laid out as follows: + // + // +--------+-/-------+-/------ +-\\--------+ + // : PTE[] : | PMD[] : | PUD[] : || PGD[] : + // +--------+-\-------+-\------ +-//--------+ + // ^ + // The first page is mapped into this hierarchy at a PMD_SHIFT + // aligned virtual address, so that we can manipulate the PTE + // level entries while the mapping is active. The first entry + // covers the PTE[] page itself, the remaining entries are free + // to be used as a ad-hoc fixmap. + // + create_kpti_ng_temp_pgd(kpti_ng_temp_pgd, __pa(alloc), + KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL, + kpti_ng_pgd_alloc, 0); + } + + cpu_install_idmap(); + remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA); + cpu_uninstall_idmap(); + + if (!cpu) { + free_pages(alloc, order); + arm64_use_ng_mappings = true; + } + + return 0; +} + +static void __init kpti_install_ng_mappings(void) +{ + /* Check whether KPTI is going to be used */ + if (!arm64_kernel_unmapped_at_el0()) + return; /* * We don't need to rewrite the page-tables if either we've done * it already or we have KASLR enabled and therefore have not * created any global mappings at all. */ - if (kpti_applied || kaslr_offset() > 0) + if (arm64_use_ng_mappings) return; - remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); - - cpu_install_idmap(); - remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir)); - cpu_uninstall_idmap(); - - if (!cpu) - kpti_applied = true; - - return; + stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask); } + #else -static void -kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) +static inline void kpti_install_ng_mappings(void) { } #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ +static void cpu_enable_kpti(struct arm64_cpu_capabilities const *cap) +{ + if (__this_cpu_read(this_cpu_vector) == vectors) { + const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI); + + __this_cpu_write(this_cpu_vector, v); + } + +} + static int __init parse_kpti(char *str) { bool enabled; - int ret = strtobool(str, &enabled); + int ret = kstrtobool(str, &enabled); if (ret) return ret; @@ -1086,12 +1888,15 @@ static int __init parse_kpti(char *str) early_param("kpti", parse_kpti); #ifdef CONFIG_ARM64_HW_AFDBM +static struct cpumask dbm_cpus __read_mostly; + static inline void __cpu_enable_hw_dbm(void) { u64 tcr = read_sysreg(tcr_el1) | TCR_HD; write_sysreg(tcr, tcr_el1); isb(); + local_flush_tlb_all(); } static bool cpu_has_broken_dbm(void) @@ -1099,7 +1904,12 @@ static bool cpu_has_broken_dbm(void) /* List of CPUs which have broken DBM support. */ static const struct midr_range cpus[] = { #ifdef CONFIG_ARM64_ERRATUM_1024718 - MIDR_RANGE(MIDR_CORTEX_A55, 0, 0, 1, 0), // A55 r0p0 -r1p0 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + /* Kryo4xx Silver (rdpe => r1p0) */ + MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2051678 + MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), #endif {}, }; @@ -1115,42 +1925,87 @@ static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap) static void cpu_enable_hw_dbm(struct arm64_cpu_capabilities const *cap) { - if (cpu_can_use_dbm(cap)) + if (cpu_can_use_dbm(cap)) { __cpu_enable_hw_dbm(); + cpumask_set_cpu(smp_processor_id(), &dbm_cpus); + } } static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap, int __unused) { - static bool detected = false; /* * DBM is a non-conflicting feature. i.e, the kernel can safely * run a mix of CPUs with and without the feature. So, we * unconditionally enable the capability to allow any late CPU * to use the feature. We only enable the control bits on the - * CPU, if it actually supports. - * - * We have to make sure we print the "feature" detection only - * when at least one CPU actually uses it. So check if this CPU - * can actually use it and print the message exactly once. - * - * This is safe as all CPUs (including secondary CPUs - due to the - * LOCAL_CPU scope - and the hotplugged CPUs - via verification) - * goes through the "matches" check exactly once. Also if a CPU - * matches the criteria, it is guaranteed that the CPU will turn - * the DBM on, as the capability is unconditionally enabled. + * CPU, if it is supported. */ - if (!detected && cpu_can_use_dbm(cap)) { - detected = true; - pr_info("detected: Hardware dirty bit management\n"); - } return true; } #endif -#ifdef CONFIG_ARM64_VHE +#ifdef CONFIG_ARM64_AMU_EXTN + +/* + * The "amu_cpus" cpumask only signals that the CPU implementation for the + * flagged CPUs supports the Activity Monitors Unit (AMU) but does not provide + * information regarding all the events that it supports. When a CPU bit is + * set in the cpumask, the user of this feature can only rely on the presence + * of the 4 fixed counters for that CPU. But this does not guarantee that the + * counters are enabled or access to these counters is enabled by code + * executed at higher exception levels (firmware). + */ +static struct cpumask amu_cpus __read_mostly; + +bool cpu_has_amu_feat(int cpu) +{ + return cpumask_test_cpu(cpu, &amu_cpus); +} + +int get_cpu_with_amu_feat(void) +{ + return cpumask_any(&amu_cpus); +} + +static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap) +{ + if (has_cpuid_feature(cap, SCOPE_LOCAL_CPU)) { + cpumask_set_cpu(smp_processor_id(), &amu_cpus); + + /* 0 reference values signal broken/disabled counters */ + if (!this_cpu_has_cap(ARM64_WORKAROUND_2457168)) + update_freq_counters_refs(); + } +} + +static bool has_amu(const struct arm64_cpu_capabilities *cap, + int __unused) +{ + /* + * The AMU extension is a non-conflicting feature: the kernel can + * safely run a mix of CPUs with and without support for the + * activity monitors extension. Therefore, unconditionally enable + * the capability to allow any late CPU to use the feature. + * + * With this feature unconditionally enabled, the cpu_enable + * function will be called for all CPUs that match the criteria, + * including secondary and hotplugged, marking this feature as + * present on that respective CPU. The enable function will also + * print a detection message. + */ + + return true; +} +#else +int get_cpu_with_amu_feat(void) +{ + return nr_cpu_ids; +} +#endif + static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused) { return is_kernel_in_hyp_mode(); @@ -1169,57 +2024,33 @@ static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused) if (!alternative_is_applied(ARM64_HAS_VIRT_HOST_EXTN)) write_sysreg(read_sysreg(tpidr_el1), tpidr_el2); } -#endif - -static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused) -{ - u64 val = read_sysreg_s(SYS_CLIDR_EL1); - - /* Check that CLIDR_EL1.LOU{U,IS} are both 0 */ - WARN_ON(val & (7 << 27 | 7 << 21)); -} -#ifdef CONFIG_ARM64_SSBD -static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr) +static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, + int scope) { - if (user_mode(regs)) - return 1; + if (kvm_get_mode() != KVM_MODE_NV) + return false; - if (instr & BIT(PSTATE_Imm_shift)) - regs->pstate |= PSR_SSBS_BIT; - else - regs->pstate &= ~PSR_SSBS_BIT; + if (!has_cpuid_feature(cap, scope)) { + pr_warn("unavailable: %s\n", cap->desc); + return false; + } - arm64_skip_faulting_instruction(regs, 4); - return 0; + return true; } -static struct undef_hook ssbs_emulation_hook = { - .instr_mask = ~(1U << PSTATE_Imm_shift), - .instr_val = 0xd500401f | PSTATE_SSBS, - .fn = ssbs_emulation_handler, -}; - -static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused) +static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, + int __unused) { - static bool undef_hook_registered = false; - static DEFINE_RAW_SPINLOCK(hook_lock); + u64 val; - raw_spin_lock(&hook_lock); - if (!undef_hook_registered) { - register_undef_hook(&ssbs_emulation_hook); - undef_hook_registered = true; - } - raw_spin_unlock(&hook_lock); + val = read_sysreg(id_aa64mmfr1_el1); + if (!cpuid_feature_extract_unsigned_field(val, ID_AA64MMFR1_EL1_VH_SHIFT)) + return false; - if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { - sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); - arm64_set_ssbd_mitigation(false); - } else { - arm64_set_ssbd_mitigation(true); - } + val = arm64_sw_feature_override.val & arm64_sw_feature_override.mask; + return cpuid_feature_extract_unsigned_field(val, ARM64_SW_FEATURE_OVERRIDE_HVHE); } -#endif /* CONFIG_ARM64_SSBD */ #ifdef CONFIG_ARM64_PAN static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) @@ -1231,7 +2062,7 @@ static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) WARN_ON_ONCE(in_interrupt()); sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0); - asm(SET_PSTATE_PAN(1)); + set_pstate_pan(1); } #endif /* CONFIG_ARM64_PAN */ @@ -1244,39 +2075,231 @@ static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused) #endif /* CONFIG_ARM64_RAS_EXTN */ #ifdef CONFIG_ARM64_PTR_AUTH -static void cpu_enable_address_auth(struct arm64_cpu_capabilities const *cap) +static bool has_address_auth_cpucap(const struct arm64_cpu_capabilities *entry, int scope) { - sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | - SCTLR_ELx_ENDA | SCTLR_ELx_ENDB); + int boot_val, sec_val; + + /* We don't expect to be called with SCOPE_SYSTEM */ + WARN_ON(scope == SCOPE_SYSTEM); + /* + * The ptr-auth feature levels are not intercompatible with lower + * levels. Hence we must match ptr-auth feature level of the secondary + * CPUs with that of the boot CPU. The level of boot cpu is fetched + * from the sanitised register whereas direct register read is done for + * the secondary CPUs. + * The sanitised feature state is guaranteed to match that of the + * boot CPU as a mismatched secondary CPU is parked before it gets + * a chance to update the state, with the capability. + */ + boot_val = cpuid_feature_extract_field(read_sanitised_ftr_reg(entry->sys_reg), + entry->field_pos, entry->sign); + if (scope & SCOPE_BOOT_CPU) + return boot_val >= entry->min_field_value; + /* Now check for the secondary CPUs with SCOPE_LOCAL_CPU scope */ + sec_val = cpuid_feature_extract_field(__read_sysreg_by_encoding(entry->sys_reg), + entry->field_pos, entry->sign); + return (sec_val >= entry->min_field_value) && (sec_val == boot_val); } -#endif /* CONFIG_ARM64_PTR_AUTH */ -#ifdef CONFIG_ARM64_PSEUDO_NMI -static bool enable_pseudo_nmi; +static bool has_address_auth_metacap(const struct arm64_cpu_capabilities *entry, + int scope) +{ + bool api = has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_IMP_DEF], scope); + bool apa = has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5], scope); + bool apa3 = has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3], scope); -static int __init early_enable_pseudo_nmi(char *p) + return apa || apa3 || api; +} + +static bool has_generic_auth(const struct arm64_cpu_capabilities *entry, + int __unused) { - return strtobool(p, &enable_pseudo_nmi); + bool gpi = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_IMP_DEF); + bool gpa = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5); + bool gpa3 = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3); + + return gpa || gpa3 || gpi; } -early_param("irqchip.gicv3_pseudo_nmi", early_enable_pseudo_nmi); +#endif /* CONFIG_ARM64_PTR_AUTH */ +#ifdef CONFIG_ARM64_E0PD +static void cpu_enable_e0pd(struct arm64_cpu_capabilities const *cap) +{ + if (this_cpu_has_cap(ARM64_HAS_E0PD)) + sysreg_clear_set(tcr_el1, 0, TCR_E0PD1); +} +#endif /* CONFIG_ARM64_E0PD */ + +#ifdef CONFIG_ARM64_PSEUDO_NMI static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, int scope) { - return enable_pseudo_nmi && has_useable_gicv3_cpuif(entry, scope); + /* + * ARM64_HAS_GIC_CPUIF_SYSREGS has a lower index, and is a boot CPU + * feature, so will be detected earlier. + */ + BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GIC_CPUIF_SYSREGS); + if (!cpus_have_cap(ARM64_HAS_GIC_CPUIF_SYSREGS)) + return false; + + return enable_pseudo_nmi; +} + +static bool has_gic_prio_relaxed_sync(const struct arm64_cpu_capabilities *entry, + int scope) +{ + /* + * If we're not using priority masking then we won't be poking PMR_EL1, + * and there's no need to relax synchronization of writes to it, and + * ICC_CTLR_EL1 might not be accessible and we must avoid reads from + * that. + * + * ARM64_HAS_GIC_PRIO_MASKING has a lower index, and is a boot CPU + * feature, so will be detected earlier. + */ + BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_RELAXED_SYNC <= ARM64_HAS_GIC_PRIO_MASKING); + if (!cpus_have_cap(ARM64_HAS_GIC_PRIO_MASKING)) + return false; + + /* + * When Priority Mask Hint Enable (PMHE) == 0b0, PMR is not used as a + * hint for interrupt distribution, a DSB is not necessary when + * unmasking IRQs via PMR, and we can relax the barrier to a NOP. + * + * Linux itself doesn't use 1:N distribution, so has no need to + * set PMHE. The only reason to have it set is if EL3 requires it + * (and we can't change it). + */ + return (gic_read_ctlr() & ICC_CTLR_EL1_PMHE_MASK) == 0; } #endif +#ifdef CONFIG_ARM64_BTI +static void bti_enable(const struct arm64_cpu_capabilities *__unused) +{ + /* + * Use of X16/X17 for tail-calls and trampolines that jump to + * function entry points using BR is a requirement for + * marking binaries with GNU_PROPERTY_AARCH64_FEATURE_1_BTI. + * So, be strict and forbid other BRs using other registers to + * jump onto a PACIxSP instruction: + */ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_BT0 | SCTLR_EL1_BT1); + isb(); +} +#endif /* CONFIG_ARM64_BTI */ + +#ifdef CONFIG_ARM64_MTE +static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) +{ + sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0); + + mte_cpu_setup(); + + /* + * Clear the tags in the zero page. This needs to be done via the + * linear map which has the Tagged attribute. + */ + if (try_page_mte_tagging(ZERO_PAGE(0))) { + mte_clear_page_tags(lm_alias(empty_zero_page)); + set_page_mte_tagged(ZERO_PAGE(0)); + } + + kasan_init_hw_tags_cpu(); +} +#endif /* CONFIG_ARM64_MTE */ + +static void user_feature_fixup(void) +{ + if (cpus_have_cap(ARM64_WORKAROUND_2658417)) { + struct arm64_ftr_reg *regp; + + regp = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1); + if (regp) + regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK; + } +} + +static void elf_hwcap_fixup(void) +{ +#ifdef CONFIG_COMPAT + if (cpus_have_cap(ARM64_WORKAROUND_1742098)) + compat_elf_hwcap2 &= ~COMPAT_HWCAP2_AES; +#endif /* CONFIG_COMPAT */ +} + +#ifdef CONFIG_KVM +static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused) +{ + return kvm_get_mode() == KVM_MODE_PROTECTED; +} +#endif /* CONFIG_KVM */ + +static void cpu_trap_el0_impdef(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_TIDCP); +} + +static void cpu_enable_dit(const struct arm64_cpu_capabilities *__unused) +{ + set_pstate_dit(1); +} + +static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn); +} + +/* Internal helper functions to match cpu capability type */ +static bool +cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) +{ + return !!(cap->type & ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU); +} + +static bool +cpucap_late_cpu_permitted(const struct arm64_cpu_capabilities *cap) +{ + return !!(cap->type & ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU); +} + +static bool +cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap) +{ + return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { + .capability = ARM64_ALWAYS_BOOT, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_always, + }, + { + .capability = ARM64_ALWAYS_SYSTEM, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_always, + }, + { .desc = "GIC system register CPU interface", - .capability = ARM64_HAS_SYSREG_GIC_CPUIF, + .capability = ARM64_HAS_GIC_CPUIF_SYSREGS, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_useable_gicv3_cpuif, - .sys_reg = SYS_ID_AA64PFR0_EL1, - .field_pos = ID_AA64PFR0_GIC_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = 1, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, GIC, IMP) + }, + { + .desc = "Enhanced Counter Virtualization", + .capability = ARM64_HAS_ECV, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, ECV, IMP) + }, + { + .desc = "Enhanced Counter Virtualization (CNTPOFF)", + .capability = ARM64_HAS_ECV_CNTPOFF, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, ECV, CNTPOFF) }, #ifdef CONFIG_ARM64_PAN { @@ -1284,93 +2307,89 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_PAN, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64MMFR1_EL1, - .field_pos = ID_AA64MMFR1_PAN_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = 1, .cpu_enable = cpu_enable_pan, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, PAN, IMP) }, #endif /* CONFIG_ARM64_PAN */ -#if defined(CONFIG_AS_LSE) && defined(CONFIG_ARM64_LSE_ATOMICS) +#ifdef CONFIG_ARM64_EPAN + { + .desc = "Enhanced Privileged Access Never", + .capability = ARM64_HAS_EPAN, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, PAN, PAN3) + }, +#endif /* CONFIG_ARM64_EPAN */ +#ifdef CONFIG_ARM64_LSE_ATOMICS { .desc = "LSE atomic instructions", .capability = ARM64_HAS_LSE_ATOMICS, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_ATOMICS_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = 2, + ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, ATOMIC, IMP) }, -#endif /* CONFIG_AS_LSE && CONFIG_ARM64_LSE_ATOMICS */ +#endif /* CONFIG_ARM64_LSE_ATOMICS */ { - .desc = "Software prefetching using PRFM", - .capability = ARM64_HAS_NO_HW_PREFETCH, - .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, - .matches = has_no_hw_prefetch, + .desc = "Virtualization Host Extensions", + .capability = ARM64_HAS_VIRT_HOST_EXTN, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .matches = runs_at_el2, + .cpu_enable = cpu_copy_el2regs, }, -#ifdef CONFIG_ARM64_UAO { - .desc = "User Access Override", - .capability = ARM64_HAS_UAO, + .desc = "Nested Virtualization Support", + .capability = ARM64_HAS_NESTED_VIRT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .field_pos = ID_AA64MMFR2_UAO_SHIFT, - .min_field_value = 1, - /* - * We rely on stop_machine() calling uao_thread_switch() to set - * UAO immediately after patching. - */ + .matches = has_nested_virt_support, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) }, -#endif /* CONFIG_ARM64_UAO */ -#ifdef CONFIG_ARM64_PAN { - .capability = ARM64_ALT_PAN_NOT_UAO, + .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .matches = cpufeature_pan_not_uao, + .matches = has_32bit_el0, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, EL0, AARCH32) }, -#endif /* CONFIG_ARM64_PAN */ -#ifdef CONFIG_ARM64_VHE +#ifdef CONFIG_KVM { - .desc = "Virtualization Host Extensions", - .capability = ARM64_HAS_VIRT_HOST_EXTN, - .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, - .matches = runs_at_el2, - .cpu_enable = cpu_copy_el2regs, + .desc = "32-bit EL1 Support", + .capability = ARM64_HAS_32BIT_EL1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, EL1, AARCH32) }, -#endif /* CONFIG_ARM64_VHE */ { - .desc = "32-bit EL0 Support", - .capability = ARM64_HAS_32BIT_EL0, + .desc = "Protected KVM", + .capability = ARM64_KVM_PROTECTED_MODE, .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = is_kvm_protected_mode, + }, + { + .desc = "HCRX_EL2 register", + .capability = ARM64_HAS_HCX, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64PFR0_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_EL0_SHIFT, - .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HCX, IMP) }, +#endif { .desc = "Kernel page table isolation (KPTI)", .capability = ARM64_UNMAP_KERNEL_AT_EL0, .type = ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE, + .cpu_enable = cpu_enable_kpti, + .matches = unmap_kernel_at_el0, /* * The ID feature fields below are used to indicate that * the CPU doesn't need KPTI. See unmap_kernel_at_el0 for * more details. */ - .sys_reg = SYS_ID_AA64PFR0_EL1, - .field_pos = ID_AA64PFR0_CSV3_SHIFT, - .min_field_value = 1, - .matches = unmap_kernel_at_el0, - .cpu_enable = kpti_install_ng_mappings, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, CSV3, IMP) }, { - /* FP/SIMD is not implemented */ - .capability = ARM64_HAS_NO_FPSIMD, + .capability = ARM64_HAS_FPSIMD, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .min_field_value = 0, - .matches = has_no_fpsimd, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_fpsimd, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, FP, IMP) }, #ifdef CONFIG_ARM64_PMEM { @@ -1378,19 +2397,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_DCPOP, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .field_pos = ID_AA64ISAR1_DPB_SHIFT, - .min_field_value = 1, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, DPB, IMP) }, { .desc = "Data cache clean to Point of Deep Persistence", .capability = ARM64_HAS_DCPODP, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_DPB_SHIFT, - .min_field_value = 2, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, DPB, DPB2) }, #endif #ifdef CONFIG_ARM64_SVE @@ -1398,12 +2412,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .desc = "Scalable Vector Extension", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_SVE, - .sys_reg = SYS_ID_AA64PFR0_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_SVE_SHIFT, - .min_field_value = ID_AA64PFR0_SVE, + .cpu_enable = cpu_enable_sve, .matches = has_cpuid_feature, - .cpu_enable = sve_kernel_enable, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, SVE, IMP) }, #endif /* CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_RAS_EXTN @@ -1412,13 +2423,21 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_RAS_EXTN, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64PFR0_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_RAS_SHIFT, - .min_field_value = ID_AA64PFR0_RAS_V1, .cpu_enable = cpu_clear_disr, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP) }, #endif /* CONFIG_ARM64_RAS_EXTN */ +#ifdef CONFIG_ARM64_AMU_EXTN + { + .desc = "Activity Monitors Unit (AMU)", + .capability = ARM64_HAS_AMU_EXTN, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + .matches = has_amu, + .cpu_enable = cpu_amu_enable, + .cpus = &amu_cpus, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, AMU, IMP) + }, +#endif /* CONFIG_ARM64_AMU_EXTN */ { .desc = "Data cache clean to the PoU not required for I/D coherence", .capability = ARM64_HAS_CACHE_IDC, @@ -1436,31 +2455,32 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .desc = "Stage-2 Force Write-Back", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_HAS_STAGE2_FWB, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR2_FWB_SHIFT, - .min_field_value = 1, .matches = has_cpuid_feature, - .cpu_enable = cpu_has_fwb, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, FWB, IMP) + }, + { + .desc = "ARMv8.4 Translation Table Level", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_ARMv8_4_TTL, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, TTL, IMP) + }, + { + .desc = "TLB range maintenance instructions", + .capability = ARM64_HAS_TLB_RANGE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, TLB, RANGE) }, #ifdef CONFIG_ARM64_HW_AFDBM { - /* - * Since we turn this on always, we don't want the user to - * think that the feature is available when it may not be. - * So hide the description. - * - * .desc = "Hardware pagetable Dirty Bit Management", - * - */ + .desc = "Hardware dirty bit management", .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, .capability = ARM64_HW_DBM, - .sys_reg = SYS_ID_AA64MMFR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR1_HADBS_SHIFT, - .min_field_value = 2, .matches = has_hw_dbm, .cpu_enable = cpu_enable_hw_dbm, + .cpus = &dbm_cpus, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM) }, #endif { @@ -1468,34 +2488,23 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_CRC32, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_CRC32_SHIFT, - .min_field_value = 1, + ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, CRC32, IMP) }, -#ifdef CONFIG_ARM64_SSBD { .desc = "Speculative Store Bypassing Safe (SSBS)", .capability = ARM64_SSBS, - .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64PFR1_EL1, - .field_pos = ID_AA64PFR1_SSBS_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY, - .cpu_enable = cpu_enable_ssbs, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SSBS, IMP) }, -#endif #ifdef CONFIG_ARM64_CNP { .desc = "Common not Private translations", .capability = ARM64_HAS_CNP, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_useable_cnp, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR2_CNP_SHIFT, - .min_field_value = 1, .cpu_enable = cpu_enable_cnp, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, CnP, IMP) }, #endif { @@ -1503,53 +2512,60 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_SB, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .field_pos = ID_AA64ISAR1_SB_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = 1, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, SB, IMP) }, #ifdef CONFIG_ARM64_PTR_AUTH { - .desc = "Address authentication (architected algorithm)", - .capability = ARM64_HAS_ADDRESS_AUTH_ARCH, - .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_APA_SHIFT, - .min_field_value = ID_AA64ISAR1_APA_ARCHITECTED, - .matches = has_cpuid_feature, - .cpu_enable = cpu_enable_address_auth, + .desc = "Address authentication (architected QARMA5 algorithm)", + .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_address_auth_cpucap, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, APA, PAuth) + }, + { + .desc = "Address authentication (architected QARMA3 algorithm)", + .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_address_auth_cpucap, + ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, APA3, PAuth) }, { .desc = "Address authentication (IMP DEF algorithm)", .capability = ARM64_HAS_ADDRESS_AUTH_IMP_DEF, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_address_auth_cpucap, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, API, PAuth) + }, + { + .capability = ARM64_HAS_ADDRESS_AUTH, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_address_auth_metacap, + }, + { + .desc = "Generic authentication (architected QARMA5 algorithm)", + .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_API_SHIFT, - .min_field_value = ID_AA64ISAR1_API_IMP_DEF, .matches = has_cpuid_feature, - .cpu_enable = cpu_enable_address_auth, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, GPA, IMP) }, { - .desc = "Generic authentication (architected algorithm)", - .capability = ARM64_HAS_GENERIC_AUTH_ARCH, + .desc = "Generic authentication (architected QARMA3 algorithm)", + .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_GPA_SHIFT, - .min_field_value = ID_AA64ISAR1_GPA_ARCHITECTED, .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, GPA3, IMP) }, { .desc = "Generic authentication (IMP DEF algorithm)", .capability = ARM64_HAS_GENERIC_AUTH_IMP_DEF, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_GPI_SHIFT, - .min_field_value = ID_AA64ISAR1_GPI_IMP_DEF, .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, GPI, IMP) + }, + { + .capability = ARM64_HAS_GENERIC_AUTH, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_generic_auth, }, #endif /* CONFIG_ARM64_PTR_AUTH */ #ifdef CONFIG_ARM64_PSEUDO_NMI @@ -1558,24 +2574,177 @@ static const struct arm64_cpu_capabilities arm64_features[] = { * Depends on having GICv3 */ .desc = "IRQ priority masking", - .capability = ARM64_HAS_IRQ_PRIO_MASKING, + .capability = ARM64_HAS_GIC_PRIO_MASKING, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = can_use_gic_priorities, - .sys_reg = SYS_ID_AA64PFR0_EL1, - .field_pos = ID_AA64PFR0_GIC_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = 1, }, + { + /* + * Depends on ARM64_HAS_GIC_PRIO_MASKING + */ + .capability = ARM64_HAS_GIC_PRIO_RELAXED_SYNC, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .matches = has_gic_prio_relaxed_sync, + }, +#endif +#ifdef CONFIG_ARM64_E0PD + { + .desc = "E0PD", + .capability = ARM64_HAS_E0PD, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_e0pd, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, E0PD, IMP) + }, +#endif + { + .desc = "Random Number Generator", + .capability = ARM64_HAS_RNG, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, RNDR, IMP) + }, +#ifdef CONFIG_ARM64_BTI + { + .desc = "Branch Target Identification", + .capability = ARM64_BTI, +#ifdef CONFIG_ARM64_BTI_KERNEL + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, +#else + .type = ARM64_CPUCAP_SYSTEM_FEATURE, #endif + .matches = has_cpuid_feature, + .cpu_enable = bti_enable, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, BT, IMP) + }, +#endif +#ifdef CONFIG_ARM64_MTE + { + .desc = "Memory Tagging Extension", + .capability = ARM64_MTE, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_mte, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE2) + }, + { + .desc = "Asymmetric MTE Tag Check Fault", + .capability = ARM64_MTE_ASYMM, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE3) + }, +#endif /* CONFIG_ARM64_MTE */ + { + .desc = "RCpc load-acquire (LDAPR)", + .capability = ARM64_HAS_LDAPR, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LRCPC, IMP) + }, + { + .desc = "Fine Grained Traps", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FGT, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, IMP) + }, +#ifdef CONFIG_ARM64_SME + { + .desc = "Scalable Matrix Extension", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_sme, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, IMP) + }, + /* FA64 should be sorted after the base SME capability */ + { + .desc = "FA64", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME_FA64, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_fa64, + ARM64_CPUID_FIELDS(ID_AA64SMFR0_EL1, FA64, IMP) + }, + { + .desc = "SME2", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME2, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_sme2, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, SME2) + }, +#endif /* CONFIG_ARM64_SME */ + { + .desc = "WFx with timeout", + .capability = ARM64_HAS_WFXT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, WFxT, IMP) + }, + { + .desc = "Trap EL0 IMPLEMENTATION DEFINED functionality", + .capability = ARM64_HAS_TIDCP1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_trap_el0_impdef, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, TIDCP1, IMP) + }, + { + .desc = "Data independent timing control (DIT)", + .capability = ARM64_HAS_DIT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_dit, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, DIT, IMP) + }, + { + .desc = "Memory Copy and Memory Set instructions", + .capability = ARM64_HAS_MOPS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_mops, + ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, MOPS, IMP) + }, + { + .capability = ARM64_HAS_TCR2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, TCRX, IMP) + }, + { + .desc = "Stage-1 Permission Indirection Extension (S1PIE)", + .capability = ARM64_HAS_S1PIE, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1PIE, IMP) + }, + { + .desc = "VHE for hypervisor only", + .capability = ARM64_KVM_HVHE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = hvhe_possible, + }, + { + .desc = "Enhanced Virtualization Traps", + .capability = ARM64_HAS_EVT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP) + }, + { + .desc = "52-bit Virtual Addressing for KVM (LPA2)", + .capability = ARM64_HAS_LPA2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_lpa2, + }, {}, }; -#define HWCAP_CPUID_MATCH(reg, field, s, min_value) \ - .matches = has_cpuid_feature, \ - .sys_reg = reg, \ - .field_pos = field, \ - .sign = s, \ - .min_field_value = min_value, +#define HWCAP_CPUID_MATCH(reg, field, min_value) \ + .matches = has_user_cpuid_feature, \ + ARM64_CPUID_FIELDS(reg, field, min_value) #define __HWCAP_CAP(name, cap_type, cap) \ .desc = name, \ @@ -1583,10 +2752,10 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .hwcap_type = cap_type, \ .hwcap = cap, \ -#define HWCAP_CAP(reg, field, s, min_value, cap_type, cap) \ +#define HWCAP_CAP(reg, field, min_value, cap_type, cap) \ { \ __HWCAP_CAP(#cap, cap_type, cap) \ - HWCAP_CPUID_MATCH(reg, field, s, min_value) \ + HWCAP_CPUID_MATCH(reg, field, min_value) \ } #define HWCAP_MULTI_CAP(list, cap_type, cap) \ @@ -1596,91 +2765,179 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .match_list = list, \ } +#define HWCAP_CAP_MATCH(match, cap_type, cap) \ + { \ + __HWCAP_CAP(#cap, cap_type, cap) \ + .matches = match, \ + } + #ifdef CONFIG_ARM64_PTR_AUTH static const struct arm64_cpu_capabilities ptr_auth_hwcap_addr_matches[] = { { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_APA_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_APA_ARCHITECTED) + HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, APA, PAuth) + }, + { + HWCAP_CPUID_MATCH(ID_AA64ISAR2_EL1, APA3, PAuth) }, { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_API_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_API_IMP_DEF) + HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, API, PAuth) }, {}, }; static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = { { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_GPA_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_GPA_ARCHITECTED) + HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, GPA, IMP) + }, + { + HWCAP_CPUID_MATCH(ID_AA64ISAR2_EL1, GPA3, IMP) }, { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_GPI_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_GPI_IMP_DEF) + HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, GPI, IMP) }, {}, }; #endif static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_PMULL), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AES), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA1), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA2), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_CRC32), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM4), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_FP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DIT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DCPOP), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_JSCVT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FRINTTS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FRINT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB), - HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT), + HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL), + HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES), + HWCAP_CAP(ID_AA64ISAR0_EL1, SHA1, IMP, CAP_HWCAP, KERNEL_HWCAP_SHA1), + HWCAP_CAP(ID_AA64ISAR0_EL1, SHA2, SHA256, CAP_HWCAP, KERNEL_HWCAP_SHA2), + HWCAP_CAP(ID_AA64ISAR0_EL1, SHA2, SHA512, CAP_HWCAP, KERNEL_HWCAP_SHA512), + HWCAP_CAP(ID_AA64ISAR0_EL1, CRC32, IMP, CAP_HWCAP, KERNEL_HWCAP_CRC32), + HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, IMP, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), + HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, FEAT_LSE128, CAP_HWCAP, KERNEL_HWCAP_LSE128), + HWCAP_CAP(ID_AA64ISAR0_EL1, RDM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), + HWCAP_CAP(ID_AA64ISAR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SHA3), + HWCAP_CAP(ID_AA64ISAR0_EL1, SM3, IMP, CAP_HWCAP, KERNEL_HWCAP_SM3), + HWCAP_CAP(ID_AA64ISAR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SM4), + HWCAP_CAP(ID_AA64ISAR0_EL1, DP, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), + HWCAP_CAP(ID_AA64ISAR0_EL1, FHM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), + HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM), + HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), + HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP), + HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP), + HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), + HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, FP16, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), + HWCAP_CAP(ID_AA64PFR0_EL1, DIT, IMP, CAP_HWCAP, KERNEL_HWCAP_DIT), + HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, IMP, CAP_HWCAP, KERNEL_HWCAP_DCPOP), + HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, DPB2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), + HWCAP_CAP(ID_AA64ISAR1_EL1, JSCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_JSCVT), + HWCAP_CAP(ID_AA64ISAR1_EL1, FCMA, IMP, CAP_HWCAP, KERNEL_HWCAP_FCMA), + HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, IMP, CAP_HWCAP, KERNEL_HWCAP_LRCPC), + HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), + HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC3, CAP_HWCAP, KERNEL_HWCAP_LRCPC3), + HWCAP_CAP(ID_AA64ISAR1_EL1, FRINTTS, IMP, CAP_HWCAP, KERNEL_HWCAP_FRINT), + HWCAP_CAP(ID_AA64ISAR1_EL1, SB, IMP, CAP_HWCAP, KERNEL_HWCAP_SB), + HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_BF16), + HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_EBF16), + HWCAP_CAP(ID_AA64ISAR1_EL1, DGH, IMP, CAP_HWCAP, KERNEL_HWCAP_DGH), + HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM), + HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, KERNEL_HWCAP_SVE), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SVEVER_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SVEVER_SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_AES_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_AES, CAP_HWCAP, KERNEL_HWCAP_SVEAES), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_AES_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_AES_PMULL, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_BITPERM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_BITPERM, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SHA3_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SHA3, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), - HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SM4_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SM4, CAP_HWCAP, KERNEL_HWCAP_SVESM4), + HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), + HWCAP_CAP(ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1), + HWCAP_CAP(ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), + HWCAP_CAP(ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), + HWCAP_CAP(ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), + HWCAP_CAP(ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), + HWCAP_CAP(ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_B16B16), + HWCAP_CAP(ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), + HWCAP_CAP(ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16), + HWCAP_CAP(ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), + HWCAP_CAP(ID_AA64ZFR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), + HWCAP_CAP(ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), + HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), + HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), +#endif + HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), +#ifdef CONFIG_ARM64_BTI + HWCAP_CAP(ID_AA64PFR1_EL1, BT, IMP, CAP_HWCAP, KERNEL_HWCAP_BTI), #endif - HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_PTR_AUTH HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), #endif +#ifdef CONFIG_ARM64_MTE + HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE2, CAP_HWCAP, KERNEL_HWCAP_MTE), + HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE3, CAP_HWCAP, KERNEL_HWCAP_MTE3), +#endif /* CONFIG_ARM64_MTE */ + HWCAP_CAP(ID_AA64MMFR0_EL1, ECV, IMP, CAP_HWCAP, KERNEL_HWCAP_ECV), + HWCAP_CAP(ID_AA64MMFR1_EL1, AFP, IMP, CAP_HWCAP, KERNEL_HWCAP_AFP), + HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, IMP, CAP_HWCAP, KERNEL_HWCAP_CSSC), + HWCAP_CAP(ID_AA64ISAR2_EL1, RPRFM, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRFM), + HWCAP_CAP(ID_AA64ISAR2_EL1, RPRES, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRES), + HWCAP_CAP(ID_AA64ISAR2_EL1, WFxT, IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT), + HWCAP_CAP(ID_AA64ISAR2_EL1, MOPS, IMP, CAP_HWCAP, KERNEL_HWCAP_MOPS), + HWCAP_CAP(ID_AA64ISAR2_EL1, BC, IMP, CAP_HWCAP, KERNEL_HWCAP_HBC), +#ifdef CONFIG_ARM64_SME + HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME), + HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), + HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), + HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), + HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), + HWCAP_CAP(ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), + HWCAP_CAP(ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), + HWCAP_CAP(ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), + HWCAP_CAP(ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), + HWCAP_CAP(ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), + HWCAP_CAP(ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), + HWCAP_CAP(ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), + HWCAP_CAP(ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), + HWCAP_CAP(ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), +#endif /* CONFIG_ARM64_SME */ {}, }; +#ifdef CONFIG_COMPAT +static bool compat_has_neon(const struct arm64_cpu_capabilities *cap, int scope) +{ + /* + * Check that all of MVFR1_EL1.{SIMDSP, SIMDInt, SIMDLS} are available, + * in line with that of arm32 as in vfp_init(). We make sure that the + * check is future proof, by making sure value is non-zero. + */ + u32 mvfr1; + + WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); + if (scope == SCOPE_SYSTEM) + mvfr1 = read_sanitised_ftr_reg(SYS_MVFR1_EL1); + else + mvfr1 = read_sysreg_s(SYS_MVFR1_EL1); + + return cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDSP_SHIFT) && + cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDInt_SHIFT) && + cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDLS_SHIFT); +} +#endif + static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = { #ifdef CONFIG_COMPAT - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32), + HWCAP_CAP_MATCH(compat_has_neon, CAP_COMPAT_HWCAP, COMPAT_HWCAP_NEON), + HWCAP_CAP(MVFR1_EL1, SIMDFMAC, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv4), + /* Arm v8 mandates MVFR0.FPDP == {0, 2}. So, piggy back on this for the presence of VFP support */ + HWCAP_CAP(MVFR0_EL1, FPDP, VFPv3, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFP), + HWCAP_CAP(MVFR0_EL1, FPDP, VFPv3, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv3), + HWCAP_CAP(MVFR1_EL1, FPHP, FP16, CAP_COMPAT_HWCAP, COMPAT_HWCAP_FPHP), + HWCAP_CAP(MVFR1_EL1, SIMDHP, SIMDHP_FLOAT, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDHP), + HWCAP_CAP(ID_ISAR5_EL1, AES, VMULL, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), + HWCAP_CAP(ID_ISAR5_EL1, AES, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), + HWCAP_CAP(ID_ISAR5_EL1, SHA1, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1), + HWCAP_CAP(ID_ISAR5_EL1, SHA2, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2), + HWCAP_CAP(ID_ISAR5_EL1, CRC32, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32), + HWCAP_CAP(ID_ISAR6_EL1, DP, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDDP), + HWCAP_CAP(ID_ISAR6_EL1, FHM, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDFHM), + HWCAP_CAP(ID_ISAR6_EL1, SB, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SB), + HWCAP_CAP(ID_ISAR6_EL1, BF16, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDBF16), + HWCAP_CAP(ID_ISAR6_EL1, I8MM, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_I8MM), + HWCAP_CAP(ID_PFR2_EL1, SSBS, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SSBS), #endif {}, }; -static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) +static void cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case CAP_HWCAP: @@ -1725,7 +2982,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) return rc; } -static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) +static void setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) { /* We support emulation of accesses to CPU ID feature registers */ cpu_set_named_feature(CPUID); @@ -1741,18 +2998,19 @@ static void update_cpu_capabilities(u16 scope_mask) scope_mask &= ARM64_CPUCAP_SCOPE_MASK; for (i = 0; i < ARM64_NCAPS; i++) { - caps = cpu_hwcaps_ptrs[i]; + caps = cpucap_ptrs[i]; if (!caps || !(caps->type & scope_mask) || cpus_have_cap(caps->capability) || !caps->matches(caps, cpucap_default_scope(caps))) continue; - if (caps->desc) + if (caps->desc && !caps->cpus) pr_info("detected: %s\n", caps->desc); - cpus_set_cap(caps->capability); + + __set_bit(caps->capability, system_cpucaps); if ((scope_mask & SCOPE_BOOT_CPU) && (caps->type & SCOPE_BOOT_CPU)) - set_bit(caps->capability, boot_capabilities); + set_bit(caps->capability, boot_cpucaps); } } @@ -1766,7 +3024,7 @@ static int cpu_enable_non_boot_scope_capabilities(void *__unused) u16 non_boot_scope = SCOPE_ALL & ~SCOPE_BOOT_CPU; for_each_available_cap(i) { - const struct arm64_cpu_capabilities *cap = cpu_hwcaps_ptrs[i]; + const struct arm64_cpu_capabilities *cap = cpucap_ptrs[i]; if (WARN_ON(!cap)) continue; @@ -1796,16 +3054,13 @@ static void __init enable_cpu_capabilities(u16 scope_mask) for (i = 0; i < ARM64_NCAPS; i++) { unsigned int num; - caps = cpu_hwcaps_ptrs[i]; + caps = cpucap_ptrs[i]; if (!caps || !(caps->type & scope_mask)) continue; num = caps->capability; if (!cpus_have_cap(num)) continue; - /* Ensure cpus_have_const_cap(num) works */ - static_branch_enable(&cpu_hwcap_keys[num]); - if (boot_scope && caps->cpu_enable) /* * Capabilities with SCOPE_BOOT_CPU scope are finalised @@ -1834,10 +3089,8 @@ static void __init enable_cpu_capabilities(u16 scope_mask) * Run through the list of capabilities to check for conflicts. * If the system has already detected a capability, take necessary * action on this CPU. - * - * Returns "false" on conflicts. */ -static bool verify_local_cpu_caps(u16 scope_mask) +static void verify_local_cpu_caps(u16 scope_mask) { int i; bool cpu_has_cap, system_has_cap; @@ -1846,7 +3099,7 @@ static bool verify_local_cpu_caps(u16 scope_mask) scope_mask &= ARM64_CPUCAP_SCOPE_MASK; for (i = 0; i < ARM64_NCAPS; i++) { - caps = cpu_hwcaps_ptrs[i]; + caps = cpucap_ptrs[i]; if (!caps || !(caps->type & scope_mask)) continue; @@ -1882,10 +3135,12 @@ static bool verify_local_cpu_caps(u16 scope_mask) pr_crit("CPU%d: Detected conflict for capability %d (%s), System: %d, CPU: %d\n", smp_processor_id(), caps->capability, caps->desc, system_has_cap, cpu_has_cap); - return false; - } - return true; + if (cpucap_panic_on_conflict(caps)) + cpu_panic_kernel(); + else + cpu_die_early(); + } } /* @@ -1895,16 +3150,12 @@ static bool verify_local_cpu_caps(u16 scope_mask) static void check_early_cpu_features(void) { verify_cpu_asid_bits(); - /* - * Early features are used by the kernel already. If there - * is a conflict, we cannot proceed further. - */ - if (!verify_local_cpu_caps(SCOPE_BOOT_CPU)) - cpu_panic_kernel(); + + verify_local_cpu_caps(SCOPE_BOOT_CPU); } static void -verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) +__verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) { for (; caps->matches; caps++) @@ -1915,23 +3166,70 @@ verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) } } -static void verify_sve_features(void) +static void verify_local_elf_hwcaps(void) { - u64 safe_zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); - u64 zcr = read_zcr_features(); + __verify_local_elf_hwcaps(arm64_elf_hwcaps); - unsigned int safe_len = safe_zcr & ZCR_ELx_LEN_MASK; - unsigned int len = zcr & ZCR_ELx_LEN_MASK; + if (id_aa64pfr0_32bit_el0(read_cpuid(ID_AA64PFR0_EL1))) + __verify_local_elf_hwcaps(compat_elf_hwcaps); +} - if (len < safe_len || sve_verify_vq_map()) { +static void verify_sve_features(void) +{ + unsigned long cpacr = cpacr_save_enable_kernel_sve(); + + if (vec_verify_vq_map(ARM64_VEC_SVE)) { pr_crit("CPU%d: SVE: vector length support mismatch\n", smp_processor_id()); cpu_die_early(); } - /* Add checks on other ZCR bits here if necessary */ + cpacr_restore(cpacr); +} + +static void verify_sme_features(void) +{ + unsigned long cpacr = cpacr_save_enable_kernel_sme(); + + if (vec_verify_vq_map(ARM64_VEC_SME)) { + pr_crit("CPU%d: SME: vector length support mismatch\n", + smp_processor_id()); + cpu_die_early(); + } + + cpacr_restore(cpacr); } +static void verify_hyp_capabilities(void) +{ + u64 safe_mmfr1, mmfr0, mmfr1; + int parange, ipa_max; + unsigned int safe_vmid_bits, vmid_bits; + + if (!IS_ENABLED(CONFIG_KVM)) + return; + + safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); + mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); + + /* Verify VMID bits */ + safe_vmid_bits = get_vmid_bits(safe_mmfr1); + vmid_bits = get_vmid_bits(mmfr1); + if (vmid_bits < safe_vmid_bits) { + pr_crit("CPU%d: VMID width mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + /* Verify IPA range */ + parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + ipa_max = id_aa64mmfr0_parange_to_phys_shift(parange); + if (ipa_max < get_kvm_ipa_limit()) { + pr_crit("CPU%d: IPA range mismatch\n", smp_processor_id()); + cpu_die_early(); + } +} /* * Run through the enabled system capabilities and enable() it on this CPU. @@ -1948,16 +3246,17 @@ static void verify_local_cpu_capabilities(void) * check_early_cpu_features(), as they need to be verified * on all secondary CPUs. */ - if (!verify_local_cpu_caps(SCOPE_ALL & ~SCOPE_BOOT_CPU)) - cpu_die_early(); - - verify_local_elf_hwcaps(arm64_elf_hwcaps); - - if (system_supports_32bit_el0()) - verify_local_elf_hwcaps(compat_elf_hwcaps); + verify_local_cpu_caps(SCOPE_ALL & ~SCOPE_BOOT_CPU); + verify_local_elf_hwcaps(); if (system_supports_sve()) verify_sve_features(); + + if (system_supports_sme()) + verify_sme_features(); + + if (is_hyp_mode_available()) + verify_hyp_capabilities(); } void check_local_cpu_capabilities(void) @@ -1974,51 +3273,49 @@ void check_local_cpu_capabilities(void) * Otherwise, this CPU should verify that it has all the system * advertised capabilities. */ - if (!sys_caps_initialised) + if (!system_capabilities_finalized()) update_cpu_capabilities(SCOPE_LOCAL_CPU); else verify_local_cpu_capabilities(); } -static void __init setup_boot_cpu_capabilities(void) +bool this_cpu_has_cap(unsigned int n) { - /* Detect capabilities with either SCOPE_BOOT_CPU or SCOPE_LOCAL_CPU */ - update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); - /* Enable the SCOPE_BOOT_CPU capabilities alone right away */ - enable_cpu_capabilities(SCOPE_BOOT_CPU); -} + if (!WARN_ON(preemptible()) && n < ARM64_NCAPS) { + const struct arm64_cpu_capabilities *cap = cpucap_ptrs[n]; -DEFINE_STATIC_KEY_FALSE(arm64_const_caps_ready); -EXPORT_SYMBOL(arm64_const_caps_ready); + if (cap) + return cap->matches(cap, SCOPE_LOCAL_CPU); + } -static void __init mark_const_caps_ready(void) -{ - static_branch_enable(&arm64_const_caps_ready); + return false; } +EXPORT_SYMBOL_GPL(this_cpu_has_cap); -bool this_cpu_has_cap(unsigned int n) +/* + * This helper function is used in a narrow window when, + * - The system wide safe registers are set with all the SMP CPUs and, + * - The SYSTEM_FEATURE system_cpucaps may not have been set. + */ +static bool __maybe_unused __system_matches_cap(unsigned int n) { - if (!WARN_ON(preemptible()) && n < ARM64_NCAPS) { - const struct arm64_cpu_capabilities *cap = cpu_hwcaps_ptrs[n]; + if (n < ARM64_NCAPS) { + const struct arm64_cpu_capabilities *cap = cpucap_ptrs[n]; if (cap) - return cap->matches(cap, SCOPE_LOCAL_CPU); + return cap->matches(cap, SCOPE_SYSTEM); } - return false; } void cpu_set_feature(unsigned int num) { - WARN_ON(num >= MAX_CPU_FEATURES); - elf_hwcap |= BIT(num); + set_bit(num, elf_hwcap); } -EXPORT_SYMBOL_GPL(cpu_set_feature); bool cpu_have_feature(unsigned int num) { - WARN_ON(num >= MAX_CPU_FEATURES); - return elf_hwcap & BIT(num); + return test_bit(num, elf_hwcap); } EXPORT_SYMBOL_GPL(cpu_have_feature); @@ -2029,69 +3326,166 @@ unsigned long cpu_get_elf_hwcap(void) * note that for userspace compatibility we guarantee that bits 62 * and 63 will always be returned as 0. */ - return lower_32_bits(elf_hwcap); + return elf_hwcap[0]; } unsigned long cpu_get_elf_hwcap2(void) { - return upper_32_bits(elf_hwcap); + return elf_hwcap[1]; } -static void __init setup_system_capabilities(void) +static void __init setup_boot_cpu_capabilities(void) { /* - * We have finalised the system-wide safe feature - * registers, finalise the capabilities that depend - * on it. Also enable all the available capabilities, - * that are not enabled already. + * The boot CPU's feature register values have been recorded. Detect + * boot cpucaps and local cpucaps for the boot CPU, then enable and + * patch alternatives for the available boot cpucaps. */ - update_cpu_capabilities(SCOPE_SYSTEM); - enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU); + update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); + enable_cpu_capabilities(SCOPE_BOOT_CPU); + apply_boot_alternatives(); } -void __init setup_cpu_features(void) +void __init setup_boot_cpu_features(void) { - u32 cwg; + /* + * Initialize the indirect array of CPU capabilities pointers before we + * handle the boot CPU. + */ + init_cpucap_indirect_list(); - setup_system_capabilities(); - mark_const_caps_ready(); - setup_elf_hwcaps(arm64_elf_hwcaps); + /* + * Detect broken pseudo-NMI. Must be called _before_ the call to + * setup_boot_cpu_capabilities() since it interacts with + * can_use_gic_priorities(). + */ + detect_system_supports_pseudo_nmi(); - if (system_supports_32bit_el0()) - setup_elf_hwcaps(compat_elf_hwcaps); + setup_boot_cpu_capabilities(); +} +static void __init setup_system_capabilities(void) +{ + /* + * The system-wide safe feature register values have been finalized. + * Detect, enable, and patch alternatives for the available system + * cpucaps. + */ + update_cpu_capabilities(SCOPE_SYSTEM); + enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU); + apply_alternatives_all(); + + /* + * Log any cpucaps with a cpumask as these aren't logged by + * update_cpu_capabilities(). + */ + for (int i = 0; i < ARM64_NCAPS; i++) { + const struct arm64_cpu_capabilities *caps = cpucap_ptrs[i]; + + if (caps && caps->cpus && caps->desc && + cpumask_any(caps->cpus) < nr_cpu_ids) + pr_info("detected: %s on CPU%*pbl\n", + caps->desc, cpumask_pr_args(caps->cpus)); + } + + /* + * TTBR0 PAN doesn't have its own cpucap, so log it manually. + */ if (system_uses_ttbr0_pan()) pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); +} - sve_setup(); - minsigstksz_setup(); +void __init setup_system_features(void) +{ + setup_system_capabilities(); - /* Advertise that we have computed the system capabilities */ - set_sys_caps_initialised(); + kpti_install_ng_mappings(); + + sve_setup(); + sme_setup(); /* * Check for sane CTR_EL0.CWG value. */ - cwg = cache_type_cwg(); - if (!cwg) + if (!cache_type_cwg()) pr_warn("No Cache Writeback Granule information, assuming %d\n", ARCH_DMA_MINALIGN); } -static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) +void __init setup_user_features(void) { - return (cpus_have_const_cap(ARM64_HAS_PAN) && !cpus_have_const_cap(ARM64_HAS_UAO)); + user_feature_fixup(); + + setup_elf_hwcaps(arm64_elf_hwcaps); + + if (system_supports_32bit_el0()) { + setup_elf_hwcaps(compat_elf_hwcaps); + elf_hwcap_fixup(); + } + + minsigstksz_setup(); } +static int enable_mismatched_32bit_el0(unsigned int cpu) +{ + /* + * The first 32-bit-capable CPU we detected and so can no longer + * be offlined by userspace. -1 indicates we haven't yet onlined + * a 32-bit-capable CPU. + */ + static int lucky_winner = -1; + + struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); + bool cpu_32bit = id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0); + + if (cpu_32bit) { + cpumask_set_cpu(cpu, cpu_32bit_el0_mask); + static_branch_enable_cpuslocked(&arm64_mismatched_32bit_el0); + } + + if (cpumask_test_cpu(0, cpu_32bit_el0_mask) == cpu_32bit) + return 0; + + if (lucky_winner >= 0) + return 0; + + /* + * We've detected a mismatch. We need to keep one of our CPUs with + * 32-bit EL0 online so that is_cpu_allowed() doesn't end up rejecting + * every CPU in the system for a 32-bit task. + */ + lucky_winner = cpu_32bit ? cpu : cpumask_any_and(cpu_32bit_el0_mask, + cpu_active_mask); + get_cpu_device(lucky_winner)->offline_disabled = true; + setup_elf_hwcaps(compat_elf_hwcaps); + elf_hwcap_fixup(); + pr_info("Asymmetric 32-bit EL0 support detected on CPU %u; CPU hot-unplug disabled on CPU %u\n", + cpu, lucky_winner); + return 0; +} + +static int __init init_32bit_el0_mask(void) +{ + if (!allow_mismatched_32bit_el0) + return 0; + + if (!zalloc_cpumask_var(&cpu_32bit_el0_mask, GFP_KERNEL)) + return -ENOMEM; + + return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "arm64/mismatched_32bit_el0:online", + enable_mismatched_32bit_el0, NULL); +} +subsys_initcall_sync(init_32bit_el0_mask); + static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { - cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); + cpu_enable_swapper_cnp(); } /* * We emulate only the following system register space. - * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 4 - 7] + * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 2 - 7] * See Table C5-6 System instruction encodings for System register accesses, * ARMv8 ARM(ARM DDI 0487A.f) for more details. */ @@ -2101,7 +3495,7 @@ static inline bool __attribute_const__ is_emulated(u32 id) sys_reg_CRn(id) == 0x0 && sys_reg_Op1(id) == 0x0 && (sys_reg_CRm(id) == 0 || - ((sys_reg_CRm(id) >= 4) && (sys_reg_CRm(id) <= 7)))); + ((sys_reg_CRm(id) >= 2) && (sys_reg_CRm(id) <= 7)))); } /* @@ -2138,7 +3532,7 @@ static int emulate_sys_reg(u32 id, u64 *valp) if (sys_reg_CRm(id) == 0) return emulate_id_reg(id, valp); - regp = get_arm64_ftr_reg(id); + regp = get_arm64_ftr_reg_nowarn(id); if (regp) *valp = arm64_ftr_reg_user_value(regp); else @@ -2163,43 +3557,44 @@ int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt) return rc; } -static int emulate_mrs(struct pt_regs *regs, u32 insn) +bool try_emulate_mrs(struct pt_regs *regs, u32 insn) { u32 sys_reg, rt; + if (compat_user_mode(regs) || !aarch64_insn_is_mrs(insn)) + return false; + /* * sys_reg values are defined as used in mrs/msr instruction. * shift the imm value to get the encoding. */ sys_reg = (u32)aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn) << 5; rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); - return do_emulate_mrs(regs, sys_reg, rt); + return do_emulate_mrs(regs, sys_reg, rt) == 0; } -static struct undef_hook mrs_hook = { - .instr_mask = 0xfff00000, - .instr_val = 0xd5300000, - .pstate_mask = PSR_AA32_MODE_MASK, - .pstate_val = PSR_MODE_EL0t, - .fn = emulate_mrs, -}; - -static int __init enable_mrs_emulation(void) +enum mitigation_state arm64_get_meltdown_state(void) { - register_undef_hook(&mrs_hook); - return 0; -} + if (__meltdown_safe) + return SPECTRE_UNAFFECTED; + + if (arm64_kernel_unmapped_at_el0()) + return SPECTRE_MITIGATED; -core_initcall(enable_mrs_emulation); + return SPECTRE_VULNERABLE; +} ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) { - if (__meltdown_safe) + switch (arm64_get_meltdown_state()) { + case SPECTRE_UNAFFECTED: return sprintf(buf, "Not affected\n"); - if (arm64_kernel_unmapped_at_el0()) + case SPECTRE_MITIGATED: return sprintf(buf, "Mitigation: PTI\n"); - return sprintf(buf, "Vulnerable\n"); + default: + return sprintf(buf, "Vulnerable\n"); + } } diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index e4d6af2fdec7..f372295207fb 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -9,39 +9,9 @@ #include <linux/acpi.h> #include <linux/cpuidle.h> #include <linux/cpu_pm.h> -#include <linux/of.h> -#include <linux/of_device.h> #include <linux/psci.h> -#include <asm/cpuidle.h> -#include <asm/cpu_ops.h> - -int arm_cpuidle_init(unsigned int cpu) -{ - int ret = -EOPNOTSUPP; - - if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_suspend && - cpu_ops[cpu]->cpu_init_idle) - ret = cpu_ops[cpu]->cpu_init_idle(cpu); - - return ret; -} - -/** - * arm_cpuidle_suspend() - function to enter a low-power idle state - * @arg: argument to pass to CPU suspend operations - * - * Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU - * operations back-end error code otherwise. - */ -int arm_cpuidle_suspend(int index) -{ - int cpu = smp_processor_id(); - - return cpu_ops[cpu]->cpu_suspend(index); -} - -#ifdef CONFIG_ACPI +#ifdef CONFIG_ACPI_PROCESSOR_IDLE #include <acpi/processor.h> @@ -53,6 +23,9 @@ static int psci_acpi_cpu_init_idle(unsigned int cpu) struct acpi_lpi_state *lpi; struct acpi_processor *pr = per_cpu(processors, cpu); + if (unlikely(!pr || !pr->flags.has_lpi)) + return -EINVAL; + /* * If the PSCI cpu_suspend function hook has not been initialized * idle states must not be enabled, so bail out @@ -60,9 +33,6 @@ static int psci_acpi_cpu_init_idle(unsigned int cpu) if (!psci_ops.cpu_suspend) return -EOPNOTSUPP; - if (unlikely(!pr || !pr->flags.has_lpi)) - return -EINVAL; - count = pr->power.count - 1; if (count <= 0) return -ENODEV; @@ -90,15 +60,15 @@ int acpi_processor_ffh_lpi_probe(unsigned int cpu) return psci_acpi_cpu_init_idle(cpu); } -int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) +__cpuidle int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) { u32 state = lpi->address; if (ARM64_LPI_IS_RETENTION_STATE(lpi->arch_flags)) - return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(psci_cpu_suspend_enter, + return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM_RCU(psci_cpu_suspend_enter, lpi->index, state); else - return CPU_PM_CPU_IDLE_ENTER_PARAM(psci_cpu_suspend_enter, + return CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(psci_cpu_suspend_enter, lpi->index, state); } #endif diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 56bba746da1c..47043c0d95ec 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -33,94 +33,145 @@ DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data); static struct cpuinfo_arm64 boot_cpu_data; -static const char *icache_policy_str[] = { - [0 ... ICACHE_POLICY_PIPT] = "RESERVED/UNKNOWN", - [ICACHE_POLICY_VIPT] = "VIPT", - [ICACHE_POLICY_PIPT] = "PIPT", - [ICACHE_POLICY_VPIPT] = "VPIPT", -}; +static inline const char *icache_policy_str(int l1ip) +{ + switch (l1ip) { + case CTR_EL0_L1Ip_VIPT: + return "VIPT"; + case CTR_EL0_L1Ip_PIPT: + return "PIPT"; + default: + return "RESERVED/UNKNOWN"; + } +} unsigned long __icache_flags; static const char *const hwcap_str[] = { - "fp", - "asimd", - "evtstrm", - "aes", - "pmull", - "sha1", - "sha2", - "crc32", - "atomics", - "fphp", - "asimdhp", - "cpuid", - "asimdrdm", - "jscvt", - "fcma", - "lrcpc", - "dcpop", - "sha3", - "sm3", - "sm4", - "asimddp", - "sha512", - "sve", - "asimdfhm", - "dit", - "uscat", - "ilrcpc", - "flagm", - "ssbs", - "sb", - "paca", - "pacg", - "dcpodp", - "sve2", - "sveaes", - "svepmull", - "svebitperm", - "svesha3", - "svesm4", - "flagm2", - "frint", - NULL + [KERNEL_HWCAP_FP] = "fp", + [KERNEL_HWCAP_ASIMD] = "asimd", + [KERNEL_HWCAP_EVTSTRM] = "evtstrm", + [KERNEL_HWCAP_AES] = "aes", + [KERNEL_HWCAP_PMULL] = "pmull", + [KERNEL_HWCAP_SHA1] = "sha1", + [KERNEL_HWCAP_SHA2] = "sha2", + [KERNEL_HWCAP_CRC32] = "crc32", + [KERNEL_HWCAP_ATOMICS] = "atomics", + [KERNEL_HWCAP_FPHP] = "fphp", + [KERNEL_HWCAP_ASIMDHP] = "asimdhp", + [KERNEL_HWCAP_CPUID] = "cpuid", + [KERNEL_HWCAP_ASIMDRDM] = "asimdrdm", + [KERNEL_HWCAP_JSCVT] = "jscvt", + [KERNEL_HWCAP_FCMA] = "fcma", + [KERNEL_HWCAP_LRCPC] = "lrcpc", + [KERNEL_HWCAP_DCPOP] = "dcpop", + [KERNEL_HWCAP_SHA3] = "sha3", + [KERNEL_HWCAP_SM3] = "sm3", + [KERNEL_HWCAP_SM4] = "sm4", + [KERNEL_HWCAP_ASIMDDP] = "asimddp", + [KERNEL_HWCAP_SHA512] = "sha512", + [KERNEL_HWCAP_SVE] = "sve", + [KERNEL_HWCAP_ASIMDFHM] = "asimdfhm", + [KERNEL_HWCAP_DIT] = "dit", + [KERNEL_HWCAP_USCAT] = "uscat", + [KERNEL_HWCAP_ILRCPC] = "ilrcpc", + [KERNEL_HWCAP_FLAGM] = "flagm", + [KERNEL_HWCAP_SSBS] = "ssbs", + [KERNEL_HWCAP_SB] = "sb", + [KERNEL_HWCAP_PACA] = "paca", + [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_DCPODP] = "dcpodp", + [KERNEL_HWCAP_SVE2] = "sve2", + [KERNEL_HWCAP_SVEAES] = "sveaes", + [KERNEL_HWCAP_SVEPMULL] = "svepmull", + [KERNEL_HWCAP_SVEBITPERM] = "svebitperm", + [KERNEL_HWCAP_SVESHA3] = "svesha3", + [KERNEL_HWCAP_SVESM4] = "svesm4", + [KERNEL_HWCAP_FLAGM2] = "flagm2", + [KERNEL_HWCAP_FRINT] = "frint", + [KERNEL_HWCAP_SVEI8MM] = "svei8mm", + [KERNEL_HWCAP_SVEF32MM] = "svef32mm", + [KERNEL_HWCAP_SVEF64MM] = "svef64mm", + [KERNEL_HWCAP_SVEBF16] = "svebf16", + [KERNEL_HWCAP_I8MM] = "i8mm", + [KERNEL_HWCAP_BF16] = "bf16", + [KERNEL_HWCAP_DGH] = "dgh", + [KERNEL_HWCAP_RNG] = "rng", + [KERNEL_HWCAP_BTI] = "bti", + [KERNEL_HWCAP_MTE] = "mte", + [KERNEL_HWCAP_ECV] = "ecv", + [KERNEL_HWCAP_AFP] = "afp", + [KERNEL_HWCAP_RPRES] = "rpres", + [KERNEL_HWCAP_MTE3] = "mte3", + [KERNEL_HWCAP_SME] = "sme", + [KERNEL_HWCAP_SME_I16I64] = "smei16i64", + [KERNEL_HWCAP_SME_F64F64] = "smef64f64", + [KERNEL_HWCAP_SME_I8I32] = "smei8i32", + [KERNEL_HWCAP_SME_F16F32] = "smef16f32", + [KERNEL_HWCAP_SME_B16F32] = "smeb16f32", + [KERNEL_HWCAP_SME_F32F32] = "smef32f32", + [KERNEL_HWCAP_SME_FA64] = "smefa64", + [KERNEL_HWCAP_WFXT] = "wfxt", + [KERNEL_HWCAP_EBF16] = "ebf16", + [KERNEL_HWCAP_SVE_EBF16] = "sveebf16", + [KERNEL_HWCAP_CSSC] = "cssc", + [KERNEL_HWCAP_RPRFM] = "rprfm", + [KERNEL_HWCAP_SVE2P1] = "sve2p1", + [KERNEL_HWCAP_SME2] = "sme2", + [KERNEL_HWCAP_SME2P1] = "sme2p1", + [KERNEL_HWCAP_SME_I16I32] = "smei16i32", + [KERNEL_HWCAP_SME_BI32I32] = "smebi32i32", + [KERNEL_HWCAP_SME_B16B16] = "smeb16b16", + [KERNEL_HWCAP_SME_F16F16] = "smef16f16", + [KERNEL_HWCAP_MOPS] = "mops", + [KERNEL_HWCAP_HBC] = "hbc", + [KERNEL_HWCAP_SVE_B16B16] = "sveb16b16", + [KERNEL_HWCAP_LRCPC3] = "lrcpc3", + [KERNEL_HWCAP_LSE128] = "lse128", }; #ifdef CONFIG_COMPAT +#define COMPAT_KERNEL_HWCAP(x) const_ilog2(COMPAT_HWCAP_ ## x) static const char *const compat_hwcap_str[] = { - "swp", - "half", - "thumb", - "26bit", - "fastmult", - "fpa", - "vfp", - "edsp", - "java", - "iwmmxt", - "crunch", - "thumbee", - "neon", - "vfpv3", - "vfpv3d16", - "tls", - "vfpv4", - "idiva", - "idivt", - "vfpd32", - "lpae", - "evtstrm", - NULL + [COMPAT_KERNEL_HWCAP(SWP)] = "swp", + [COMPAT_KERNEL_HWCAP(HALF)] = "half", + [COMPAT_KERNEL_HWCAP(THUMB)] = "thumb", + [COMPAT_KERNEL_HWCAP(26BIT)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(FAST_MULT)] = "fastmult", + [COMPAT_KERNEL_HWCAP(FPA)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(VFP)] = "vfp", + [COMPAT_KERNEL_HWCAP(EDSP)] = "edsp", + [COMPAT_KERNEL_HWCAP(JAVA)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(IWMMXT)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(CRUNCH)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(THUMBEE)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(NEON)] = "neon", + [COMPAT_KERNEL_HWCAP(VFPv3)] = "vfpv3", + [COMPAT_KERNEL_HWCAP(VFPV3D16)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(TLS)] = "tls", + [COMPAT_KERNEL_HWCAP(VFPv4)] = "vfpv4", + [COMPAT_KERNEL_HWCAP(IDIVA)] = "idiva", + [COMPAT_KERNEL_HWCAP(IDIVT)] = "idivt", + [COMPAT_KERNEL_HWCAP(VFPD32)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(LPAE)] = "lpae", + [COMPAT_KERNEL_HWCAP(EVTSTRM)] = "evtstrm", + [COMPAT_KERNEL_HWCAP(FPHP)] = "fphp", + [COMPAT_KERNEL_HWCAP(ASIMDHP)] = "asimdhp", + [COMPAT_KERNEL_HWCAP(ASIMDDP)] = "asimddp", + [COMPAT_KERNEL_HWCAP(ASIMDFHM)] = "asimdfhm", + [COMPAT_KERNEL_HWCAP(ASIMDBF16)] = "asimdbf16", + [COMPAT_KERNEL_HWCAP(I8MM)] = "i8mm", }; +#define COMPAT_KERNEL_HWCAP2(x) const_ilog2(COMPAT_HWCAP2_ ## x) static const char *const compat_hwcap2_str[] = { - "aes", - "pmull", - "sha1", - "sha2", - "crc32", - NULL + [COMPAT_KERNEL_HWCAP2(AES)] = "aes", + [COMPAT_KERNEL_HWCAP2(PMULL)] = "pmull", + [COMPAT_KERNEL_HWCAP2(SHA1)] = "sha1", + [COMPAT_KERNEL_HWCAP2(SHA2)] = "sha2", + [COMPAT_KERNEL_HWCAP2(CRC32)] = "crc32", + [COMPAT_KERNEL_HWCAP2(SB)] = "sb", + [COMPAT_KERNEL_HWCAP2(SSBS)] = "ssbs", }; #endif /* CONFIG_COMPAT */ @@ -156,16 +207,25 @@ static int c_show(struct seq_file *m, void *v) seq_puts(m, "Features\t:"); if (compat) { #ifdef CONFIG_COMPAT - for (j = 0; compat_hwcap_str[j]; j++) - if (compat_elf_hwcap & (1 << j)) + for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { + if (compat_elf_hwcap & (1 << j)) { + /* + * Warn once if any feature should not + * have been present on arm64 platform. + */ + if (WARN_ON_ONCE(!compat_hwcap_str[j])) + continue; + seq_printf(m, " %s", compat_hwcap_str[j]); + } + } - for (j = 0; compat_hwcap2_str[j]; j++) + for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) if (compat_elf_hwcap2 & (1 << j)) seq_printf(m, " %s", compat_hwcap2_str[j]); #endif /* CONFIG_COMPAT */ } else { - for (j = 0; hwcap_str[j]; j++) + for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) if (cpu_have_feature(j)) seq_printf(m, " %s", hwcap_str[j]); } @@ -228,7 +288,7 @@ static struct kobj_type cpuregs_kobj_type = { struct cpuinfo_arm64 *info = kobj_to_cpuinfo(kobj); \ \ if (info->reg_midr) \ - return sprintf(buf, "0x%016x\n", info->reg_##_field); \ + return sprintf(buf, "0x%016llx\n", info->reg_##_field); \ else \ return 0; \ } \ @@ -236,6 +296,7 @@ static struct kobj_type cpuregs_kobj_type = { CPUREGS_ATTR_RO(midr_el1, midr); CPUREGS_ATTR_RO(revidr_el1, revidr); +CPUREGS_ATTR_RO(smidr_el1, smidr); static struct attribute *cpuregs_id_attrs[] = { &cpuregs_attr_midr_el1.attr, @@ -248,6 +309,16 @@ static const struct attribute_group cpuregs_attr_group = { .name = "identification" }; +static struct attribute *sme_cpuregs_id_attrs[] = { + &cpuregs_attr_smidr_el1.attr, + NULL +}; + +static const struct attribute_group sme_cpuregs_attr_group = { + .attrs = sme_cpuregs_id_attrs, + .name = "identification" +}; + static int cpuid_cpu_online(unsigned int cpu) { int rc; @@ -265,6 +336,8 @@ static int cpuid_cpu_online(unsigned int cpu) rc = sysfs_create_group(&info->kobj, &cpuregs_attr_group); if (rc) kobject_del(&info->kobj); + if (system_supports_sme()) + rc = sysfs_merge_group(&info->kobj, &sme_cpuregs_attr_group); out: return rc; } @@ -303,25 +376,50 @@ static int __init cpuinfo_regs_init(void) } return 0; } +device_initcall(cpuinfo_regs_init); + static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) { unsigned int cpu = smp_processor_id(); u32 l1ip = CTR_L1IP(info->reg_ctr); switch (l1ip) { - case ICACHE_POLICY_PIPT: - break; - case ICACHE_POLICY_VPIPT: - set_bit(ICACHEF_VPIPT, &__icache_flags); + case CTR_EL0_L1Ip_PIPT: break; + case CTR_EL0_L1Ip_VIPT: default: - /* Fallthrough */ - case ICACHE_POLICY_VIPT: /* Assume aliasing */ set_bit(ICACHEF_ALIASING, &__icache_flags); + break; } - pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu); + pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str(l1ip), cpu); +} + +static void __cpuinfo_store_cpu_32bit(struct cpuinfo_32bit *info) +{ + info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); + info->reg_id_dfr1 = read_cpuid(ID_DFR1_EL1); + info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); + info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); + info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); + info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); + info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); + info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); + info->reg_id_isar6 = read_cpuid(ID_ISAR6_EL1); + info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); + info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); + info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); + info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); + info->reg_id_mmfr4 = read_cpuid(ID_MMFR4_EL1); + info->reg_id_mmfr5 = read_cpuid(ID_MMFR5_EL1); + info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); + info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + info->reg_id_pfr2 = read_cpuid(ID_PFR2_EL1); + + info->reg_mvfr0 = read_cpuid(MVFR0_EL1); + info->reg_mvfr1 = read_cpuid(MVFR1_EL1); + info->reg_mvfr2 = read_cpuid(MVFR2_EL1); } static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) @@ -333,7 +431,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) * with the CLIDR_EL1 fields to avoid triggering false warnings * when there is a mismatch across the CPUs. Keep track of the * effective value of the CTR_EL0 in our internal records for - * acurate sanity check and feature enablement. + * accurate sanity check and feature enablement. */ info->reg_ctr = read_cpuid_effective_cachetype(); info->reg_dczid = read_cpuid(DCZID_EL0); @@ -344,37 +442,21 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); + info->reg_id_aa64isar2 = read_cpuid(ID_AA64ISAR2_EL1); info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); + info->reg_id_aa64mmfr3 = read_cpuid(ID_AA64MMFR3_EL1); info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); + info->reg_id_aa64smfr0 = read_cpuid(ID_AA64SMFR0_EL1); - /* Update the 32bit ID registers only if AArch32 is implemented */ - if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); - info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); - info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); - info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); - info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); - info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); - info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); - info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); - info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); - info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); - info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); - info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); - info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); - - info->reg_mvfr0 = read_cpuid(MVFR0_EL1); - info->reg_mvfr1 = read_cpuid(MVFR1_EL1); - info->reg_mvfr2 = read_cpuid(MVFR2_EL1); - } + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) + info->reg_gmid = read_cpuid(GMID_EL1); - if (IS_ENABLED(CONFIG_ARM64_SVE) && - id_aa64pfr0_sve(info->reg_id_aa64pfr0)) - info->reg_zcr = read_zcr_features(); + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) + __cpuinfo_store_cpu_32bit(&info->aarch32); cpuinfo_detect_icache_policy(info); } @@ -394,5 +476,3 @@ void __init cpuinfo_store_boot_cpu(void) boot_cpu_data = *info; init_cpu_features(&boot_cpu_data); } - -device_initcall(cpuinfo_regs_init); diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c index ca4c3e12d8c5..66cde752cd74 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/crash_core.c @@ -5,15 +5,36 @@ */ #include <linux/crash_core.h> +#include <asm/cpufeature.h> #include <asm/memory.h> +#include <asm/pgtable-hwdef.h> +#include <asm/pointer_auth.h> + +static inline u64 get_tcr_el1_t1sz(void); + +static inline u64 get_tcr_el1_t1sz(void) +{ + return (read_sysreg(tcr_el1) & TCR_T1SZ_MASK) >> TCR_T1SZ_OFFSET; +} void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_NUMBER(VA_BITS); /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ + vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); + vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); + vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); + vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); + vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); vmcoreinfo_append_str("NUMBER(kimage_voffset)=0x%llx\n", kimage_voffset); vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n", PHYS_OFFSET); + vmcoreinfo_append_str("NUMBER(TCR_EL1_T1SZ)=0x%llx\n", + get_tcr_el1_t1sz()); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + vmcoreinfo_append_str("NUMBER(KERNELPACMASK)=0x%llx\n", + system_supports_address_auth() ? + ptrauth_kernel_pac_mask() : 0); } diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c index e6e284265f19..670e4ce81822 100644 --- a/arch/arm64/kernel/crash_dump.c +++ b/arch/arm64/kernel/crash_dump.c @@ -9,25 +9,11 @@ #include <linux/crash_dump.h> #include <linux/errno.h> #include <linux/io.h> -#include <linux/memblock.h> -#include <linux/uaccess.h> +#include <linux/uio.h> #include <asm/memory.h> -/** - * copy_oldmem_page() - copy one page from old kernel memory - * @pfn: page frame number to be copied - * @buf: buffer where the copied page is placed - * @csize: number of bytes to copy - * @offset: offset in bytes into the page - * @userbuf: if set, @buf is in a user address space - * - * This function copies one page from old kernel memory into buffer pointed by - * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes - * copied or negative error in case of failure. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -38,14 +24,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { - memunmap(vaddr); - return -EFAULT; - } - } else { - memcpy(buf, vaddr + offset, csize); - } + csize = copy_to_iter(vaddr + offset, csize, iter); memunmap(vaddr); @@ -64,5 +43,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count); + *ppos += count; + return count; } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 48222a4760c2..64f2ecbdfe5c 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -28,7 +28,7 @@ u8 debug_monitors_arch(void) { return cpuid_feature_extract_unsigned_field(read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1), - ID_AA64DFR0_DEBUGVER_SHIFT); + ID_AA64DFR0_EL1_DebugVer_SHIFT); } /* @@ -130,7 +130,7 @@ static int clear_os_lock(unsigned int cpu) return 0; } -static int debug_monitors_init(void) +static int __init debug_monitors_init(void) { return cpuhp_setup_state(CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, "arm64/debug_monitors:starting", @@ -141,17 +141,20 @@ postcore_initcall(debug_monitors_init); /* * Single step API and exception handling. */ -static void set_regs_spsr_ss(struct pt_regs *regs) +static void set_user_regs_spsr_ss(struct user_pt_regs *regs) { regs->pstate |= DBG_SPSR_SS; } -NOKPROBE_SYMBOL(set_regs_spsr_ss); +NOKPROBE_SYMBOL(set_user_regs_spsr_ss); -static void clear_regs_spsr_ss(struct pt_regs *regs) +static void clear_user_regs_spsr_ss(struct user_pt_regs *regs) { regs->pstate &= ~DBG_SPSR_SS; } -NOKPROBE_SYMBOL(clear_regs_spsr_ss); +NOKPROBE_SYMBOL(clear_user_regs_spsr_ss); + +#define set_regs_spsr_ss(r) set_user_regs_spsr_ss(&(r)->user_regs) +#define clear_regs_spsr_ss(r) clear_user_regs_spsr_ss(&(r)->user_regs) static DEFINE_SPINLOCK(debug_hook_lock); static LIST_HEAD(user_step_hook); @@ -199,7 +202,7 @@ void unregister_kernel_step_hook(struct step_hook *hook) * So we call all the registered handlers, until the right handler is * found which returns zero. */ -static int call_step_hook(struct pt_regs *regs, unsigned int esr) +static int call_step_hook(struct pt_regs *regs, unsigned long esr) { struct step_hook *hook; struct list_head *list; @@ -231,12 +234,11 @@ static void send_user_sigtrap(int si_code) if (interrupts_enabled(regs)) local_irq_enable(); - arm64_force_sig_fault(SIGTRAP, si_code, - (void __user *)instruction_pointer(regs), - "User debug trap"); + arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), + "User debug trap"); } -static int single_step_handler(unsigned long unused, unsigned int esr, +static int single_step_handler(unsigned long unused, unsigned long esr, struct pt_regs *regs) { bool handler_found = false; @@ -297,11 +299,11 @@ void unregister_kernel_break_hook(struct break_hook *hook) unregister_debug_hook(&hook->node); } -static int call_break_hook(struct pt_regs *regs, unsigned int esr) +static int call_break_hook(struct pt_regs *regs, unsigned long esr) { struct break_hook *hook; struct list_head *list; - int (*fn)(struct pt_regs *regs, unsigned int esr) = NULL; + int (*fn)(struct pt_regs *regs, unsigned long esr) = NULL; list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; @@ -310,7 +312,7 @@ static int call_break_hook(struct pt_regs *regs, unsigned int esr) * entirely not preemptible, and we can use rcu list safely here. */ list_for_each_entry_rcu(hook, list, node) { - unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; + unsigned long comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; if ((comment & ~hook->mask) == hook->imm) fn = hook->fn; @@ -320,7 +322,7 @@ static int call_break_hook(struct pt_regs *regs, unsigned int esr) } NOKPROBE_SYMBOL(call_break_hook); -static int brk_handler(unsigned long unused, unsigned int esr, +static int brk_handler(unsigned long unused, unsigned long esr, struct pt_regs *regs) { if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED) @@ -376,15 +378,13 @@ int aarch32_break_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(aarch32_break_handler); -static int __init debug_traps_init(void) +void __init debug_traps_init(void) { hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP, TRAP_TRACE, "single-step handler"); hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP, - TRAP_BRKPT, "ptrace BRK handler"); - return 0; + TRAP_BRKPT, "BRK handler"); } -arch_initcall(debug_traps_init); /* Re-enable single step for syscall restarting. */ void user_rewind_single_step(struct task_struct *task) @@ -393,17 +393,26 @@ void user_rewind_single_step(struct task_struct *task) * If single step is active for this thread, then set SPSR.SS * to 1 to avoid returning to the active-pending state. */ - if (test_ti_thread_flag(task_thread_info(task), TIF_SINGLESTEP)) + if (test_tsk_thread_flag(task, TIF_SINGLESTEP)) set_regs_spsr_ss(task_pt_regs(task)); } NOKPROBE_SYMBOL(user_rewind_single_step); void user_fastforward_single_step(struct task_struct *task) { - if (test_ti_thread_flag(task_thread_info(task), TIF_SINGLESTEP)) + if (test_tsk_thread_flag(task, TIF_SINGLESTEP)) clear_regs_spsr_ss(task_pt_regs(task)); } +void user_regs_reset_single_step(struct user_pt_regs *regs, + struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_SINGLESTEP)) + set_user_regs_spsr_ss(regs); + else + clear_user_regs_spsr_ss(regs); +} + /* Kernel API */ void kernel_enable_single_step(struct pt_regs *regs) { @@ -429,6 +438,11 @@ int kernel_active_single_step(void) } NOKPROBE_SYMBOL(kernel_active_single_step); +void kernel_rewind_single_step(struct pt_regs *regs) +{ + set_regs_spsr_ss(regs); +} + /* ptrace API */ void user_enable_single_step(struct task_struct *task) { diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S deleted file mode 100644 index 304d5b02ca67..000000000000 --- a/arch/arm64/kernel/efi-entry.S +++ /dev/null @@ -1,120 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * EFI entry point. - * - * Copyright (C) 2013, 2014 Red Hat, Inc. - * Author: Mark Salter <msalter@redhat.com> - */ -#include <linux/linkage.h> -#include <linux/init.h> - -#include <asm/assembler.h> - -#define EFI_LOAD_ERROR 0x8000000000000001 - - __INIT - - /* - * We arrive here from the EFI boot manager with: - * - * * CPU in little-endian mode - * * MMU on with identity-mapped RAM - * * Icache and Dcache on - * - * We will most likely be running from some place other than where - * we want to be. The kernel image wants to be placed at TEXT_OFFSET - * from start of RAM. - */ -ENTRY(entry) - /* - * Create a stack frame to save FP/LR with extra space - * for image_addr variable passed to efi_entry(). - */ - stp x29, x30, [sp, #-32]! - mov x29, sp - - /* - * Call efi_entry to do the real work. - * x0 and x1 are already set up by firmware. Current runtime - * address of image is calculated and passed via *image_addr. - * - * unsigned long efi_entry(void *handle, - * efi_system_table_t *sys_table, - * unsigned long *image_addr) ; - */ - adr_l x8, _text - add x2, sp, 16 - str x8, [x2] - bl efi_entry - cmn x0, #1 - b.eq efi_load_fail - - /* - * efi_entry() will have copied the kernel image if necessary and we - * return here with device tree address in x0 and the kernel entry - * point stored at *image_addr. Save those values in registers which - * are callee preserved. - */ - mov x20, x0 // DTB address - ldr x0, [sp, #16] // relocated _text address - ldr w21, =stext_offset - add x21, x0, x21 - - /* - * Calculate size of the kernel Image (same for original and copy). - */ - adr_l x1, _text - adr_l x2, _edata - sub x1, x2, x1 - - /* - * Flush the copied Image to the PoC, and ensure it is not shadowed by - * stale icache entries from before relocation. - */ - bl __flush_dcache_area - ic ialluis - - /* - * Ensure that the rest of this function (in the original Image) is - * visible when the caches are disabled. The I-cache can't have stale - * entries for the VA range of the current image, so no maintenance is - * necessary. - */ - adr x0, entry - adr x1, entry_end - sub x1, x1, x0 - bl __flush_dcache_area - - /* Turn off Dcache and MMU */ - mrs x0, CurrentEL - cmp x0, #CurrentEL_EL2 - b.ne 1f - mrs x0, sctlr_el2 - bic x0, x0, #1 << 0 // clear SCTLR.M - bic x0, x0, #1 << 2 // clear SCTLR.C - pre_disable_mmu_workaround - msr sctlr_el2, x0 - isb - b 2f -1: - mrs x0, sctlr_el1 - bic x0, x0, #1 << 0 // clear SCTLR.M - bic x0, x0, #1 << 2 // clear SCTLR.C - pre_disable_mmu_workaround - msr sctlr_el1, x0 - isb -2: - /* Jump to kernel entry point */ - mov x0, x20 - mov x1, xzr - mov x2, xzr - mov x3, xzr - br x21 - -efi_load_fail: - mov x0, #EFI_LOAD_ERROR - ldp x29, x30, [sp], #32 - ret - -entry_end: -ENDPROC(entry) diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index a7cfacce3e15..11d7f7de202d 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -7,54 +7,72 @@ #include <linux/pe.h> #include <linux/sizes.h> + .macro efi_signature_nop +#ifdef CONFIG_EFI +.L_head: + /* + * This ccmp instruction has no meaningful effect except that + * its opcode forms the magic "MZ" signature required by UEFI. + */ + ccmp x18, #0, #0xd, pl +#else + /* + * Bootloaders may inspect the opcode at the start of the kernel + * image to decide if the kernel is capable of booting via UEFI. + * So put an ordinary NOP here, not the "MZ.." pseudo-nop above. + */ + nop +#endif + .endm + .macro __EFI_PE_HEADER +#ifdef CONFIG_EFI + .set .Lpe_header_offset, . - .L_head .long PE_MAGIC -coff_header: .short IMAGE_FILE_MACHINE_ARM64 // Machine - .short section_count // NumberOfSections + .short .Lsection_count // NumberOfSections .long 0 // TimeDateStamp .long 0 // PointerToSymbolTable .long 0 // NumberOfSymbols - .short section_table - optional_header // SizeOfOptionalHeader + .short .Lsection_table - .Loptional_header // SizeOfOptionalHeader .short IMAGE_FILE_DEBUG_STRIPPED | \ IMAGE_FILE_EXECUTABLE_IMAGE | \ IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics -optional_header: +.Loptional_header: .short PE_OPT_MAGIC_PE32PLUS // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion - .long __initdata_begin - efi_header_end // SizeOfCode + .long __initdata_begin - .Lefi_header_end // SizeOfCode .long __pecoff_data_size // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_entry - _head // AddressOfEntryPoint - .long efi_header_end - _head // BaseOfCode + .long __efistub_efi_pe_entry - .L_head // AddressOfEntryPoint + .long .Lefi_header_end - .L_head // BaseOfCode -extra_header_fields: .quad 0 // ImageBase - .long SZ_4K // SectionAlignment + .long SEGMENT_ALIGN // SectionAlignment .long PECOFF_FILE_ALIGNMENT // FileAlignment .short 0 // MajorOperatingSystemVersion .short 0 // MinorOperatingSystemVersion - .short 0 // MajorImageVersion - .short 0 // MinorImageVersion + .short LINUX_EFISTUB_MAJOR_VERSION // MajorImageVersion + .short LINUX_EFISTUB_MINOR_VERSION // MinorImageVersion .short 0 // MajorSubsystemVersion .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - _head // SizeOfImage + .long _end - .L_head // SizeOfImage // Everything before the kernel image is considered part of the header - .long efi_header_end - _head // SizeOfHeaders + .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem - .short 0 // DllCharacteristics + .short IMAGE_DLL_CHARACTERISTICS_NX_COMPAT // DllCharacteristics .quad 0 // SizeOfStackReserve .quad 0 // SizeOfStackCommit .quad 0 // SizeOfHeapReserve .quad 0 // SizeOfHeapCommit .long 0 // LoaderFlags - .long (section_table - .) / 8 // NumberOfRvaAndSizes + .long (.Lsection_table - .) / 8 // NumberOfRvaAndSizes .quad 0 // ExportTable .quad 0 // ImportTable @@ -63,18 +81,56 @@ extra_header_fields: .quad 0 // CertificationTable .quad 0 // BaseRelocationTable +#if defined(CONFIG_DEBUG_EFI) || defined(CONFIG_ARM64_BTI_KERNEL) + .long .Lefi_debug_table - .L_head // DebugTable + .long .Lefi_debug_table_size + + /* + * The debug table is referenced via its Relative Virtual Address (RVA), + * which is only defined for those parts of the image that are covered + * by a section declaration. Since this header is not covered by any + * section, the debug table must be emitted elsewhere. So stick it in + * the .init.rodata section instead. + * + * Note that the payloads themselves are permitted to have zero RVAs, + * which means we can simply put those right after the section headers. + */ + __INITRODATA + + .align 2 +.Lefi_debug_table: #ifdef CONFIG_DEBUG_EFI - .long efi_debug_table - _head // DebugTable - .long efi_debug_table_size + // EFI_IMAGE_DEBUG_DIRECTORY_ENTRY + .long 0 // Characteristics + .long 0 // TimeDateStamp + .short 0 // MajorVersion + .short 0 // MinorVersion + .long IMAGE_DEBUG_TYPE_CODEVIEW // Type + .long .Lefi_debug_entry_size // SizeOfData + .long 0 // RVA + .long .Lefi_debug_entry - .L_head // FileOffset +#endif +#ifdef CONFIG_ARM64_BTI_KERNEL + .long 0 // Characteristics + .long 0 // TimeDateStamp + .short 0 // MajorVersion + .short 0 // MinorVersion + .long IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS // Type + .long 4 // SizeOfData + .long 0 // RVA + .long .Lefi_dll_characteristics_ex - .L_head // FileOffset +#endif + .set .Lefi_debug_table_size, . - .Lefi_debug_table + .previous #endif // Section table -section_table: +.Lsection_table: .ascii ".text\0\0\0" - .long __initdata_begin - efi_header_end // VirtualSize - .long efi_header_end - _head // VirtualAddress - .long __initdata_begin - efi_header_end // SizeOfRawData - .long efi_header_end - _head // PointerToRawData + .long __initdata_begin - .Lefi_header_end // VirtualSize + .long .Lefi_header_end - .L_head // VirtualAddress + .long __initdata_begin - .Lefi_header_end // SizeOfRawData + .long .Lefi_header_end - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -86,9 +142,9 @@ section_table: .ascii ".data\0\0\0" .long __pecoff_data_size // VirtualSize - .long __initdata_begin - _head // VirtualAddress + .long __initdata_begin - .L_head // VirtualAddress .long __pecoff_data_rawsize // SizeOfRawData - .long __initdata_begin - _head // PointerToRawData + .long __initdata_begin - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -98,37 +154,10 @@ section_table: IMAGE_SCN_MEM_READ | \ IMAGE_SCN_MEM_WRITE // Characteristics - .set section_count, (. - section_table) / 40 + .set .Lsection_count, (. - .Lsection_table) / 40 #ifdef CONFIG_DEBUG_EFI - /* - * The debug table is referenced via its Relative Virtual Address (RVA), - * which is only defined for those parts of the image that are covered - * by a section declaration. Since this header is not covered by any - * section, the debug table must be emitted elsewhere. So stick it in - * the .init.rodata section instead. - * - * Note that the EFI debug entry itself may legally have a zero RVA, - * which means we can simply put it right after the section headers. - */ - __INITRODATA - - .align 2 -efi_debug_table: - // EFI_IMAGE_DEBUG_DIRECTORY_ENTRY - .long 0 // Characteristics - .long 0 // TimeDateStamp - .short 0 // MajorVersion - .short 0 // MinorVersion - .long IMAGE_DEBUG_TYPE_CODEVIEW // Type - .long efi_debug_entry_size // SizeOfData - .long 0 // RVA - .long efi_debug_entry - _head // FileOffset - - .set efi_debug_table_size, . - efi_debug_table - .previous - -efi_debug_entry: +.Lefi_debug_entry: // EFI_IMAGE_DEBUG_CODEVIEW_NB10_ENTRY .ascii "NB10" // Signature .long 0 // Unknown @@ -137,16 +166,16 @@ efi_debug_entry: .asciz VMLINUX_PATH - .set efi_debug_entry_size, . - efi_debug_entry + .set .Lefi_debug_entry_size, . - .Lefi_debug_entry +#endif +#ifdef CONFIG_ARM64_BTI_KERNEL +.Lefi_dll_characteristics_ex: + .long IMAGE_DLLCHARACTERISTICS_EX_FORWARD_CFI_COMPAT #endif - /* - * EFI will load .text onwards at the 4k section alignment - * described in the PE/COFF header. To ensure that instruction - * sequences using an adrp and a :lo12: immediate will function - * correctly at this alignment, we must ensure that .text is - * placed at a 4k boundary in the Image to begin with. - */ - .align 12 -efi_header_end: + .balign SEGMENT_ALIGN +.Lefi_header_end: +#else + .set .Lpe_header_offset, 0x0 +#endif .endm diff --git a/arch/arm64/kernel/efi-rt-wrapper.S b/arch/arm64/kernel/efi-rt-wrapper.S index 3fc71106cb2b..e8ae803662cf 100644 --- a/arch/arm64/kernel/efi-rt-wrapper.S +++ b/arch/arm64/kernel/efi-rt-wrapper.S @@ -4,9 +4,10 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> -ENTRY(__efi_rt_asm_wrapper) - stp x29, x30, [sp, #-32]! +SYM_FUNC_START(__efi_rt_asm_wrapper) + stp x29, x30, [sp, #-112]! mov x29, sp /* @@ -17,6 +18,22 @@ ENTRY(__efi_rt_asm_wrapper) stp x1, x18, [sp, #16] /* + * Preserve all callee saved registers and preserve the stack pointer + * value at the base of the EFI runtime stack so we can recover from + * synchronous exceptions occurring while executing the firmware + * routines. + */ + stp x19, x20, [sp, #32] + stp x21, x22, [sp, #48] + stp x23, x24, [sp, #64] + stp x25, x26, [sp, #80] + stp x27, x28, [sp, #96] + + ldr_l x16, efi_rt_stack_top + mov sp, x16 + stp x18, x29, [sp, #-16]! + + /* * We are lucky enough that no EFI runtime services take more than * 5 arguments, so all are passed in registers rather than via the * stack. @@ -29,10 +46,42 @@ ENTRY(__efi_rt_asm_wrapper) mov x4, x6 blr x8 + mov x16, sp + mov sp, x29 + str xzr, [x16, #8] // clear recorded task SP value + ldp x1, x2, [sp, #16] cmp x2, x18 - ldp x29, x30, [sp], #32 + ldp x29, x30, [sp], #112 b.ne 0f ret -0: b efi_handle_corrupted_x18 // tail call -ENDPROC(__efi_rt_asm_wrapper) +0: + /* + * With CONFIG_SHADOW_CALL_STACK, the kernel uses x18 to store a + * shadow stack pointer, which we need to restore before returning to + * potentially instrumented code. This is safe because the wrapper is + * called with preemption disabled and a separate shadow stack is used + * for interrupts. + */ +#ifdef CONFIG_SHADOW_CALL_STACK + ldr_l x18, efi_rt_stack_top + ldr x18, [x18, #-16] +#endif + + b efi_handle_corrupted_x18 // tail call +SYM_FUNC_END(__efi_rt_asm_wrapper) + +SYM_CODE_START(__efi_rt_asm_recover) + mov sp, x30 + + ldr_l x16, efi_rt_stack_top // clear recorded task SP value + str xzr, [x16, #-8] + + ldp x19, x20, [sp, #32] + ldp x21, x22, [sp, #48] + ldp x23, x24, [sp, #64] + ldp x25, x26, [sp, #80] + ldp x27, x28, [sp, #96] + ldp x29, x30, [sp], #112 + ret +SYM_CODE_END(__efi_rt_asm_recover) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index d0cf596db82c..0228001347be 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -9,8 +9,18 @@ #include <linux/efi.h> #include <linux/init.h> +#include <linux/screen_info.h> #include <asm/efi.h> +#include <asm/stacktrace.h> + +static bool region_is_misaligned(const efi_memory_desc_t *md) +{ + if (PAGE_SIZE == EFI_PAGE_SIZE) + return false; + return !PAGE_ALIGNED(md->phys_addr) || + !PAGE_ALIGNED(md->num_pages << EFI_PAGE_SHIFT); +} /* * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be @@ -25,14 +35,22 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) if (type == EFI_MEMORY_MAPPED_IO) return PROT_DEVICE_nGnRE; - if (WARN_ONCE(!PAGE_ALIGNED(md->phys_addr), - "UEFI Runtime regions are not aligned to 64 KB -- buggy firmware?")) + if (region_is_misaligned(md)) { + static bool __initdata code_is_misaligned; + /* - * If the region is not aligned to the page size of the OS, we - * can not use strict permissions, since that would also affect - * the mapping attributes of the adjacent regions. + * Regions that are not aligned to the OS page size cannot be + * mapped with strict permissions, as those might interfere + * with the permissions that are needed by the adjacent + * region's mapping. However, if we haven't encountered any + * misaligned runtime code regions so far, we can safely use + * non-executable permissions for non-code regions. */ - return pgprot_val(PAGE_KERNEL_EXEC); + code_is_misaligned |= (type == EFI_RUNTIME_SERVICES_CODE); + + return code_is_misaligned ? pgprot_val(PAGE_KERNEL_EXEC) + : pgprot_val(PAGE_KERNEL); + } /* R-- */ if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) == @@ -53,28 +71,22 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) return pgprot_val(PAGE_KERNEL_EXEC); } -/* we will fill this structure from the stub, so don't put it in .bss */ -struct screen_info screen_info __section(.data); - int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { pteval_t prot_val = create_mapping_protection(md); bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_RUNTIME_SERVICES_DATA); - if (!PAGE_ALIGNED(md->phys_addr) || - !PAGE_ALIGNED(md->num_pages << EFI_PAGE_SHIFT)) { - /* - * If the end address of this region is not aligned to page - * size, the mapping is rounded up, and may end up sharing a - * page frame with the next UEFI memory region. If we create - * a block entry now, we may need to split it again when mapping - * the next region, and support for that is going to be removed - * from the MMU routines. So avoid block mappings altogether in - * that case. - */ + /* + * If this region is not aligned to the page size used by the OS, the + * mapping will be rounded outwards, and may end up sharing a page + * frame with an adjacent runtime memory region. Given that the page + * table descriptor covering the shared page will be rewritten when the + * adjacent region gets mapped, we must avoid block mappings here so we + * don't have to worry about splitting them when that happens. + */ + if (region_is_misaligned(md)) page_mappings_only = true; - } create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, @@ -82,25 +94,39 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) return 0; } +struct set_perm_data { + const efi_memory_desc_t *md; + bool has_bti; +}; + static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { - efi_memory_desc_t *md = data; + struct set_perm_data *spd = data; + const efi_memory_desc_t *md = spd->md; pte_t pte = READ_ONCE(*ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); if (md->attribute & EFI_MEMORY_XP) pte = set_pte_bit(pte, __pgprot(PTE_PXN)); + else if (system_supports_bti_kernel() && spd->has_bti) + pte = set_pte_bit(pte, __pgprot(PTE_GP)); set_pte(ptep, pte); return 0; } int __init efi_set_mapping_permissions(struct mm_struct *mm, - efi_memory_desc_t *md) + efi_memory_desc_t *md, + bool has_bti) { + struct set_perm_data data = { md, has_bti }; + BUG_ON(md->type != EFI_RUNTIME_SERVICES_CODE && md->type != EFI_RUNTIME_SERVICES_DATA); + if (region_is_misaligned(md)) + return 0; + /* * Calling apply_to_page_range() is only safe on regions that are * guaranteed to be mapped down to pages. Since we are only called @@ -110,7 +136,7 @@ int __init efi_set_mapping_permissions(struct mm_struct *mm, */ return apply_to_page_range(mm, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, - set_permissions, md); + set_permissions, &data); } /* @@ -127,3 +153,66 @@ asmlinkage efi_status_t efi_handle_corrupted_x18(efi_status_t s, const char *f) pr_err_ratelimited(FW_BUG "register x18 corrupted by EFI %s\n", f); return s; } + +static DEFINE_RAW_SPINLOCK(efi_rt_lock); + +void arch_efi_call_virt_setup(void) +{ + efi_virtmap_load(); + __efi_fpsimd_begin(); + raw_spin_lock(&efi_rt_lock); +} + +void arch_efi_call_virt_teardown(void) +{ + raw_spin_unlock(&efi_rt_lock); + __efi_fpsimd_end(); + efi_virtmap_unload(); +} + +asmlinkage u64 *efi_rt_stack_top __ro_after_init; + +asmlinkage efi_status_t __efi_rt_asm_recover(void); + +bool efi_runtime_fixup_exception(struct pt_regs *regs, const char *msg) +{ + /* Check whether the exception occurred while running the firmware */ + if (!current_in_efi() || regs->pc >= TASK_SIZE_64) + return false; + + pr_err(FW_BUG "Unable to handle %s in EFI runtime service\n", msg); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + + regs->regs[0] = EFI_ABORTED; + regs->regs[30] = efi_rt_stack_top[-1]; + regs->pc = (u64)__efi_rt_asm_recover; + + if (IS_ENABLED(CONFIG_SHADOW_CALL_STACK)) + regs->regs[18] = efi_rt_stack_top[-2]; + + return true; +} + +/* EFI requires 8 KiB of stack space for runtime services */ +static_assert(THREAD_SIZE >= SZ_8K); + +static int __init arm64_efi_rt_init(void) +{ + void *p; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return 0; + + p = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, GFP_KERNEL, + NUMA_NO_NODE, &&l); +l: if (!p) { + pr_warn("Failed to allocate EFI runtime stack\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return -ENOMEM; + } + + efi_rt_stack_top = p + THREAD_SIZE; + return 0; +} +core_initcall(arm64_efi_rt_init); diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c new file mode 100644 index 000000000000..2e94d20c4ac7 --- /dev/null +++ b/arch/arm64/kernel/elfcore.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/coredump.h> +#include <linux/elfcore.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +#include <asm/cpufeature.h> +#include <asm/mte.h> + +#define for_each_mte_vma(cprm, i, m) \ + if (system_supports_mte()) \ + for (i = 0, m = cprm->vma_meta; \ + i < cprm->vma_count; \ + i++, m = cprm->vma_meta + i) \ + if (m->flags & VM_MTE) + +static unsigned long mte_vma_tag_dump_size(struct core_vma_metadata *m) +{ + return (m->dump_size >> PAGE_SHIFT) * MTE_PAGE_TAG_STORAGE; +} + +/* Derived from dump_user_range(); start/end must be page-aligned */ +static int mte_dump_tag_range(struct coredump_params *cprm, + unsigned long start, unsigned long len) +{ + int ret = 1; + unsigned long addr; + void *tags = NULL; + + for (addr = start; addr < start + len; addr += PAGE_SIZE) { + struct page *page = get_dump_page(addr); + + /* + * get_dump_page() returns NULL when encountering an empty + * page table entry that would otherwise have been filled with + * the zero page. Skip the equivalent tag dump which would + * have been all zeros. + */ + if (!page) { + dump_skip(cprm, MTE_PAGE_TAG_STORAGE); + continue; + } + + /* + * Pages mapped in user space as !pte_access_permitted() (e.g. + * PROT_EXEC only) may not have the PG_mte_tagged flag set. + */ + if (!page_mte_tagged(page)) { + put_page(page); + dump_skip(cprm, MTE_PAGE_TAG_STORAGE); + continue; + } + + if (!tags) { + tags = mte_allocate_tag_storage(); + if (!tags) { + put_page(page); + ret = 0; + break; + } + } + + mte_save_page_tags(page_address(page), tags); + put_page(page); + if (!dump_emit(cprm, tags, MTE_PAGE_TAG_STORAGE)) { + ret = 0; + break; + } + } + + if (tags) + mte_free_tag_storage(tags); + + return ret; +} + +Elf_Half elf_core_extra_phdrs(struct coredump_params *cprm) +{ + int i; + struct core_vma_metadata *m; + int vma_count = 0; + + for_each_mte_vma(cprm, i, m) + vma_count++; + + return vma_count; +} + +int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) +{ + int i; + struct core_vma_metadata *m; + + for_each_mte_vma(cprm, i, m) { + struct elf_phdr phdr; + + phdr.p_type = PT_AARCH64_MEMTAG_MTE; + phdr.p_offset = offset; + phdr.p_vaddr = m->start; + phdr.p_paddr = 0; + phdr.p_filesz = mte_vma_tag_dump_size(m); + phdr.p_memsz = m->end - m->start; + offset += phdr.p_filesz; + phdr.p_flags = 0; + phdr.p_align = 0; + + if (!dump_emit(cprm, &phdr, sizeof(phdr))) + return 0; + } + + return 1; +} + +size_t elf_core_extra_data_size(struct coredump_params *cprm) +{ + int i; + struct core_vma_metadata *m; + size_t data_size = 0; + + for_each_mte_vma(cprm, i, m) + data_size += mte_vma_tag_dump_size(m); + + return data_size; +} + +int elf_core_write_extra_data(struct coredump_params *cprm) +{ + int i; + struct core_vma_metadata *m; + + for_each_mte_vma(cprm, i, m) { + if (!mte_dump_tag_range(cprm, m->start, m->dump_size)) + return 0; + } + + return 1; +} diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 5dce5e56995a..0fc94207e69a 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -6,67 +6,449 @@ */ #include <linux/context_tracking.h> +#include <linux/kasan.h> +#include <linux/linkage.h> +#include <linux/lockdep.h> #include <linux/ptrace.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> #include <linux/thread_info.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/esr.h> #include <asm/exception.h> +#include <asm/irq_regs.h> #include <asm/kprobes.h> #include <asm/mmu.h> +#include <asm/processor.h> +#include <asm/sdei.h> +#include <asm/stacktrace.h> #include <asm/sysreg.h> +#include <asm/system_misc.h> -static void notrace el1_abort(struct pt_regs *regs, unsigned long esr) +/* + * Handle IRQ/context state management when entering from kernel mode. + * Before this function is called it is not safe to call regular kernel code, + * instrumentable code, or any code which may trigger an exception. + * + * This is intended to match the logic in irqentry_enter(), handling the kernel + * mode transitions only. + */ +static __always_inline void __enter_from_kernel_mode(struct pt_regs *regs) +{ + regs->exit_rcu = false; + + if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + lockdep_hardirqs_off(CALLER_ADDR0); + ct_irq_enter(); + trace_hardirqs_off_finish(); + + regs->exit_rcu = true; + return; + } + + lockdep_hardirqs_off(CALLER_ADDR0); + rcu_irq_enter_check_tick(); + trace_hardirqs_off_finish(); +} + +static void noinstr enter_from_kernel_mode(struct pt_regs *regs) +{ + __enter_from_kernel_mode(regs); + mte_check_tfsr_entry(); + mte_disable_tco_entry(current); +} + +/* + * Handle IRQ/context state management when exiting to kernel mode. + * After this function returns it is not safe to call regular kernel code, + * instrumentable code, or any code which may trigger an exception. + * + * This is intended to match the logic in irqentry_exit(), handling the kernel + * mode transitions only, and with preemption handled elsewhere. + */ +static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) +{ + lockdep_assert_irqs_disabled(); + + if (interrupts_enabled(regs)) { + if (regs->exit_rcu) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + ct_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + trace_hardirqs_on(); + } else { + if (regs->exit_rcu) + ct_irq_exit(); + } +} + +static void noinstr exit_to_kernel_mode(struct pt_regs *regs) +{ + mte_check_tfsr_exit(); + __exit_to_kernel_mode(regs); +} + +/* + * Handle IRQ/context state management when entering from user mode. + * Before this function is called it is not safe to call regular kernel code, + * instrumentable code, or any code which may trigger an exception. + */ +static __always_inline void __enter_from_user_mode(void) +{ + lockdep_hardirqs_off(CALLER_ADDR0); + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit_irqoff(); + trace_hardirqs_off_finish(); + mte_disable_tco_entry(current); +} + +static __always_inline void enter_from_user_mode(struct pt_regs *regs) +{ + __enter_from_user_mode(); +} + +/* + * Handle IRQ/context state management when exiting to user mode. + * After this function returns it is not safe to call regular kernel code, + * instrumentable code, or any code which may trigger an exception. + */ +static __always_inline void __exit_to_user_mode(void) +{ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + user_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +{ + unsigned long flags; + + local_daif_mask(); + + flags = read_thread_flags(); + if (unlikely(flags & _TIF_WORK_MASK)) + do_notify_resume(regs, flags); + + lockdep_sys_exit(); +} + +static __always_inline void exit_to_user_mode(struct pt_regs *regs) +{ + exit_to_user_mode_prepare(regs); + mte_check_tfsr_exit(); + __exit_to_user_mode(); +} + +asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) +{ + exit_to_user_mode(regs); +} + +/* + * Handle IRQ/context state management when entering an NMI from user/kernel + * mode. Before this function is called it is not safe to call regular kernel + * code, instrumentable code, or any code which may trigger an exception. + */ +static void noinstr arm64_enter_nmi(struct pt_regs *regs) +{ + regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + ct_nmi_enter(); + + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); +} + +/* + * Handle IRQ/context state management when exiting an NMI from user/kernel + * mode. After this function returns it is not safe to call regular kernel + * code, instrumentable code, or any code which may trigger an exception. + */ +static void noinstr arm64_exit_nmi(struct pt_regs *regs) +{ + bool restore = regs->lockdep_hardirqs; + + ftrace_nmi_exit(); + if (restore) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + } + + ct_nmi_exit(); + lockdep_hardirq_exit(); + if (restore) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); +} + +/* + * Handle IRQ/context state management when entering a debug exception from + * kernel mode. Before this function is called it is not safe to call regular + * kernel code, instrumentable code, or any code which may trigger an exception. + */ +static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) +{ + regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + + lockdep_hardirqs_off(CALLER_ADDR0); + ct_nmi_enter(); + + trace_hardirqs_off_finish(); +} + +/* + * Handle IRQ/context state management when exiting a debug exception from + * kernel mode. After this function returns it is not safe to call regular + * kernel code, instrumentable code, or any code which may trigger an exception. + */ +static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) +{ + bool restore = regs->lockdep_hardirqs; + + if (restore) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + } + + ct_nmi_exit(); + if (restore) + lockdep_hardirqs_on(CALLER_ADDR0); +} + +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +#define need_irq_preemption() \ + (static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) +#else +#define need_irq_preemption() (IS_ENABLED(CONFIG_PREEMPTION)) +#endif + +static void __sched arm64_preempt_schedule_irq(void) +{ + if (!need_irq_preemption()) + return; + + /* + * Note: thread_info::preempt_count includes both thread_info::count + * and thread_info::need_resched, and is not equivalent to + * preempt_count(). + */ + if (READ_ONCE(current_thread_info()->preempt_count) != 0) + return; + + /* + * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC + * priority masking is used the GIC irqchip driver will clear DAIF.IF + * using gic_arch_enable_irqs() for normal IRQs. If anything is set in + * DAIF we must have handled an NMI, so skip preemption. + */ + if (system_uses_irq_prio_masking() && read_sysreg(daif)) + return; + + /* + * Preempting a task from an IRQ means we leave copies of PSTATE + * on the stack. cpufeature's enable calls may modify PSTATE, but + * resuming one of these preempted tasks would undo those changes. + * + * Only allow a task to be preempted once cpufeatures have been + * enabled. + */ + if (system_capabilities_finalized()) + preempt_schedule_irq(); +} + +static void do_interrupt_handler(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + if (on_thread_stack()) + call_on_irq_stack(regs, handler); + else + handler(regs); + + set_irq_regs(old_regs); +} + +extern void (*handle_arch_irq)(struct pt_regs *); +extern void (*handle_arch_fiq)(struct pt_regs *); + +static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, + unsigned long esr) +{ + arm64_enter_nmi(regs); + + console_verbose(); + + pr_crit("Unhandled %s exception on CPU%d, ESR 0x%016lx -- %s\n", + vector, smp_processor_id(), esr, + esr_get_class_string(esr)); + + __show_regs(regs); + panic("Unhandled exception"); +} + +#define UNHANDLED(el, regsize, vector) \ +asmlinkage void noinstr el##_##regsize##_##vector##_handler(struct pt_regs *regs) \ +{ \ + const char *desc = #regsize "-bit " #el " " #vector; \ + __panic_unhandled(regs, desc, read_sysreg(esr_el1)); \ +} + +#ifdef CONFIG_ARM64_ERRATUM_1463225 +static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); + +static void cortex_a76_erratum_1463225_svc_handler(void) +{ + u32 reg, val; + + if (!unlikely(test_thread_flag(TIF_SINGLESTEP))) + return; + + if (!unlikely(this_cpu_has_cap(ARM64_WORKAROUND_1463225))) + return; + + __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 1); + reg = read_sysreg(mdscr_el1); + val = reg | DBG_MDSCR_SS | DBG_MDSCR_KDE; + write_sysreg(val, mdscr_el1); + asm volatile("msr daifclr, #8"); + isb(); + + /* We will have taken a single-step exception by this point */ + + write_sysreg(reg, mdscr_el1); + __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 0); +} + +static __always_inline bool +cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) +{ + if (!__this_cpu_read(__in_cortex_a76_erratum_1463225_wa)) + return false; + + /* + * We've taken a dummy step exception from the kernel to ensure + * that interrupts are re-enabled on the syscall path. Return back + * to cortex_a76_erratum_1463225_svc_handler() with debug exceptions + * masked so that we can safely restore the mdscr and get on with + * handling the syscall. + */ + regs->pstate |= PSR_D_BIT; + return true; +} +#else /* CONFIG_ARM64_ERRATUM_1463225 */ +static void cortex_a76_erratum_1463225_svc_handler(void) { } +static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_ARM64_ERRATUM_1463225 */ + +/* + * As per the ABI exit SME streaming mode and clear the SVE state not + * shared with FPSIMD on syscall entry. + */ +static inline void fp_user_discard(void) +{ + /* + * If SME is active then exit streaming mode. If ZA is active + * then flush the SVE registers but leave userspace access to + * both SVE and SME enabled, otherwise disable SME for the + * task and fall through to disabling SVE too. This means + * that after a syscall we never have any streaming mode + * register state to track, if this changes the KVM code will + * need updating. + */ + if (system_supports_sme()) + sme_smstop_sm(); + + if (!system_supports_sve()) + return; + + if (test_thread_flag(TIF_SVE)) { + unsigned int sve_vq_minus_one; + + sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1; + sve_flush_live(true, sve_vq_minus_one); + } +} + +UNHANDLED(el1t, 64, sync) +UNHANDLED(el1t, 64, irq) +UNHANDLED(el1t, 64, fiq) +UNHANDLED(el1t, 64, error) + +static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + enter_from_kernel_mode(regs); local_daif_inherit(regs); - far = untagged_addr(far); do_mem_abort(far, esr, regs); + local_daif_mask(); + exit_to_kernel_mode(regs); } -NOKPROBE_SYMBOL(el1_abort); -static void notrace el1_pc(struct pt_regs *regs, unsigned long esr) +static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + enter_from_kernel_mode(regs); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); + local_daif_mask(); + exit_to_kernel_mode(regs); } -NOKPROBE_SYMBOL(el1_pc); -static void el1_undef(struct pt_regs *regs) +static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) { + enter_from_kernel_mode(regs); local_daif_inherit(regs); - do_undefinstr(regs); + do_el1_undef(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); } -NOKPROBE_SYMBOL(el1_undef); -static void el1_inv(struct pt_regs *regs, unsigned long esr) +static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) { + enter_from_kernel_mode(regs); local_daif_inherit(regs); - bad_mode(regs, 0, esr); + do_el1_bti(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); } -NOKPROBE_SYMBOL(el1_inv); -static void notrace el1_dbg(struct pt_regs *regs, unsigned long esr) +static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - /* - * The CPU masked interrupts, and we are leaving them masked during - * do_debug_exception(). Update PMR as if we had called - * local_mask_daif(). - */ - if (system_uses_irq_prio_masking()) - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + arm64_enter_el1_dbg(regs); + if (!cortex_a76_erratum_1463225_debug_handler(regs)) + do_debug_exception(far, esr, regs); + arm64_exit_el1_dbg(regs); +} - do_debug_exception(far, esr, regs); +static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_fpac(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); } -NOKPROBE_SYMBOL(el1_dbg); -asmlinkage void notrace el1_sync_handler(struct pt_regs *regs) +asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -84,7 +466,10 @@ asmlinkage void notrace el1_sync_handler(struct pt_regs *regs) break; case ESR_ELx_EC_SYS64: case ESR_ELx_EC_UNKNOWN: - el1_undef(regs); + el1_undef(regs, esr); + break; + case ESR_ELx_EC_BTI: + el1_bti(regs, esr); break; case ESR_ELx_EC_BREAKPT_CUR: case ESR_ELx_EC_SOFTSTP_CUR: @@ -92,24 +477,77 @@ asmlinkage void notrace el1_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BRK64: el1_dbg(regs, esr); break; + case ESR_ELx_EC_FPAC: + el1_fpac(regs, esr); + break; default: - el1_inv(regs, esr); - }; + __panic_unhandled(regs, "64-bit el1h sync", esr); + } +} + +static __always_inline void __el1_pnmi(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + arm64_enter_nmi(regs); + do_interrupt_handler(regs, handler); + arm64_exit_nmi(regs); +} + +static __always_inline void __el1_irq(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + enter_from_kernel_mode(regs); + + irq_enter_rcu(); + do_interrupt_handler(regs, handler); + irq_exit_rcu(); + + arm64_preempt_schedule_irq(); + + exit_to_kernel_mode(regs); +} +static void noinstr el1_interrupt(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + write_sysreg(DAIF_PROCCTX_NOIRQ, daif); + + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) + __el1_pnmi(regs, handler); + else + __el1_irq(regs, handler); +} + +asmlinkage void noinstr el1h_64_irq_handler(struct pt_regs *regs) +{ + el1_interrupt(regs, handle_arch_irq); +} + +asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) +{ + el1_interrupt(regs, handle_arch_fiq); +} + +asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + local_daif_restore(DAIF_ERRCTX); + arm64_enter_nmi(regs); + do_serror(regs, esr); + arm64_exit_nmi(regs); } -NOKPROBE_SYMBOL(el1_sync_handler); -static void notrace el0_da(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - user_exit_irqoff(); + enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); - far = untagged_addr(far); do_mem_abort(far, esr, regs); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_da); -static void notrace el0_ia(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -121,105 +559,135 @@ static void notrace el0_ia(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(far)) arm64_apply_bp_hardening(); - user_exit_irqoff(); + enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_ia); -static void notrace el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_acc(esr, regs); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_fpsimd_acc); -static void notrace el0_sve_acc(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sve_acc(esr, regs); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_sve_acc); -static void notrace el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sme_acc(esr, regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_exc(esr, regs); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_fpsimd_exc); -static void notrace el0_sys(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); - do_sysinstr(esr, regs); + do_el0_sys(esr, regs); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_sys); -static void notrace el0_pc(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); if (!is_ttbr0_addr(instruction_pointer(regs))) arm64_apply_bp_hardening(); - user_exit_irqoff(); + enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(far, esr, regs); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_pc); -static void notrace el0_sp(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); - local_daif_restore(DAIF_PROCCTX_NOIRQ); + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(regs->sp, esr, regs); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_sp); -static void notrace el0_undef(struct pt_regs *regs) +static void noinstr el0_undef(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); - do_undefinstr(regs); + do_el0_undef(regs, esr); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_undef); -static void notrace el0_inv(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_bti(struct pt_regs *regs) { - user_exit_irqoff(); + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_bti(regs); + exit_to_user_mode(regs); +} + +static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_mops(regs, esr); + exit_to_user_mode(regs); +} + +static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); bad_el0_sync(regs, 0, esr); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_inv); -static void notrace el0_dbg(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr) { /* Only watchpoints write FAR_EL1, otherwise its UNKNOWN */ unsigned long far = read_sysreg(far_el1); - if (system_uses_irq_prio_masking()) - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - - user_exit_irqoff(); + enter_from_user_mode(regs); do_debug_exception(far, esr, regs); - local_daif_restore(DAIF_PROCCTX_NOIRQ); + local_daif_restore(DAIF_PROCCTX); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_dbg); -static void notrace el0_svc(struct pt_regs *regs) +static void noinstr el0_svc(struct pt_regs *regs) { - if (system_uses_irq_prio_masking()) - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + enter_from_user_mode(regs); + cortex_a76_erratum_1463225_svc_handler(); + fp_user_discard(); + local_daif_restore(DAIF_PROCCTX); + do_el0_svc(regs); + exit_to_user_mode(regs); +} - el0_svc_handler(regs); +static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_fpac(regs, esr); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_svc); -asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) +asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -239,6 +707,9 @@ asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_SVE: el0_sve_acc(regs, esr); break; + case ESR_ELx_EC_SME: + el0_sme_acc(regs, esr); + break; case ESR_ELx_EC_FP_EXC64: el0_fpsimd_exc(regs, esr); break; @@ -253,7 +724,13 @@ asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) el0_pc(regs, esr); break; case ESR_ELx_EC_UNKNOWN: - el0_undef(regs); + el0_undef(regs, esr); + break; + case ESR_ELx_EC_BTI: + el0_bti(regs); + break; + case ESR_ELx_EC_MOPS: + el0_mops(regs, esr); break; case ESR_ELx_EC_BREAKPT_LOW: case ESR_ELx_EC_SOFTSTP_LOW: @@ -261,31 +738,88 @@ asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BRK64: el0_dbg(regs, esr); break; + case ESR_ELx_EC_FPAC: + el0_fpac(regs, esr); + break; default: el0_inv(regs, esr); } } -NOKPROBE_SYMBOL(el0_sync_handler); -#ifdef CONFIG_COMPAT -static void notrace el0_cp15(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_interrupt(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) { - user_exit_irqoff(); + enter_from_user_mode(regs); + + write_sysreg(DAIF_PROCCTX_NOIRQ, daif); + + if (regs->pc & BIT(55)) + arm64_apply_bp_hardening(); + + irq_enter_rcu(); + do_interrupt_handler(regs, handler); + irq_exit_rcu(); + + exit_to_user_mode(regs); +} + +static void noinstr __el0_irq_handler_common(struct pt_regs *regs) +{ + el0_interrupt(regs, handle_arch_irq); +} + +asmlinkage void noinstr el0t_64_irq_handler(struct pt_regs *regs) +{ + __el0_irq_handler_common(regs); +} + +static void noinstr __el0_fiq_handler_common(struct pt_regs *regs) +{ + el0_interrupt(regs, handle_arch_fiq); +} + +asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) +{ + __el0_fiq_handler_common(regs); +} + +static void noinstr __el0_error_handler_common(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + enter_from_user_mode(regs); + local_daif_restore(DAIF_ERRCTX); + arm64_enter_nmi(regs); + do_serror(regs, esr); + arm64_exit_nmi(regs); local_daif_restore(DAIF_PROCCTX); - do_cp15instr(esr, regs); + exit_to_user_mode(regs); +} + +asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) +{ + __el0_error_handler_common(regs); } -NOKPROBE_SYMBOL(el0_cp15); -static void notrace el0_svc_compat(struct pt_regs *regs) +#ifdef CONFIG_COMPAT +static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) { - if (system_uses_irq_prio_masking()) - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_cp15(esr, regs); + exit_to_user_mode(regs); +} - el0_svc_compat_handler(regs); +static void noinstr el0_svc_compat(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + cortex_a76_erratum_1463225_svc_handler(); + local_daif_restore(DAIF_PROCCTX); + do_el0_svc_compat(regs); + exit_to_user_mode(regs); } -NOKPROBE_SYMBOL(el0_svc_compat); -asmlinkage void notrace el0_sync_compat_handler(struct pt_regs *regs) +asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -312,7 +846,7 @@ asmlinkage void notrace el0_sync_compat_handler(struct pt_regs *regs) case ESR_ELx_EC_CP14_MR: case ESR_ELx_EC_CP14_LS: case ESR_ELx_EC_CP14_64: - el0_undef(regs); + el0_undef(regs, esr); break; case ESR_ELx_EC_CP15_32: case ESR_ELx_EC_CP15_64: @@ -328,5 +862,71 @@ asmlinkage void notrace el0_sync_compat_handler(struct pt_regs *regs) el0_inv(regs, esr); } } -NOKPROBE_SYMBOL(el0_sync_compat_handler); + +asmlinkage void noinstr el0t_32_irq_handler(struct pt_regs *regs) +{ + __el0_irq_handler_common(regs); +} + +asmlinkage void noinstr el0t_32_fiq_handler(struct pt_regs *regs) +{ + __el0_fiq_handler_common(regs); +} + +asmlinkage void noinstr el0t_32_error_handler(struct pt_regs *regs) +{ + __el0_error_handler_common(regs); +} +#else /* CONFIG_COMPAT */ +UNHANDLED(el0t, 32, sync) +UNHANDLED(el0t, 32, irq) +UNHANDLED(el0t, 32, fiq) +UNHANDLED(el0t, 32, error) #endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_VMAP_STACK +asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + unsigned long far = read_sysreg(far_el1); + + arm64_enter_nmi(regs); + panic_bad_stack(regs, esr, far); +} +#endif /* CONFIG_VMAP_STACK */ + +#ifdef CONFIG_ARM_SDE_INTERFACE +asmlinkage noinstr unsigned long +__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) +{ + unsigned long ret; + + /* + * We didn't take an exception to get here, so the HW hasn't + * set/cleared bits in PSTATE that we may rely on. + * + * The original SDEI spec (ARM DEN 0054A) can be read ambiguously as to + * whether PSTATE bits are inherited unchanged or generated from + * scratch, and the TF-A implementation always clears PAN and always + * clears UAO. There are no other known implementations. + * + * Subsequent revisions (ARM DEN 0054B) follow the usual rules for how + * PSTATE is modified upon architectural exceptions, and so PAN is + * either inherited or set per SCTLR_ELx.SPAN, and UAO is always + * cleared. + * + * We must explicitly reset PAN to the expected state, including + * clearing it when the host isn't using it, in case a VM had it set. + */ + if (system_uses_hw_pan()) + set_pstate_pan(1); + else if (cpu_has_pan()) + set_pstate_pan(0); + + arm64_enter_nmi(regs); + ret = do_sdei_event(regs, arg); + arm64_exit_nmi(regs); + + return ret; +} +#endif /* CONFIG_ARM_SDE_INTERFACE */ diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 0f24eae8f3cc..6325db1a2179 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -16,34 +16,119 @@ * * x0 - pointer to struct fpsimd_state */ -ENTRY(fpsimd_save_state) +SYM_FUNC_START(fpsimd_save_state) fpsimd_save x0, 8 ret -ENDPROC(fpsimd_save_state) +SYM_FUNC_END(fpsimd_save_state) /* * Load the FP registers. * * x0 - pointer to struct fpsimd_state */ -ENTRY(fpsimd_load_state) +SYM_FUNC_START(fpsimd_load_state) fpsimd_restore x0, 8 ret -ENDPROC(fpsimd_load_state) +SYM_FUNC_END(fpsimd_load_state) #ifdef CONFIG_ARM64_SVE -ENTRY(sve_save_state) - sve_save 0, x1, 2 + +/* + * Save the SVE state + * + * x0 - pointer to buffer for state + * x1 - pointer to storage for FPSR + * x2 - Save FFR if non-zero + */ +SYM_FUNC_START(sve_save_state) + sve_save 0, x1, x2, 3 ret -ENDPROC(sve_save_state) +SYM_FUNC_END(sve_save_state) -ENTRY(sve_load_state) - sve_load 0, x1, x2, 3, x4 +/* + * Load the SVE state + * + * x0 - pointer to buffer for state + * x1 - pointer to storage for FPSR + * x2 - Restore FFR if non-zero + */ +SYM_FUNC_START(sve_load_state) + sve_load 0, x1, x2, 4 ret -ENDPROC(sve_load_state) +SYM_FUNC_END(sve_load_state) -ENTRY(sve_get_vl) +SYM_FUNC_START(sve_get_vl) _sve_rdvl 0, 1 ret -ENDPROC(sve_get_vl) +SYM_FUNC_END(sve_get_vl) + +SYM_FUNC_START(sve_set_vq) + sve_load_vq x0, x1, x2 + ret +SYM_FUNC_END(sve_set_vq) + +/* + * Zero all SVE registers but the first 128-bits of each vector + * + * VQ must already be configured by caller, any further updates of VQ + * will need to ensure that the register state remains valid. + * + * x0 = include FFR? + * x1 = VQ - 1 + */ +SYM_FUNC_START(sve_flush_live) + cbz x1, 1f // A VQ-1 of 0 is 128 bits so no extra Z state + sve_flush_z +1: sve_flush_p + tbz x0, #0, 2f + sve_flush_ffr +2: ret +SYM_FUNC_END(sve_flush_live) + #endif /* CONFIG_ARM64_SVE */ + +#ifdef CONFIG_ARM64_SME + +SYM_FUNC_START(sme_get_vl) + _sme_rdsvl 0, 1 + ret +SYM_FUNC_END(sme_get_vl) + +SYM_FUNC_START(sme_set_vq) + sme_load_vq x0, x1, x2 + ret +SYM_FUNC_END(sme_set_vq) + +/* + * Save the ZA and ZT state + * + * x0 - pointer to buffer for state + * x1 - number of ZT registers to save + */ +SYM_FUNC_START(sme_save_state) + _sme_rdsvl 2, 1 // x2 = VL/8 + sme_save_za 0, x2, 12 // Leaves x0 pointing to the end of ZA + + cbz x1, 1f + _str_zt 0 +1: + ret +SYM_FUNC_END(sme_save_state) + +/* + * Load the ZA and ZT state + * + * x0 - pointer to buffer for state + * x1 - number of ZT registers to save + */ +SYM_FUNC_START(sme_load_state) + _sme_rdsvl 2, 1 // x2 = VL/8 + sme_load_za 0, x2, 12 // Leaves x0 pointing to the end of ZA + + cbz x1, 1f + _ldr_zt 0 +1: + ret +SYM_FUNC_END(sme_load_state) + +#endif /* CONFIG_ARM64_SME */ diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 7d02f9966d34..f0c16640ef21 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -7,97 +7,108 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/asm-offsets.h> #include <asm/assembler.h> #include <asm/ftrace.h> #include <asm/insn.h> -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS /* * Due to -fpatchable-function-entry=2, the compiler has placed two NOPs before * the regular function prologue. For an enabled callsite, ftrace_init_nop() and * ftrace_make_call() have patched those NOPs to: * * MOV X9, LR - * BL <entry> + * BL ftrace_caller * - * ... where <entry> is either ftrace_caller or ftrace_regs_caller. + * Each instrumented function follows the AAPCS, so here x0-x8 and x18-x30 are + * live (x18 holds the Shadow Call Stack pointer), and x9-x17 are safe to + * clobber. * - * Each instrumented function follows the AAPCS, so here x0-x8 and x19-x30 are - * live, and x9-x18 are safe to clobber. - * - * We save the callsite's context into a pt_regs before invoking any ftrace - * callbacks. So that we can get a sensible backtrace, we create a stack record - * for the callsite and the ftrace entry assembly. This is not sufficient for - * reliable stacktrace: until we create the callsite stack record, its caller - * is missing from the LR and existing chain of frame records. + * We save the callsite's context into a struct ftrace_regs before invoking any + * ftrace callbacks. So that we can get a sensible backtrace, we create frame + * records for the callsite and the ftrace entry assembly. This is not + * sufficient for reliable stacktrace: until we create the callsite stack + * record, its caller is missing from the LR and existing chain of frame + * records. */ - .macro ftrace_regs_entry, allregs=0 - /* Make room for pt_regs, plus a callee frame */ - sub sp, sp, #(S_FRAME_SIZE + 16) - - /* Save function arguments (and x9 for simplicity) */ - stp x0, x1, [sp, #S_X0] - stp x2, x3, [sp, #S_X2] - stp x4, x5, [sp, #S_X4] - stp x6, x7, [sp, #S_X6] - stp x8, x9, [sp, #S_X8] - - /* Optionally save the callee-saved registers, always save the FP */ - .if \allregs == 1 - stp x10, x11, [sp, #S_X10] - stp x12, x13, [sp, #S_X12] - stp x14, x15, [sp, #S_X14] - stp x16, x17, [sp, #S_X16] - stp x18, x19, [sp, #S_X18] - stp x20, x21, [sp, #S_X20] - stp x22, x23, [sp, #S_X22] - stp x24, x25, [sp, #S_X24] - stp x26, x27, [sp, #S_X26] - stp x28, x29, [sp, #S_X28] - .else - str x29, [sp, #S_FP] - .endif - - /* Save the callsite's SP and LR */ - add x10, sp, #(S_FRAME_SIZE + 16) - stp x9, x10, [sp, #S_LR] +SYM_CODE_START(ftrace_caller) + bti c + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + /* + * The literal pointer to the ops is at an 8-byte aligned boundary + * which is either 12 or 16 bytes before the BL instruction in the call + * site. See ftrace_call_adjust() for details. + * + * Therefore here the LR points at `literal + 16` or `literal + 20`, + * and we can find the address of the literal in either case by + * aligning to an 8-byte boundary and subtracting 16. We do the + * alignment first as this allows us to fold the subtraction into the + * LDR. + */ + bic x11, x30, 0x7 + ldr x11, [x11, #-(4 * AARCH64_INSN_SIZE)] // op + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * If the op has a direct call, handle it immediately without + * saving/restoring registers. + */ + ldr x17, [x11, #FTRACE_OPS_DIRECT_CALL] // op->direct_call + cbnz x17, ftrace_caller_direct +#endif +#endif - /* Save the PC after the ftrace callsite */ - str x30, [sp, #S_PC] + /* Save original SP */ + mov x10, sp - /* Create a frame record for the callsite above pt_regs */ - stp x29, x9, [sp, #S_FRAME_SIZE] - add x29, sp, #S_FRAME_SIZE + /* Make room for ftrace regs, plus two frame records */ + sub sp, sp, #(FREGS_SIZE + 32) - /* Create our frame record within pt_regs. */ - stp x29, x30, [sp, #S_STACKFRAME] - add x29, sp, #S_STACKFRAME - .endm + /* Save function arguments */ + stp x0, x1, [sp, #FREGS_X0] + stp x2, x3, [sp, #FREGS_X2] + stp x4, x5, [sp, #FREGS_X4] + stp x6, x7, [sp, #FREGS_X6] + str x8, [sp, #FREGS_X8] -ENTRY(ftrace_regs_caller) - ftrace_regs_entry 1 - b ftrace_common -ENDPROC(ftrace_regs_caller) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + str xzr, [sp, #FREGS_DIRECT_TRAMP] +#endif -ENTRY(ftrace_caller) - ftrace_regs_entry 0 - b ftrace_common -ENDPROC(ftrace_caller) + /* Save the callsite's FP, LR, SP */ + str x29, [sp, #FREGS_FP] + str x9, [sp, #FREGS_LR] + str x10, [sp, #FREGS_SP] -ENTRY(ftrace_common) - sub x0, x30, #AARCH64_INSN_SIZE // ip (callsite's BL insn) - mov x1, x9 // parent_ip (callsite's LR) - ldr_l x2, function_trace_op // op - mov x3, sp // regs + /* Save the PC after the ftrace callsite */ + str x30, [sp, #FREGS_PC] -GLOBAL(ftrace_call) - bl ftrace_stub + /* Create a frame record for the callsite above the ftrace regs */ + stp x29, x9, [sp, #FREGS_SIZE + 16] + add x29, sp, #FREGS_SIZE + 16 -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -GLOBAL(ftrace_graph_call) // ftrace_graph_caller(); - nop // If enabled, this will be replaced - // "b ftrace_graph_caller" + /* Create our frame record above the ftrace regs */ + stp x29, x30, [sp, #FREGS_SIZE] + add x29, sp, #FREGS_SIZE + + /* Prepare arguments for the the tracer func */ + sub x0, x30, #AARCH64_INSN_SIZE // ip (callsite's BL insn) + mov x1, x9 // parent_ip (callsite's LR) + mov x3, sp // regs + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + mov x2, x11 // op + ldr x4, [x2, #FTRACE_OPS_FUNC] // op->func + blr x4 // op->func(ip, parent_ip, op, regs) + +#else + ldr_l x2, function_trace_op // op + +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) + bl ftrace_stub // func(ip, parent_ip, op, regs) #endif /* @@ -105,37 +116,69 @@ GLOBAL(ftrace_graph_call) // ftrace_graph_caller(); * x19-x29 per the AAPCS, and we created frame records upon entry, so we need * to restore x0-x8, x29, and x30. */ -ftrace_common_return: /* Restore function arguments */ - ldp x0, x1, [sp] - ldp x2, x3, [sp, #S_X2] - ldp x4, x5, [sp, #S_X4] - ldp x6, x7, [sp, #S_X6] - ldr x8, [sp, #S_X8] + ldp x0, x1, [sp, #FREGS_X0] + ldp x2, x3, [sp, #FREGS_X2] + ldp x4, x5, [sp, #FREGS_X4] + ldp x6, x7, [sp, #FREGS_X6] + ldr x8, [sp, #FREGS_X8] + + /* Restore the callsite's FP */ + ldr x29, [sp, #FREGS_FP] + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + ldr x17, [sp, #FREGS_DIRECT_TRAMP] + cbnz x17, ftrace_caller_direct_late +#endif - /* Restore the callsite's FP, LR, PC */ - ldr x29, [sp, #S_FP] - ldr x30, [sp, #S_LR] - ldr x9, [sp, #S_PC] + /* Restore the callsite's LR and PC */ + ldr x30, [sp, #FREGS_LR] + ldr x9, [sp, #FREGS_PC] /* Restore the callsite's SP */ - add sp, sp, #S_FRAME_SIZE + 16 + add sp, sp, #FREGS_SIZE + 32 ret x9 -ENDPROC(ftrace_common) -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - ldr x0, [sp, #S_PC] - sub x0, x0, #AARCH64_INSN_SIZE // ip (callsite's BL insn) - add x1, sp, #S_LR // parent_ip (callsite's LR) - ldr x2, [sp, #S_FRAME_SIZE] // parent fp (callsite's FP) - bl prepare_ftrace_return - b ftrace_common_return -ENDPROC(ftrace_graph_caller) -#endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_INNER_LABEL(ftrace_caller_direct_late, SYM_L_LOCAL) + /* + * Head to a direct trampoline in x17 after having run other tracers. + * The ftrace_regs are live, and x0-x8 and FP have been restored. The + * LR, PC, and SP have not been restored. + */ + + /* + * Restore the callsite's LR and PC matching the trampoline calling + * convention. + */ + ldr x9, [sp, #FREGS_LR] + ldr x30, [sp, #FREGS_PC] -#else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ + /* Restore the callsite's SP */ + add sp, sp, #FREGS_SIZE + 32 + +SYM_INNER_LABEL(ftrace_caller_direct, SYM_L_LOCAL) + /* + * Head to a direct trampoline in x17. + * + * We use `BR X17` as this can safely land on a `BTI C` or `PACIASP` in + * the trampoline, and will not unbalance any return stack. + */ + br x17 +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ +SYM_CODE_END(ftrace_caller) + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_CODE_START(ftrace_stub_direct_tramp) + bti c + mov x10, x30 + mov x30, x9 + ret x10 +SYM_CODE_END(ftrace_stub_direct_tramp) +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +#else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ /* * Gcc with -pg will put the following code in the beginning of each function: @@ -209,53 +252,15 @@ ENDPROC(ftrace_graph_caller) add \reg, \reg, #8 .endm -#ifndef CONFIG_DYNAMIC_FTRACE -/* - * void _mcount(unsigned long return_address) - * @return_address: return address to instrumented function - * - * This function makes calls, if enabled, to: - * - tracer function to probe instrumented function's entry, - * - ftrace_graph_caller to set up an exit hook - */ -ENTRY(_mcount) - mcount_enter - - ldr_l x2, ftrace_trace_function - adr x0, ftrace_stub - cmp x0, x2 // if (ftrace_trace_function - b.eq skip_ftrace_call // != ftrace_stub) { - - mcount_get_pc x0 // function's pc - mcount_get_lr x1 // function's lr (= parent's pc) - blr x2 // (*ftrace_trace_function)(pc, lr); - -skip_ftrace_call: // } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - ldr_l x2, ftrace_graph_return - cmp x0, x2 // if ((ftrace_graph_return - b.ne ftrace_graph_caller // != ftrace_stub) - - ldr_l x2, ftrace_graph_entry // || (ftrace_graph_entry - adr_l x0, ftrace_graph_entry_stub // != ftrace_graph_entry_stub)) - cmp x0, x2 - b.ne ftrace_graph_caller // ftrace_graph_caller(); -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - mcount_exit -ENDPROC(_mcount) -EXPORT_SYMBOL(_mcount) -NOKPROBE(_mcount) - -#else /* CONFIG_DYNAMIC_FTRACE */ /* * _mcount() is used to build the kernel with -pg option, but all the branch * instructions to _mcount() are replaced to NOP initially at kernel start up, * and later on, NOP to branch to ftrace_caller() when enabled or branch to * NOP when disabled per-function base. */ -ENTRY(_mcount) +SYM_FUNC_START(_mcount) ret -ENDPROC(_mcount) +SYM_FUNC_END(_mcount) EXPORT_SYMBOL(_mcount) NOKPROBE(_mcount) @@ -268,25 +273,24 @@ NOKPROBE(_mcount) * - tracer function to probe instrumented function's entry, * - ftrace_graph_caller to set up an exit hook */ -ENTRY(ftrace_caller) +SYM_FUNC_START(ftrace_caller) mcount_enter mcount_get_pc0 x0 // function's pc mcount_get_lr x1 // function's lr -GLOBAL(ftrace_call) // tracer(pc, lr); +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) // tracer(pc, lr); nop // This will be replaced with "bl xxx" // where xxx can be any kind of tracer. #ifdef CONFIG_FUNCTION_GRAPH_TRACER -GLOBAL(ftrace_graph_call) // ftrace_graph_caller(); +SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) // ftrace_graph_caller(); nop // If enabled, this will be replaced // "b ftrace_graph_caller" #endif mcount_exit -ENDPROC(ftrace_caller) -#endif /* CONFIG_DYNAMIC_FTRACE */ +SYM_FUNC_END(ftrace_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* @@ -298,47 +302,52 @@ ENDPROC(ftrace_caller) * the call stack in order to intercept instrumented function's return path * and run return_to_handler() later on its exit. */ -ENTRY(ftrace_graph_caller) +SYM_FUNC_START(ftrace_graph_caller) mcount_get_pc x0 // function's pc mcount_get_lr_addr x1 // pointer to function's saved lr mcount_get_parent_fp x2 // parent's fp bl prepare_ftrace_return // prepare_ftrace_return(pc, &lr, fp) mcount_exit -ENDPROC(ftrace_graph_caller) +SYM_FUNC_END(ftrace_graph_caller) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ -ENTRY(ftrace_stub) +SYM_TYPED_FUNC_START(ftrace_stub) ret -ENDPROC(ftrace_stub) +SYM_FUNC_END(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER +SYM_TYPED_FUNC_START(ftrace_stub_graph) + ret +SYM_FUNC_END(ftrace_stub_graph) + /* * void return_to_handler(void) * * Run ftrace_return_to_handler() before going back to parent. * @fp is checked against the value passed by ftrace_graph_caller(). */ -ENTRY(return_to_handler) +SYM_CODE_START(return_to_handler) /* save return value regs */ - sub sp, sp, #64 - stp x0, x1, [sp] - stp x2, x3, [sp, #16] - stp x4, x5, [sp, #32] - stp x6, x7, [sp, #48] + sub sp, sp, #FGRET_REGS_SIZE + stp x0, x1, [sp, #FGRET_REGS_X0] + stp x2, x3, [sp, #FGRET_REGS_X2] + stp x4, x5, [sp, #FGRET_REGS_X4] + stp x6, x7, [sp, #FGRET_REGS_X6] + str x29, [sp, #FGRET_REGS_FP] // parent's fp - mov x0, x29 // parent's fp - bl ftrace_return_to_handler// addr = ftrace_return_to_hander(fp); - mov x30, x0 // restore the original return address + mov x0, sp + bl ftrace_return_to_handler // addr = ftrace_return_to_hander(regs); + mov x30, x0 // restore the original return address /* restore return value regs */ - ldp x0, x1, [sp] - ldp x2, x3, [sp, #16] - ldp x4, x5, [sp, #32] - ldp x6, x7, [sp, #48] - add sp, sp, #64 + ldp x0, x1, [sp, #FGRET_REGS_X0] + ldp x2, x3, [sp, #FGRET_REGS_X2] + ldp x4, x5, [sp, #FGRET_REGS_X4] + ldp x6, x7, [sp, #FGRET_REGS_X6] + add sp, sp, #FGRET_REGS_SIZE ret -END(return_to_handler) +SYM_CODE_END(return_to_handler) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7c6a0a41676f..7ef0e127b149 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -14,6 +14,8 @@ #include <asm/alternative.h> #include <asm/assembler.h> #include <asm/asm-offsets.h> +#include <asm/asm_pointer_auth.h> +#include <asm/bug.h> #include <asm/cpufeature.h> #include <asm/errno.h> #include <asm/esr.h> @@ -22,57 +24,36 @@ #include <asm/mmu.h> #include <asm/processor.h> #include <asm/ptrace.h> +#include <asm/scs.h> #include <asm/thread_info.h> #include <asm/asm-uaccess.h> #include <asm/unistd.h> -/* - * Context tracking subsystem. Used to instrument transitions - * between user and kernel mode. - */ - .macro ct_user_exit_irqoff -#ifdef CONFIG_CONTEXT_TRACKING - bl enter_from_user_mode -#endif - .endm - - .macro ct_user_enter -#ifdef CONFIG_CONTEXT_TRACKING - bl context_tracking_user_enter -#endif - .endm - .macro clear_gp_regs .irp n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 mov x\n, xzr .endr .endm -/* - * Bad Abort numbers - *----------------- - */ -#define BAD_SYNC 0 -#define BAD_IRQ 1 -#define BAD_FIQ 2 -#define BAD_ERROR 3 - - .macro kernel_ventry, el, label, regsize = 64 + .macro kernel_ventry, el:req, ht:req, regsize:req, label:req .align 7 -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -alternative_if ARM64_UNMAP_KERNEL_AT_EL0 +.Lventry_start\@: .if \el == 0 + /* + * This must be the first instruction of the EL0 vector entries. It is + * skipped by the trampoline vectors, to trigger the cleanup. + */ + b .Lskip_tramp_vectors_cleanup\@ .if \regsize == 64 mrs x30, tpidrro_el0 msr tpidrro_el0, xzr .else mov x30, xzr .endif +.Lskip_tramp_vectors_cleanup\@: .endif -alternative_else_nop_endif -#endif - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE #ifdef CONFIG_VMAP_STACK /* * Test whether the SP has overflowed, without corrupting a GPR. @@ -84,7 +65,7 @@ alternative_else_nop_endif tbnz x0, #THREAD_SHIFT, 0f sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp - b el\()\el\()_\label + b el\el\ht\()_\regsize\()_\label 0: /* @@ -93,7 +74,7 @@ alternative_else_nop_endif * userspace, and can clobber EL0 registers to free up GPRs. */ - /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + /* Stash the original SP (minus PT_REGS_SIZE) in tpidr_el0. */ msr tpidr_el0, x0 /* Recover the original x0 value and stash it in tpidrro_el0 */ @@ -116,20 +97,24 @@ alternative_else_nop_endif sub sp, sp, x0 mrs x0, tpidrro_el0 #endif - b el\()\el\()_\label + b el\el\ht\()_\regsize\()_\label +.org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm - .macro tramp_alias, dst, sym - mov_q \dst, TRAMP_VALIAS - add \dst, \dst, #(\sym - .entry.tramp.text) + .macro tramp_alias, dst, sym + .set .Lalias\@, TRAMP_VALIAS + \sym - .entry.tramp.text + movz \dst, :abs_g2_s:.Lalias\@ + movk \dst, :abs_g1_nc:.Lalias\@ + movk \dst, :abs_g0_nc:.Lalias\@ .endm - // This macro corrupts x0-x3. It is the caller's duty - // to save/restore them if required. + /* + * This macro corrupts x0-x3. It is the caller's duty to save/restore + * them if required. + */ .macro apply_ssbd, state, tmp1, tmp2 -#ifdef CONFIG_ARM64_SSBD -alternative_cb arm64_enable_wa2_handling - b .L__asm_ssbd_skip\@ +alternative_cb ARM64_ALWAYS_SYSTEM, spectre_v4_patch_fw_mitigation_enable + b .L__asm_ssbd_skip\@ // Patched to NOP alternative_cb_end ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1 cbz \tmp2, .L__asm_ssbd_skip\@ @@ -137,14 +122,83 @@ alternative_cb_end tbnz \tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@ mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2 mov w1, #\state -alternative_cb arm64_update_smccc_conduit +alternative_cb ARM64_ALWAYS_SYSTEM, smccc_patch_fw_mitigation_conduit nop // Patched to SMC/HVC #0 alternative_cb_end .L__asm_ssbd_skip\@: + .endm + + /* Check for MTE asynchronous tag check faults */ + .macro check_mte_async_tcf, tmp, ti_flags, thread_sctlr +#ifdef CONFIG_ARM64_MTE + .arch_extension lse +alternative_if_not ARM64_MTE + b 1f +alternative_else_nop_endif + /* + * Asynchronous tag check faults are only possible in ASYNC (2) or + * ASYM (3) modes. In each of these modes bit 1 of SCTLR_EL1.TCF0 is + * set, so skip the check if it is unset. + */ + tbz \thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f + mrs_s \tmp, SYS_TFSRE0_EL1 + tbz \tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f + /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */ + mov \tmp, #_TIF_MTE_ASYNC_FAULT + add \ti_flags, tsk, #TSK_TI_FLAGS + stset \tmp, [\ti_flags] +1: +#endif + .endm + + /* Clear the MTE asynchronous tag check faults */ + .macro clear_mte_async_tcf thread_sctlr +#ifdef CONFIG_ARM64_MTE +alternative_if ARM64_MTE + /* See comment in check_mte_async_tcf above. */ + tbz \thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f + dsb ish + msr_s SYS_TFSRE0_EL1, xzr +1: +alternative_else_nop_endif +#endif + .endm + + .macro mte_set_gcr, mte_ctrl, tmp +#ifdef CONFIG_ARM64_MTE + ubfx \tmp, \mte_ctrl, #MTE_CTRL_GCR_USER_EXCL_SHIFT, #16 + orr \tmp, \tmp, #SYS_GCR_EL1_RRND + msr_s SYS_GCR_EL1, \tmp +#endif + .endm + + .macro mte_set_kernel_gcr, tmp, tmp2 +#ifdef CONFIG_KASAN_HW_TAGS +alternative_cb ARM64_ALWAYS_SYSTEM, kasan_hw_tags_enable + b 1f +alternative_cb_end + mov \tmp, KERNEL_GCR_EL1 + msr_s SYS_GCR_EL1, \tmp +1: +#endif + .endm + + .macro mte_set_user_gcr, tsk, tmp, tmp2 +#ifdef CONFIG_KASAN_HW_TAGS +alternative_cb ARM64_ALWAYS_SYSTEM, kasan_hw_tags_enable + b 1f +alternative_cb_end + ldr \tmp, [\tsk, #THREAD_MTE_CTRL] + + mte_set_gcr \tmp, \tmp2 +1: #endif .endm .macro kernel_entry, el, regsize = 64 + .if \el == 0 + alternative_insn nop, SET_PSTATE_DIT(1), ARM64_HAS_DIT + .endif .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 .endif @@ -167,30 +221,72 @@ alternative_cb_end .if \el == 0 clear_gp_regs mrs x21, sp_el0 - ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear, - ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug - disable_step_tsk x19, x20 // exceptions when scheduling. + ldr_this_cpu tsk, __entry_task, x20 + msr sp_el0, tsk + + /* + * Ensure MDSCR_EL1.SS is clear, since we can unmask debug exceptions + * when scheduling. + */ + ldr x19, [tsk, #TSK_TI_FLAGS] + disable_step_tsk x19, x20 + + /* Check for asynchronous tag check faults in user space */ + ldr x0, [tsk, THREAD_SCTLR_USER] + check_mte_async_tcf x22, x23, x0 + +#ifdef CONFIG_ARM64_PTR_AUTH +alternative_if ARM64_HAS_ADDRESS_AUTH + /* + * Enable IA for in-kernel PAC if the task had it disabled. Although + * this could be implemented with an unconditional MRS which would avoid + * a load, this was measured to be slower on Cortex-A75 and Cortex-A76. + * + * Install the kernel IA key only if IA was enabled in the task. If IA + * was disabled on kernel exit then we would have left the kernel IA + * installed so there is no need to install it again. + */ + tbz x0, SCTLR_ELx_ENIA_SHIFT, 1f + __ptrauth_keys_install_kernel_nosync tsk, x20, x22, x23 + b 2f +1: + mrs x0, sctlr_el1 + orr x0, x0, SCTLR_ELx_ENIA + msr sctlr_el1, x0 +2: +alternative_else_nop_endif +#endif apply_ssbd 1, x22, x23 + mte_set_kernel_gcr x22, x23 + + /* + * Any non-self-synchronizing system register updates required for + * kernel entry should be placed before this point. + */ +alternative_if ARM64_MTE + isb + b 1f +alternative_else_nop_endif +alternative_if ARM64_HAS_ADDRESS_AUTH + isb +alternative_else_nop_endif +1: + + scs_load_current .else - add x21, sp, #S_FRAME_SIZE + add x21, sp, #PT_REGS_SIZE get_current_task tsk - /* Save the task's original addr_limit and set USER_DS */ - ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] - str x20, [sp, #S_ORIG_ADDR_LIMIT] - mov x20, #USER_DS - str x20, [tsk, #TSK_TI_ADDR_LIMIT] - /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] /* - * In order to be able to dump the contents of struct pt_regs at the - * time the exception was taken (in case we attempt to walk the call - * stack later), chain it together with the stack frames. + * For exceptions from EL0, create a final frame record. + * For exceptions from EL1, create a synthetic frame record so the + * interrupted code shows up in the backtrace. */ .if \el == 0 stp xzr, xzr, [sp, #S_STACKFRAME] @@ -200,28 +296,9 @@ alternative_cb_end add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Set the TTBR0 PAN bit in SPSR. When the exception is taken from - * EL0, there is no need to check the state of TTBR0_EL1 since - * accesses are always enabled. - * Note that the meaning of this bit differs from the ARMv8.1 PAN - * feature as all TTBR0_EL1 accesses are disabled, not just those to - * user mappings. - */ -alternative_if ARM64_HAS_PAN - b 1f // skip TTBR0 PAN +alternative_if_not ARM64_HAS_PAN + bl __swpan_entry_el\el alternative_else_nop_endif - - .if \el != 0 - mrs x21, ttbr0_el1 - tst x21, #TTBR_ASID_MASK // Check for the reserved ASID - orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR - b.eq 1f // TTBR0 access already disabled - and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR - .endif - - __uaccess_ttbr0_disable x21 -1: #endif stp x22, x23, [sp, #S_PC] @@ -232,18 +309,18 @@ alternative_else_nop_endif str w21, [sp, #S_SYSCALLNO] .endif - /* - * Set sp_el0 to current thread_info. - */ - .if \el == 0 - msr sp_el0, tsk - .endif +#ifdef CONFIG_ARM64_PSEUDO_NMI +alternative_if_not ARM64_HAS_GIC_PRIO_MASKING + b .Lskip_pmr_save\@ +alternative_else_nop_endif - /* Save pmr */ -alternative_if ARM64_HAS_IRQ_PRIO_MASKING mrs_s x20, SYS_ICC_PMR_EL1 str x20, [sp, #S_PMR_SAVE] -alternative_else_nop_endif + mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET + msr_s SYS_ICC_PMR_EL1, x20 + +.Lskip_pmr_save\@: +#endif /* * Registers that may be useful after this macro is invoked: @@ -258,58 +335,30 @@ alternative_else_nop_endif .macro kernel_exit, el .if \el != 0 disable_daif - - /* Restore the task's original addr_limit. */ - ldr x20, [sp, #S_ORIG_ADDR_LIMIT] - str x20, [tsk, #TSK_TI_ADDR_LIMIT] - - /* No need to restore UAO, it will be restored from SPSR_EL1 */ .endif - /* Restore pmr */ -alternative_if ARM64_HAS_IRQ_PRIO_MASKING +#ifdef CONFIG_ARM64_PSEUDO_NMI +alternative_if_not ARM64_HAS_GIC_PRIO_MASKING + b .Lskip_pmr_restore\@ +alternative_else_nop_endif + ldr x20, [sp, #S_PMR_SAVE] msr_s SYS_ICC_PMR_EL1, x20 - mrs_s x21, SYS_ICC_CTLR_EL1 - tbz x21, #6, .L__skip_pmr_sync\@ // Check for ICC_CTLR_EL1.PMHE - dsb sy // Ensure priority change is seen by redistributor -.L__skip_pmr_sync\@: + + /* Ensure priority change is seen by redistributor */ +alternative_if_not ARM64_HAS_GIC_PRIO_RELAXED_SYNC + dsb sy alternative_else_nop_endif +.Lskip_pmr_restore\@: +#endif + ldp x21, x22, [sp, #S_PC] // load ELR, SPSR - .if \el == 0 - ct_user_enter - .endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR - * PAN bit checking. - */ -alternative_if ARM64_HAS_PAN - b 2f // skip TTBR0 PAN +alternative_if_not ARM64_HAS_PAN + bl __swpan_exit_el\el alternative_else_nop_endif - - .if \el != 0 - tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set - .endif - - __uaccess_ttbr0_enable x0, x1 - - .if \el == 0 - /* - * Enable errata workarounds only if returning to user. The only - * workaround currently required for TTBR0_EL1 changes are for the - * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache - * corruption). - */ - bl post_ttbr_update_workaround - .endif -1: - .if \el != 0 - and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit - .endif -2: #endif .if \el == 0 @@ -329,21 +378,34 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif 3: -#ifdef CONFIG_ARM64_ERRATUM_1418040 -alternative_if_not ARM64_WORKAROUND_1418040 - b 4f -alternative_else_nop_endif + scs_save tsk + + /* Ignore asynchronous tag check faults in the uaccess routines */ + ldr x0, [tsk, THREAD_SCTLR_USER] + clear_mte_async_tcf x0 + +#ifdef CONFIG_ARM64_PTR_AUTH +alternative_if ARM64_HAS_ADDRESS_AUTH /* - * if (x22.mode32 == cntkctl_el1.el0vcten) - * cntkctl_el1.el0vcten = ~cntkctl_el1.el0vcten + * IA was enabled for in-kernel PAC. Disable it now if needed, or + * alternatively install the user's IA. All other per-task keys and + * SCTLR bits were updated on task switch. + * + * No kernel C function calls after this. */ - mrs x1, cntkctl_el1 - eon x0, x1, x22, lsr #3 - tbz x0, #1, 4f - eor x1, x1, #2 // ARCH_TIMER_USR_VCT_ACCESS_EN - msr cntkctl_el1, x1 -4: + tbz x0, SCTLR_ELx_ENIA_SHIFT, 1f + __ptrauth_keys_install_user tsk, x0, x1, x2 + b 2f +1: + mrs x0, sctlr_el1 + bic x0, x0, SCTLR_ELx_ENIA + msr sctlr_el1, x0 +2: +alternative_else_nop_endif #endif + + mte_set_user_gcr tsk, x0, x1 + apply_ssbd 0, x0, x1 .endif @@ -364,101 +426,87 @@ alternative_else_nop_endif ldp x24, x25, [sp, #16 * 12] ldp x26, x27, [sp, #16 * 13] ldp x28, x29, [sp, #16 * 14] - ldr lr, [sp, #S_LR] - add sp, sp, #S_FRAME_SIZE // restore sp .if \el == 0 -alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - bne 5f - msr far_el1, x30 - tramp_alias x30, tramp_exit_native - br x30 -5: - tramp_alias x30, tramp_exit_compat - br x30 + alternative_insn "b .L_skip_tramp_exit_\@", nop, ARM64_UNMAP_KERNEL_AT_EL0 + + msr far_el1, x29 + + ldr_this_cpu x30, this_cpu_vector, x29 + tramp_alias x29, tramp_exit + msr vbar_el1, x30 // install vector table + ldr lr, [sp, #S_LR] // restore x30 + add sp, sp, #PT_REGS_SIZE // restore sp + br x29 + +.L_skip_tramp_exit_\@: #endif + .endif + + ldr lr, [sp, #S_LR] + add sp, sp, #PT_REGS_SIZE // restore sp + + .if \el == 0 + /* This must be after the last explicit memory access */ +alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD + tlbi vale1, xzr + dsb nsh +alternative_else_nop_endif .else - eret + /* Ensure any device/NC reads complete */ + alternative_insn nop, "dmb sy", ARM64_WORKAROUND_1508412 .endif + + eret sb .endm - .macro irq_stack_entry - mov x19, sp // preserve the original sp - +#ifdef CONFIG_ARM64_SW_TTBR0_PAN /* - * Compare sp with the base of the task stack. - * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack, - * and should switch to the irq stack. + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. */ - ldr x25, [tsk, TSK_STACK] - eor x25, x25, x19 - and x25, x25, #~(THREAD_SIZE - 1) - cbnz x25, 9998f - - ldr_this_cpu x25, irq_stack_ptr, x26 - mov x26, #IRQ_STACK_SIZE - add x26, x25, x26 - - /* switch to the irq stack */ - mov sp, x26 -9998: - .endm +SYM_CODE_START_LOCAL(__swpan_entry_el1) + mrs x21, ttbr0_el1 + tst x21, #TTBR_ASID_MASK // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR +SYM_INNER_LABEL(__swpan_entry_el0, SYM_L_LOCAL) + __uaccess_ttbr0_disable x21 +1: ret +SYM_CODE_END(__swpan_entry_el1) /* - * x19 should be preserved between irq_stack_entry and - * irq_stack_exit. + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. */ - .macro irq_stack_exit - mov sp, x19 - .endm - -/* GPRs used by entry code */ -tsk .req x28 // current thread_info - -/* - * Interrupt handling. - */ - .macro irq_handler - ldr_l x1, handle_arch_irq - mov x0, sp - irq_stack_entry - blr x1 - irq_stack_exit - .endm +SYM_CODE_START_LOCAL(__swpan_exit_el1) + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + __uaccess_ttbr0_enable x0, x1 +1: and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + ret +SYM_CODE_END(__swpan_exit_el1) -#ifdef CONFIG_ARM64_PSEUDO_NMI +SYM_CODE_START_LOCAL(__swpan_exit_el0) + __uaccess_ttbr0_enable x0, x1 /* - * Set res to 0 if irqs were unmasked in interrupted context. - * Otherwise set res to non-0 value. + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). */ - .macro test_irqs_unmasked res:req, pmr:req -alternative_if ARM64_HAS_IRQ_PRIO_MASKING - sub \res, \pmr, #GIC_PRIO_IRQON -alternative_else - mov \res, xzr -alternative_endif - .endm -#endif - - .macro gic_prio_kentry_setup, tmp:req -#ifdef CONFIG_ARM64_PSEUDO_NMI - alternative_if ARM64_HAS_IRQ_PRIO_MASKING - mov \tmp, #(GIC_PRIO_PSR_I_SET | GIC_PRIO_IRQON) - msr_s SYS_ICC_PMR_EL1, \tmp - alternative_else_nop_endif + b post_ttbr_update_workaround +SYM_CODE_END(__swpan_exit_el0) #endif - .endm - .macro gic_prio_irq_setup, pmr:req, tmp:req -#ifdef CONFIG_ARM64_PSEUDO_NMI - alternative_if ARM64_HAS_IRQ_PRIO_MASKING - orr \tmp, \pmr, #GIC_PRIO_PSR_I_SET - msr_s SYS_ICC_PMR_EL1, \tmp - alternative_else_nop_endif -#endif - .endm +/* GPRs used by entry code */ +tsk .req x28 // current thread_info .text @@ -468,53 +516,47 @@ alternative_endif .pushsection ".entry.text", "ax" .align 11 -ENTRY(vectors) - kernel_ventry 1, sync_invalid // Synchronous EL1t - kernel_ventry 1, irq_invalid // IRQ EL1t - kernel_ventry 1, fiq_invalid // FIQ EL1t - kernel_ventry 1, error_invalid // Error EL1t - - kernel_ventry 1, sync // Synchronous EL1h - kernel_ventry 1, irq // IRQ EL1h - kernel_ventry 1, fiq_invalid // FIQ EL1h - kernel_ventry 1, error // Error EL1h - - kernel_ventry 0, sync // Synchronous 64-bit EL0 - kernel_ventry 0, irq // IRQ 64-bit EL0 - kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0 - kernel_ventry 0, error // Error 64-bit EL0 - -#ifdef CONFIG_COMPAT - kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0 - kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0 - kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0 - kernel_ventry 0, error_compat, 32 // Error 32-bit EL0 -#else - kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0 - kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0 - kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0 - kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0 -#endif -END(vectors) +SYM_CODE_START(vectors) + kernel_ventry 1, t, 64, sync // Synchronous EL1t + kernel_ventry 1, t, 64, irq // IRQ EL1t + kernel_ventry 1, t, 64, fiq // FIQ EL1t + kernel_ventry 1, t, 64, error // Error EL1t + + kernel_ventry 1, h, 64, sync // Synchronous EL1h + kernel_ventry 1, h, 64, irq // IRQ EL1h + kernel_ventry 1, h, 64, fiq // FIQ EL1h + kernel_ventry 1, h, 64, error // Error EL1h + + kernel_ventry 0, t, 64, sync // Synchronous 64-bit EL0 + kernel_ventry 0, t, 64, irq // IRQ 64-bit EL0 + kernel_ventry 0, t, 64, fiq // FIQ 64-bit EL0 + kernel_ventry 0, t, 64, error // Error 64-bit EL0 + + kernel_ventry 0, t, 32, sync // Synchronous 32-bit EL0 + kernel_ventry 0, t, 32, irq // IRQ 32-bit EL0 + kernel_ventry 0, t, 32, fiq // FIQ 32-bit EL0 + kernel_ventry 0, t, 32, error // Error 32-bit EL0 +SYM_CODE_END(vectors) #ifdef CONFIG_VMAP_STACK +SYM_CODE_START_LOCAL(__bad_stack) /* * We detected an overflow in kernel_ventry, which switched to the * overflow stack. Stash the exception regs, and head to our overflow * handler. */ -__bad_stack: + /* Restore the original x0 value */ mrs x0, tpidrro_el0 /* * Store the original GPRs to the new stack. The orginal SP (minus - * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry. + * PT_REGS_SIZE) was stashed in tpidr_el0 by kernel_ventry. */ - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE kernel_entry 1 mrs x0, tpidr_el0 - add x0, x0, #S_FRAME_SIZE + add x0, x0, #PT_REGS_SIZE str x0, [sp, #S_SP] /* Stash the regs for handle_bad_stack */ @@ -523,244 +565,65 @@ __bad_stack: /* Time to die */ bl handle_bad_stack ASM_BUG() +SYM_CODE_END(__bad_stack) #endif /* CONFIG_VMAP_STACK */ -/* - * Invalid mode handlers - */ - .macro inv_entry, el, reason, regsize = 64 + + .macro entry_handler el:req, ht:req, regsize:req, label:req +SYM_CODE_START_LOCAL(el\el\ht\()_\regsize\()_\label) kernel_entry \el, \regsize mov x0, sp - mov x1, #\reason - mrs x2, esr_el1 - bl bad_mode - ASM_BUG() + bl el\el\ht\()_\regsize\()_\label\()_handler + .if \el == 0 + b ret_to_user + .else + b ret_to_kernel + .endif +SYM_CODE_END(el\el\ht\()_\regsize\()_\label) .endm -el0_sync_invalid: - inv_entry 0, BAD_SYNC -ENDPROC(el0_sync_invalid) - -el0_irq_invalid: - inv_entry 0, BAD_IRQ -ENDPROC(el0_irq_invalid) - -el0_fiq_invalid: - inv_entry 0, BAD_FIQ -ENDPROC(el0_fiq_invalid) - -el0_error_invalid: - inv_entry 0, BAD_ERROR -ENDPROC(el0_error_invalid) - -#ifdef CONFIG_COMPAT -el0_fiq_invalid_compat: - inv_entry 0, BAD_FIQ, 32 -ENDPROC(el0_fiq_invalid_compat) -#endif - -el1_sync_invalid: - inv_entry 1, BAD_SYNC -ENDPROC(el1_sync_invalid) - -el1_irq_invalid: - inv_entry 1, BAD_IRQ -ENDPROC(el1_irq_invalid) - -el1_fiq_invalid: - inv_entry 1, BAD_FIQ -ENDPROC(el1_fiq_invalid) - -el1_error_invalid: - inv_entry 1, BAD_ERROR -ENDPROC(el1_error_invalid) - /* - * EL1 mode handlers. + * Early exception handlers */ - .align 6 -el1_sync: - kernel_entry 1 - mov x0, sp - bl el1_sync_handler + entry_handler 1, t, 64, sync + entry_handler 1, t, 64, irq + entry_handler 1, t, 64, fiq + entry_handler 1, t, 64, error + + entry_handler 1, h, 64, sync + entry_handler 1, h, 64, irq + entry_handler 1, h, 64, fiq + entry_handler 1, h, 64, error + + entry_handler 0, t, 64, sync + entry_handler 0, t, 64, irq + entry_handler 0, t, 64, fiq + entry_handler 0, t, 64, error + + entry_handler 0, t, 32, sync + entry_handler 0, t, 32, irq + entry_handler 0, t, 32, fiq + entry_handler 0, t, 32, error + +SYM_CODE_START_LOCAL(ret_to_kernel) kernel_exit 1 -ENDPROC(el1_sync) - - .align 6 -el1_irq: - kernel_entry 1 - gic_prio_irq_setup pmr=x20, tmp=x1 - enable_da_f - -#ifdef CONFIG_ARM64_PSEUDO_NMI - test_irqs_unmasked res=x0, pmr=x20 - cbz x0, 1f - bl asm_nmi_enter -1: -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif +SYM_CODE_END(ret_to_kernel) - irq_handler - -#ifdef CONFIG_PREEMPT - ldr x24, [tsk, #TSK_TI_PREEMPT] // get preempt count -alternative_if ARM64_HAS_IRQ_PRIO_MASKING - /* - * DA_F were cleared at start of handling. If anything is set in DAIF, - * we come back from an NMI, so skip preemption - */ - mrs x0, daif - orr x24, x24, x0 -alternative_else_nop_endif - cbnz x24, 1f // preempt count != 0 || NMI return path - bl arm64_preempt_schedule_irq // irq en/disable is done inside -1: -#endif - -#ifdef CONFIG_ARM64_PSEUDO_NMI - /* - * When using IRQ priority masking, we can get spurious interrupts while - * PMR is set to GIC_PRIO_IRQOFF. An NMI might also have occurred in a - * section with interrupts disabled. Skip tracing in those cases. - */ - test_irqs_unmasked res=x0, pmr=x20 - cbz x0, 1f - bl asm_nmi_exit -1: -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS -#ifdef CONFIG_ARM64_PSEUDO_NMI - test_irqs_unmasked res=x0, pmr=x20 - cbnz x0, 1f -#endif - bl trace_hardirqs_on -1: -#endif - - kernel_exit 1 -ENDPROC(el1_irq) - -/* - * EL0 mode handlers. - */ - .align 6 -el0_sync: - kernel_entry 0 - mov x0, sp - bl el0_sync_handler - b ret_to_user - -#ifdef CONFIG_COMPAT - .align 6 -el0_sync_compat: - kernel_entry 0, 32 - mov x0, sp - bl el0_sync_compat_handler - b ret_to_user -ENDPROC(el0_sync) - - .align 6 -el0_irq_compat: - kernel_entry 0, 32 - b el0_irq_naked - -el0_error_compat: - kernel_entry 0, 32 - b el0_error_naked -#endif - - .align 6 -el0_irq: - kernel_entry 0 -el0_irq_naked: - gic_prio_irq_setup pmr=x20, tmp=x0 - ct_user_exit_irqoff - enable_da_f - -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif - -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR - tbz x22, #55, 1f - bl do_el0_irq_bp_hardening -1: -#endif - irq_handler - -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on -#endif - b ret_to_user -ENDPROC(el0_irq) - -el1_error: - kernel_entry 1 - mrs x1, esr_el1 - gic_prio_kentry_setup tmp=x2 - enable_dbg - mov x0, sp - bl do_serror - kernel_exit 1 -ENDPROC(el1_error) - -el0_error: - kernel_entry 0 -el0_error_naked: - mrs x25, esr_el1 - gic_prio_kentry_setup tmp=x2 - ct_user_exit_irqoff - enable_dbg - mov x0, sp - mov x1, x25 - bl do_serror - enable_da_f - b ret_to_user -ENDPROC(el0_error) - -/* - * Ok, we need to do extra processing, enter the slow path. - */ -work_pending: - mov x0, sp // 'regs' - bl do_notify_resume -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on // enabled while in userspace -#endif - ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step - b finish_ret_to_user -/* - * "slow" syscall return path. - */ -ret_to_user: - disable_daif - gic_prio_kentry_setup tmp=x3 - ldr x1, [tsk, #TSK_TI_FLAGS] - and x2, x1, #_TIF_WORK_MASK - cbnz x2, work_pending -finish_ret_to_user: - enable_step_tsk x1, x2 +SYM_CODE_START_LOCAL(ret_to_user) + ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step + enable_step_tsk x19, x2 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK - bl stackleak_erase + bl stackleak_erase_on_task_stack #endif kernel_exit 0 -ENDPROC(ret_to_user) +SYM_CODE_END(ret_to_user) .popsection // .entry.text -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -/* - * Exception vectors trampoline. - */ - .pushsection ".entry.tramp.text", "ax" - + // Move from tramp_pg_dir to swapper_pg_dir .macro tramp_map_kernel, tmp mrs \tmp, ttbr1_el1 - add \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) + add \tmp, \tmp, #TRAMP_SWAPPER_OFFSET bic \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 @@ -777,9 +640,10 @@ alternative_else_nop_endif #endif /* CONFIG_QCOM_FALKOR_ERRATUM_1003 */ .endm + // Move from swapper_pg_dir to tramp_pg_dir .macro tramp_unmap_kernel, tmp mrs \tmp, ttbr1_el1 - sub \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) + sub \tmp, \tmp, #TRAMP_SWAPPER_OFFSET orr \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp /* @@ -789,12 +653,57 @@ alternative_else_nop_endif */ .endm - .macro tramp_ventry, regsize = 64 + .macro tramp_data_read_var dst, var +#ifdef CONFIG_RELOCATABLE + ldr \dst, .L__tramp_data_\var + .ifndef .L__tramp_data_\var + .pushsection ".entry.tramp.rodata", "a", %progbits + .align 3 +.L__tramp_data_\var: + .quad \var + .popsection + .endif +#else + /* + * As !RELOCATABLE implies !RANDOMIZE_BASE the address is always a + * compile time constant (and hence not secret and not worth hiding). + * + * As statically allocated kernel code and data always live in the top + * 47 bits of the address space we can sign-extend bit 47 and avoid an + * instruction to load the upper 16 bits (which must be 0xFFFF). + */ + movz \dst, :abs_g2_s:\var + movk \dst, :abs_g1_nc:\var + movk \dst, :abs_g0_nc:\var +#endif + .endm + +#define BHB_MITIGATION_NONE 0 +#define BHB_MITIGATION_LOOP 1 +#define BHB_MITIGATION_FW 2 +#define BHB_MITIGATION_INSN 3 + + .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + + .if \bhb == BHB_MITIGATION_LOOP + /* + * This sequence must appear before the first indirect branch. i.e. the + * ret out of tramp_ventry. It appears here because x30 is free. + */ + __mitigate_spectre_bhb_loop x30 + .endif // \bhb == BHB_MITIGATION_LOOP + + .if \bhb == BHB_MITIGATION_INSN + clearbhb + isb + .endif // \bhb == BHB_MITIGATION_INSN + + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy * entry onto the return stack and using a RET instruction to @@ -804,67 +713,106 @@ alternative_else_nop_endif b . 2: tramp_map_kernel x30 -#ifdef CONFIG_RANDOMIZE_BASE - adr x30, tramp_vectors + PAGE_SIZE alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 - ldr x30, [x30] -#else - ldr x30, =vectors -#endif + tramp_data_read_var x30, vectors alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM - prfm plil1strm, [x30, #(1b - tramp_vectors)] + prfm plil1strm, [x30, #(1b - \vector_start)] alternative_else_nop_endif + msr vbar_el1, x30 - add x30, x30, #(1b - tramp_vectors) isb - ret - .endm + .else + adr_l x30, vectors + .endif // \kpti == 1 - .macro tramp_exit, regsize = 64 - adr x30, tramp_vectors - msr vbar_el1, x30 - tramp_unmap_kernel x30 - .if \regsize == 64 - mrs x30, far_el1 - .endif - eret - sb + .if \bhb == BHB_MITIGATION_FW + /* + * The firmware sequence must appear before the first indirect branch. + * i.e. the ret out of tramp_ventry. But it also needs the stack to be + * mapped to save/restore the registers the SMC clobbers. + */ + __mitigate_spectre_bhb_fw + .endif // \bhb == BHB_MITIGATION_FW + + add x30, x30, #(1b - \vector_start + 4) + ret +.org 1b + 128 // Did we overflow the ventry slot? .endm - .align 11 -ENTRY(tramp_vectors) + .macro generate_tramp_vector, kpti, bhb +.Lvector_start\@: .space 0x400 - tramp_ventry - tramp_ventry - tramp_ventry - tramp_ventry + .rept 4 + tramp_ventry .Lvector_start\@, 64, \kpti, \bhb + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, \kpti, \bhb + .endr + .endm - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 -END(tramp_vectors) +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + * The order must match __bp_harden_el1_vectors and the + * arm64_bp_harden_el1_vectors enum. + */ + .pushsection ".entry.tramp.text", "ax" + .align 11 +SYM_CODE_START_LOCAL_NOALIGN(tramp_vectors) +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_INSN +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE +SYM_CODE_END(tramp_vectors) + +SYM_CODE_START_LOCAL(tramp_exit) + tramp_unmap_kernel x29 + mrs x29, far_el1 // restore x29 + eret + sb +SYM_CODE_END(tramp_exit) + .popsection // .entry.tramp.text +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ -ENTRY(tramp_exit_native) - tramp_exit -END(tramp_exit_native) +/* + * Exception vectors for spectre mitigations on entry from EL1 when + * kpti is not in use. + */ + .macro generate_el1_vector, bhb +.Lvector_start\@: + kernel_ventry 1, t, 64, sync // Synchronous EL1t + kernel_ventry 1, t, 64, irq // IRQ EL1t + kernel_ventry 1, t, 64, fiq // FIQ EL1h + kernel_ventry 1, t, 64, error // Error EL1t + + kernel_ventry 1, h, 64, sync // Synchronous EL1h + kernel_ventry 1, h, 64, irq // IRQ EL1h + kernel_ventry 1, h, 64, fiq // FIQ EL1h + kernel_ventry 1, h, 64, error // Error EL1h + + .rept 4 + tramp_ventry .Lvector_start\@, 64, 0, \bhb + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, 0, \bhb + .endr + .endm -ENTRY(tramp_exit_compat) - tramp_exit 32 -END(tramp_exit_compat) +/* The order must match tramp_vecs and the arm64_bp_harden_el1_vectors enum. */ + .pushsection ".entry.text", "ax" + .align 11 +SYM_CODE_START(__bp_harden_el1_vectors) +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_el1_vector bhb=BHB_MITIGATION_LOOP + generate_el1_vector bhb=BHB_MITIGATION_FW + generate_el1_vector bhb=BHB_MITIGATION_INSN +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ +SYM_CODE_END(__bp_harden_el1_vectors) + .popsection - .ltorg - .popsection // .entry.tramp.text -#ifdef CONFIG_RANDOMIZE_BASE - .pushsection ".rodata", "a" - .align PAGE_SHIFT - .globl __entry_tramp_data_start -__entry_tramp_data_start: - .quad vectors - .popsection // .rodata -#endif /* CONFIG_RANDOMIZE_BASE */ -#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* * Register switch for AArch64. The callee-saved registers need to be saved @@ -874,7 +822,7 @@ __entry_tramp_data_start: * Previous and next are guaranteed not to be the same. * */ -ENTRY(cpu_switch_to) +SYM_FUNC_START(cpu_switch_to) mov x10, #THREAD_CPU_CONTEXT add x8, x0, x10 mov x9, sp @@ -895,23 +843,62 @@ ENTRY(cpu_switch_to) ldr lr, [x8] mov sp, x9 msr sp_el0, x1 + ptrauth_keys_install_kernel x1, x8, x9, x10 + scs_save x0 + scs_load_current ret -ENDPROC(cpu_switch_to) +SYM_FUNC_END(cpu_switch_to) NOKPROBE(cpu_switch_to) /* * This is how we return from a fork. */ -ENTRY(ret_from_fork) +SYM_CODE_START(ret_from_fork) bl schedule_tail cbz x19, 1f // not a kernel thread mov x0, x20 blr x19 1: get_current_task tsk + mov x0, sp + bl asm_exit_to_user_mode b ret_to_user -ENDPROC(ret_from_fork) +SYM_CODE_END(ret_from_fork) NOKPROBE(ret_from_fork) +/* + * void call_on_irq_stack(struct pt_regs *regs, + * void (*func)(struct pt_regs *)); + * + * Calls func(regs) using this CPU's irq stack and shadow irq stack. + */ +SYM_FUNC_START(call_on_irq_stack) +#ifdef CONFIG_SHADOW_CALL_STACK + get_current_task x16 + scs_save x16 + ldr_this_cpu scs_sp, irq_shadow_call_stack_ptr, x17 +#endif + + /* Create a frame record to save our LR and SP (implicit in FP) */ + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr_this_cpu x16, irq_stack_ptr, x17 + + /* Move to the new stack and call the function there */ + add sp, x16, #IRQ_STACK_SIZE + blr x1 + + /* + * Restore the SP from the FP, and restore the FP and LR from the frame + * record. + */ + mov sp, x29 + ldp x29, x30, [sp], #16 + scs_load_current + ret +SYM_FUNC_END(call_on_irq_stack) +NOKPROBE(call_on_irq_stack) + #ifdef CONFIG_ARM_SDE_INTERFACE #include <asm/sdei.h> @@ -936,9 +923,8 @@ NOKPROBE(ret_from_fork) * This clobbers x4, __sdei_handler() will restore this from firmware's * copy. */ -.ltorg .pushsection ".entry.tramp.text", "ax" -ENTRY(__sdei_asm_entry_trampoline) +SYM_CODE_START(__sdei_asm_entry_trampoline) mrs x4, ttbr1_el1 tbz x4, #USER_ASID_BIT, 1f @@ -947,20 +933,12 @@ ENTRY(__sdei_asm_entry_trampoline) mov x4, xzr /* - * Use reg->interrupted_regs.addr_limit to remember whether to unmap - * the kernel on exit. + * Remember whether to unmap the kernel on exit. */ -1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] - -#ifdef CONFIG_RANDOMIZE_BASE - adr x4, tramp_vectors + PAGE_SIZE - add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler - ldr x4, [x4] -#else - ldr x4, =__sdei_asm_handler -#endif +1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] + tramp_data_read_var x4, __sdei_asm_handler br x4 -ENDPROC(__sdei_asm_entry_trampoline) +SYM_CODE_END(__sdei_asm_entry_trampoline) NOKPROBE(__sdei_asm_entry_trampoline) /* @@ -970,23 +948,16 @@ NOKPROBE(__sdei_asm_entry_trampoline) * x2: exit_mode * x4: struct sdei_registered_event argument from registration time. */ -ENTRY(__sdei_asm_exit_trampoline) - ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] +SYM_CODE_START(__sdei_asm_exit_trampoline) + ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] cbnz x4, 1f tramp_unmap_kernel tmp=x4 1: sdei_handler_exit exit_mode=x2 -ENDPROC(__sdei_asm_exit_trampoline) +SYM_CODE_END(__sdei_asm_exit_trampoline) NOKPROBE(__sdei_asm_exit_trampoline) - .ltorg .popsection // .entry.tramp.text -#ifdef CONFIG_RANDOMIZE_BASE -.pushsection ".rodata", "a" -__sdei_asm_trampoline_next_handler: - .quad __sdei_asm_handler -.popsection // .rodata -#endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* @@ -1002,7 +973,7 @@ __sdei_asm_trampoline_next_handler: * follow SMC-CC. We save (or retrieve) all the registers as the handler may * want them. */ -ENTRY(__sdei_asm_handler) +SYM_CODE_START(__sdei_asm_handler) stp x2, x3, [x1, #SDEI_EVENT_INTREGS + S_PC] stp x4, x5, [x1, #SDEI_EVENT_INTREGS + 16 * 2] stp x6, x7, [x1, #SDEI_EVENT_INTREGS + 16 * 3] @@ -1022,13 +993,20 @@ ENTRY(__sdei_asm_handler) mov x19, x1 + /* Store the registered-event for crash_smp_send_stop() */ + ldrb w4, [x19, #SDEI_EVENT_PRIORITY] + cbnz w4, 1f + adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6 + b 2f +1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6 +2: str x19, [x5] + #ifdef CONFIG_VMAP_STACK /* * entry.S may have been using sp as a scratch register, find whether * this is a normal or critical event and switch to the appropriate * stack for this CPU. */ - ldrb w4, [x19, #SDEI_EVENT_PRIORITY] cbnz w4, 1f ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6 b 2f @@ -1038,6 +1016,15 @@ ENTRY(__sdei_asm_handler) mov sp, x5 #endif +#ifdef CONFIG_SHADOW_CALL_STACK + /* Use a separate shadow call stack for normal and critical events */ + cbnz w4, 3f + ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal_ptr, tmp=x6 + b 4f +3: ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical_ptr, tmp=x6 +4: +#endif + /* * We may have interrupted userspace, or a guest, or exit-from or * return-to either of these. We can't trust sp_el0, restore it. @@ -1069,14 +1056,24 @@ ENTRY(__sdei_asm_handler) mov sp, x1 mov x1, x0 // address to complete_and_resume - /* x0 = (x0 <= 1) ? EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */ - cmp x0, #1 + /* x0 = (x0 <= SDEI_EV_FAILED) ? + * EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME + */ + cmp x0, #SDEI_EV_FAILED mov_q x2, SDEI_1_0_FN_SDEI_EVENT_COMPLETE mov_q x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME csel x0, x2, x3, ls ldr_l x2, sdei_exit_mode + /* Clear the registered-event seen by crash_smp_send_stop() */ + ldrb w3, [x4, #SDEI_EVENT_PRIORITY] + cbnz w3, 1f + adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6 + b 2f +1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6 +2: str xzr, [x5] + alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 sdei_handler_exit exit_mode=x2 alternative_else_nop_endif @@ -1085,6 +1082,17 @@ alternative_else_nop_endif tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline br x5 #endif -ENDPROC(__sdei_asm_handler) +SYM_CODE_END(__sdei_asm_handler) NOKPROBE(__sdei_asm_handler) + +SYM_CODE_START(__sdei_handler_abort) + mov_q x0, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME + adr x1, 1f + ldr_l x2, sdei_exit_mode + sdei_handler_exit exit_mode=x2 + // exit the handler and jump to the next instruction. + // Exit will stomp x0-x17, PSTATE, ELR_ELx, and SPSR_ELx. +1: ret +SYM_CODE_END(__sdei_handler_abort) +NOKPROBE(__sdei_handler_abort) #endif /* CONFIG_ARM_SDE_INTERFACE */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 3eb338f14386..a5dc6f764195 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -12,8 +12,10 @@ #include <linux/bug.h> #include <linux/cache.h> #include <linux/compat.h> +#include <linux/compiler.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> +#include <linux/ctype.h> #include <linux/kernel.h> #include <linux/linkage.h> #include <linux/irqflags.h> @@ -31,9 +33,11 @@ #include <linux/swab.h> #include <asm/esr.h> +#include <asm/exception.h> #include <asm/fpsimd.h> #include <asm/cpufeature.h> #include <asm/cputype.h> +#include <asm/neon.h> #include <asm/processor.h> #include <asm/simd.h> #include <asm/sigcontext.h> @@ -75,15 +79,19 @@ * indicate whether or not the userland FPSIMD state of the current task is * present in the registers. The flag is set unless the FPSIMD registers of this * CPU currently contain the most recent userland FPSIMD state of the current - * task. + * task. If the task is behaving as a VMM, then this is will be managed by + * KVM which will clear it to indicate that the vcpu FPSIMD state is currently + * loaded on the CPU, allowing the state to be saved if a FPSIMD-aware + * softirq kicks in. Upon vcpu_put(), KVM will save the vcpu FP state and + * flag the register state as invalid. * - * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may - * save the task's FPSIMD context back to task_struct from softirq context. - * To prevent this from racing with the manipulation of the task's FPSIMD state - * from task context and thereby corrupting the state, it is necessary to - * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE - * flag with {, __}get_cpu_fpsimd_context(). This will still allow softirqs to - * run but prevent them to use FPSIMD. + * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may be + * called from softirq context, which will save the task's FPSIMD context back + * to task_struct. To prevent this from racing with the manipulation of the + * task's FPSIMD state from task context and thereby corrupting the state, it + * is necessary to protect any manipulation of a task's fpsimd_state or + * TIF_FOREIGN_FPSTATE flag with get_cpu_fpsimd_context(), which will suspend + * softirq servicing entirely until put_cpu_fpsimd_context() is called. * * For a certain task, the sequence may look something like this: * - the task gets scheduled in; if both the task's fpsimd_cpu field @@ -110,72 +118,117 @@ * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so * whatever is in the FPSIMD registers is not saved to memory, but discarded. */ -struct fpsimd_last_state_struct { - struct user_fpsimd_state *st; - void *sve_state; - unsigned int sve_vl; + +static DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); + +__ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = { +#ifdef CONFIG_ARM64_SVE + [ARM64_VEC_SVE] = { + .type = ARM64_VEC_SVE, + .name = "SVE", + .min_vl = SVE_VL_MIN, + .max_vl = SVE_VL_MIN, + .max_virtualisable_vl = SVE_VL_MIN, + }, +#endif +#ifdef CONFIG_ARM64_SME + [ARM64_VEC_SME] = { + .type = ARM64_VEC_SME, + .name = "SME", + }, +#endif +}; + +static unsigned int vec_vl_inherit_flag(enum vec_type type) +{ + switch (type) { + case ARM64_VEC_SVE: + return TIF_SVE_VL_INHERIT; + case ARM64_VEC_SME: + return TIF_SME_VL_INHERIT; + default: + WARN_ON_ONCE(1); + return 0; + } +} + +struct vl_config { + int __default_vl; /* Default VL for tasks */ }; -static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); +static struct vl_config vl_config[ARM64_VEC_MAX]; -/* Default VL for tasks that don't set it explicitly: */ -static int sve_default_vl = -1; +static inline int get_default_vl(enum vec_type type) +{ + return READ_ONCE(vl_config[type].__default_vl); +} #ifdef CONFIG_ARM64_SVE -/* Maximum supported vector length across all CPUs (initially poisoned) */ -int __ro_after_init sve_max_vl = SVE_VL_MIN; -int __ro_after_init sve_max_virtualisable_vl = SVE_VL_MIN; +static inline int get_sve_default_vl(void) +{ + return get_default_vl(ARM64_VEC_SVE); +} -/* - * Set of available vector lengths, - * where length vq encoded as bit __vq_to_bit(vq): - */ -__ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); -/* Set of vector lengths present on at least one cpu: */ -static __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX); +static inline void set_default_vl(enum vec_type type, int val) +{ + WRITE_ONCE(vl_config[type].__default_vl, val); +} + +static inline void set_sve_default_vl(int val) +{ + set_default_vl(ARM64_VEC_SVE, val); +} static void __percpu *efi_sve_state; #else /* ! CONFIG_ARM64_SVE */ /* Dummy declaration for code that will be optimised out: */ -extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); -extern __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX); extern void __percpu *efi_sve_state; #endif /* ! CONFIG_ARM64_SVE */ -DEFINE_PER_CPU(bool, fpsimd_context_busy); -EXPORT_PER_CPU_SYMBOL(fpsimd_context_busy); +#ifdef CONFIG_ARM64_SME -static void __get_cpu_fpsimd_context(void) +static int get_sme_default_vl(void) { - bool busy = __this_cpu_xchg(fpsimd_context_busy, true); + return get_default_vl(ARM64_VEC_SME); +} - WARN_ON(busy); +static void set_sme_default_vl(int val) +{ + set_default_vl(ARM64_VEC_SME, val); } +static void sme_free(struct task_struct *); + +#else + +static inline void sme_free(struct task_struct *t) { } + +#endif + +static void fpsimd_bind_task_to_cpu(void); + /* * Claim ownership of the CPU FPSIMD context for use by the calling context. * * The caller may freely manipulate the FPSIMD context metadata until * put_cpu_fpsimd_context() is called. * - * The double-underscore version must only be called if you know the task - * can't be preempted. + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. */ static void get_cpu_fpsimd_context(void) { - preempt_disable(); - __get_cpu_fpsimd_context(); -} - -static void __put_cpu_fpsimd_context(void) -{ - bool busy = __this_cpu_xchg(fpsimd_context_busy, false); - - WARN_ON(!busy); /* No matching get_cpu_fpsimd_context()? */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); } /* @@ -187,55 +240,61 @@ static void __put_cpu_fpsimd_context(void) */ static void put_cpu_fpsimd_context(void) { - __put_cpu_fpsimd_context(); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } -static bool have_cpu_fpsimd_context(void) +unsigned int task_get_vl(const struct task_struct *task, enum vec_type type) { - return !preemptible() && __this_cpu_read(fpsimd_context_busy); + return task->thread.vl[type]; } -/* - * Call __sve_free() directly only if you know task can't be scheduled - * or preempted. - */ -static void __sve_free(struct task_struct *task) +void task_set_vl(struct task_struct *task, enum vec_type type, + unsigned long vl) { - kfree(task->thread.sve_state); - task->thread.sve_state = NULL; + task->thread.vl[type] = vl; } -static void sve_free(struct task_struct *task) +unsigned int task_get_vl_onexec(const struct task_struct *task, + enum vec_type type) { - WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + return task->thread.vl_onexec[type]; +} - __sve_free(task); +void task_set_vl_onexec(struct task_struct *task, enum vec_type type, + unsigned long vl) +{ + task->thread.vl_onexec[type] = vl; } /* + * TIF_SME controls whether a task can use SME without trapping while + * in userspace, when TIF_SME is set then we must have storage + * allocated in sve_state and sme_state to store the contents of both ZA + * and the SVE registers for both streaming and non-streaming modes. + * + * If both SVCR.ZA and SVCR.SM are disabled then at any point we + * may disable TIF_SME and reenable traps. + */ + + +/* * TIF_SVE controls whether a task can use SVE without trapping while - * in userspace, and also the way a task's FPSIMD/SVE state is stored - * in thread_struct. + * in userspace, and also (together with TIF_SME) the way a task's + * FPSIMD/SVE state is stored in thread_struct. * * The kernel uses this flag to track whether a user task is actively * using SVE, and therefore whether full SVE register state needs to * be tracked. If not, the cheaper FPSIMD context handling code can * be used instead of the more costly SVE equivalents. * - * * TIF_SVE set: + * * TIF_SVE or SVCR.SM set: * * The task can execute SVE instructions while in userspace without * trapping to the kernel. * - * When stored, Z0-Z31 (incorporating Vn in bits[127:0] or the - * corresponding Zn), P0-P15 and FFR are encoded in in - * task->thread.sve_state, formatted appropriately for vector - * length task->thread.sve_vl. - * - * task->thread.sve_state must point to a valid buffer at least - * sve_state_size(task) bytes in size. - * * During any syscall, the kernel may optionally clear TIF_SVE and * discard the vector state except for the FPSIMD subset. * @@ -245,7 +304,15 @@ static void sve_free(struct task_struct *task) * do_sve_acc() to be called, which does some preparation and then * sets TIF_SVE. * - * When stored, FPSIMD registers V0-V31 are encoded in + * During any syscall, the kernel may optionally clear TIF_SVE and + * discard the vector state except for the FPSIMD subset. + * + * The data will be stored in one of two formats: + * + * * FPSIMD only - FP_STATE_FPSIMD: + * + * When the FPSIMD only state stored task->thread.fp_type is set to + * FP_STATE_FPSIMD, the FPSIMD registers V0-V31 are encoded in * task->thread.uw.fpsimd_state; bits [max : 128] for each of Z0-Z31 are * logically zero but not stored anywhere; P0-P15 and FFR are not * stored and have unspecified values from userspace's point of @@ -253,7 +320,23 @@ static void sve_free(struct task_struct *task) * but userspace is discouraged from relying on this. * * task->thread.sve_state does not need to be non-NULL, valid or any - * particular size: it must not be dereferenced. + * particular size: it must not be dereferenced and any data stored + * there should be considered stale and not referenced. + * + * * SVE state - FP_STATE_SVE: + * + * When the full SVE state is stored task->thread.fp_type is set to + * FP_STATE_SVE and Z0-Z31 (incorporating Vn in bits[127:0] or the + * corresponding Zn), P0-P15 and FFR are encoded in in + * task->thread.sve_state, formatted appropriately for vector + * length task->thread.sve_vl or, if SVCR.SM is set, + * task->thread.sme_vl. The storage for the vector registers in + * task->thread.uw.fpsimd_state should be ignored. + * + * task->thread.sve_state must point to a valid buffer at least + * sve_state_size(task) bytes in size. The data stored in + * task->thread.uw.fpsimd_state.vregs should be considered stale + * and not referenced. * * * FPSR and FPCR are always stored in task->thread.uw.fpsimd_state * irrespective of whether TIF_SVE is clear or set, since these are @@ -269,45 +352,149 @@ static void sve_free(struct task_struct *task) */ static void task_fpsimd_load(void) { - WARN_ON(!have_cpu_fpsimd_context()); + bool restore_sve_regs = false; + bool restore_ffr; + + WARN_ON(!system_supports_fpsimd()); + WARN_ON(preemptible()); + WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE)); + + if (system_supports_sve() || system_supports_sme()) { + switch (current->thread.fp_type) { + case FP_STATE_FPSIMD: + /* Stop tracking SVE for this task until next use. */ + if (test_and_clear_thread_flag(TIF_SVE)) + sve_user_disable(); + break; + case FP_STATE_SVE: + if (!thread_sm_enabled(¤t->thread) && + !WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE))) + sve_user_enable(); + + if (test_thread_flag(TIF_SVE)) + sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); + + restore_sve_regs = true; + restore_ffr = true; + break; + default: + /* + * This indicates either a bug in + * fpsimd_save_user_state() or memory corruption, we + * should always record an explicit format + * when we save. We always at least have the + * memory allocated for FPSMID registers so + * try that and hope for the best. + */ + WARN_ON_ONCE(1); + clear_thread_flag(TIF_SVE); + break; + } + } - if (system_supports_sve() && test_thread_flag(TIF_SVE)) + /* Restore SME, override SVE register configuration if needed */ + if (system_supports_sme()) { + unsigned long sme_vl = task_get_sme_vl(current); + + /* Ensure VL is set up for restoring data */ + if (test_thread_flag(TIF_SME)) + sme_set_vq(sve_vq_from_vl(sme_vl) - 1); + + write_sysreg_s(current->thread.svcr, SYS_SVCR); + + if (thread_za_enabled(¤t->thread)) + sme_load_state(current->thread.sme_state, + system_supports_sme2()); + + if (thread_sm_enabled(¤t->thread)) + restore_ffr = system_supports_fa64(); + } + + if (restore_sve_regs) { + WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE); sve_load_state(sve_pffr(¤t->thread), ¤t->thread.uw.fpsimd_state.fpsr, - sve_vq_from_vl(current->thread.sve_vl) - 1); - else + restore_ffr); + } else { + WARN_ON_ONCE(current->thread.fp_type != FP_STATE_FPSIMD); fpsimd_load_state(¤t->thread.uw.fpsimd_state); + } } /* * Ensure FPSIMD/SVE storage in memory for the loaded context is up to - * date with respect to the CPU registers. + * date with respect to the CPU registers. Note carefully that the + * current context is the context last bound to the CPU stored in + * last, if KVM is involved this may be the guest VM context rather + * than the host thread for the VM pointed to by current. This means + * that we must always reference the state storage via last rather + * than via current, if we are saving KVM state then it will have + * ensured that the type of registers to save is set in last->to_save. */ -static void fpsimd_save(void) +static void fpsimd_save_user_state(void) { - struct fpsimd_last_state_struct const *last = + struct cpu_fp_state const *last = this_cpu_ptr(&fpsimd_last_state); /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */ + bool save_sve_regs = false; + bool save_ffr; + unsigned int vl; - WARN_ON(!have_cpu_fpsimd_context()); + WARN_ON(!system_supports_fpsimd()); + WARN_ON(preemptible()); - if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { - if (system_supports_sve() && test_thread_flag(TIF_SVE)) { - if (WARN_ON(sve_get_vl() != last->sve_vl)) { - /* - * Can't save the user regs, so current would - * re-enter user with corrupt state. - * There's no way to recover, so kill it: - */ - force_signal_inject(SIGKILL, SI_KERNEL, 0); - return; - } + if (test_thread_flag(TIF_FOREIGN_FPSTATE)) + return; + + /* + * If a task is in a syscall the ABI allows us to only + * preserve the state shared with FPSIMD so don't bother + * saving the full SVE state in that case. + */ + if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE) && + !in_syscall(current_pt_regs())) || + last->to_save == FP_STATE_SVE) { + save_sve_regs = true; + save_ffr = true; + vl = last->sve_vl; + } - sve_save_state((char *)last->sve_state + - sve_ffr_offset(last->sve_vl), - &last->st->fpsr); - } else - fpsimd_save_state(last->st); + if (system_supports_sme()) { + u64 *svcr = last->svcr; + + *svcr = read_sysreg_s(SYS_SVCR); + + if (*svcr & SVCR_ZA_MASK) + sme_save_state(last->sme_state, + system_supports_sme2()); + + /* If we are in streaming mode override regular SVE. */ + if (*svcr & SVCR_SM_MASK) { + save_sve_regs = true; + save_ffr = system_supports_fa64(); + vl = last->sme_vl; + } + } + + if (IS_ENABLED(CONFIG_ARM64_SVE) && save_sve_regs) { + /* Get the configured VL from RDVL, will account for SM */ + if (WARN_ON(sve_get_vl() != vl)) { + /* + * Can't save the user regs, so current would + * re-enter user with corrupt state. + * There's no way to recover, so kill it: + */ + force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); + return; + } + + sve_save_state((char *)last->sve_state + + sve_ffr_offset(vl), + &last->st->fpsr, save_ffr); + *last->fp_type = FP_STATE_SVE; + } else { + fpsimd_save_state(last->st); + *last->fp_type = FP_STATE_FPSIMD; } } @@ -317,33 +504,38 @@ static void fpsimd_save(void) * If things go wrong there's a bug somewhere, but try to fall back to a * safe choice. */ -static unsigned int find_supported_vector_length(unsigned int vl) +static unsigned int find_supported_vector_length(enum vec_type type, + unsigned int vl) { + struct vl_info *info = &vl_info[type]; int bit; - int max_vl = sve_max_vl; + int max_vl = info->max_vl; if (WARN_ON(!sve_vl_valid(vl))) - vl = SVE_VL_MIN; + vl = info->min_vl; if (WARN_ON(!sve_vl_valid(max_vl))) - max_vl = SVE_VL_MIN; + max_vl = info->min_vl; if (vl > max_vl) vl = max_vl; + if (vl < info->min_vl) + vl = info->min_vl; - bit = find_next_bit(sve_vq_map, SVE_VQ_MAX, + bit = find_next_bit(info->vq_map, SVE_VQ_MAX, __vq_to_bit(sve_vq_from_vl(vl))); return sve_vl_from_vq(__bit_to_vq(bit)); } -#ifdef CONFIG_SYSCTL +#if defined(CONFIG_ARM64_SVE) && defined(CONFIG_SYSCTL) -static int sve_proc_do_default_vl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int vec_proc_do_default_vl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { + struct vl_info *info = table->extra1; + enum vec_type type = info->type; int ret; - int vl = sve_default_vl; + int vl = get_default_vl(type); struct ctl_table tmp_table = { .data = &vl, .maxlen = sizeof(vl), @@ -355,12 +547,12 @@ static int sve_proc_do_default_vl(struct ctl_table *table, int write, /* Writing -1 has the special meaning "set to max": */ if (vl == -1) - vl = sve_max_vl; + vl = info->max_vl; if (!sve_vl_valid(vl)) return -EINVAL; - sve_default_vl = find_supported_vector_length(vl); + set_default_vl(type, find_supported_vector_length(type, vl)); return 0; } @@ -368,9 +560,9 @@ static struct ctl_table sve_default_vl_table[] = { { .procname = "sve_default_vector_length", .mode = 0644, - .proc_handler = sve_proc_do_default_vl, + .proc_handler = vec_proc_do_default_vl, + .extra1 = &vl_info[ARM64_VEC_SVE], }, - { } }; static int __init sve_sysctl_init(void) @@ -382,9 +574,32 @@ static int __init sve_sysctl_init(void) return 0; } -#else /* ! CONFIG_SYSCTL */ +#else /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ static int __init sve_sysctl_init(void) { return 0; } -#endif /* ! CONFIG_SYSCTL */ +#endif /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ + +#if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL) +static struct ctl_table sme_default_vl_table[] = { + { + .procname = "sme_default_vector_length", + .mode = 0644, + .proc_handler = vec_proc_do_default_vl, + .extra1 = &vl_info[ARM64_VEC_SME], + }, +}; + +static int __init sme_sysctl_init(void) +{ + if (system_supports_sme()) + if (!register_sysctl("abi", sme_default_vl_table)) + return -EINVAL; + + return 0; +} + +#else /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */ +static int __init sme_sysctl_init(void) { return 0; } +#endif /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */ #define ZREG(sve_state, vq, n) ((char *)(sve_state) + \ (SVE_SIG_ZREG_OFFSET(vq, n) - SVE_SIG_REGS_OFFSET)) @@ -436,10 +651,10 @@ static void fpsimd_to_sve(struct task_struct *task) void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) return; - vq = sve_vq_from_vl(task->thread.sve_vl); + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); __fpsimd_to_sve(sst, fst, vq); } @@ -456,16 +671,17 @@ static void fpsimd_to_sve(struct task_struct *task) */ static void sve_to_fpsimd(struct task_struct *task) { - unsigned int vq; + unsigned int vq, vl; void const *sst = task->thread.sve_state; struct user_fpsimd_state *fst = &task->thread.uw.fpsimd_state; unsigned int i; __uint128_t const *p; - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) return; - vq = sve_vq_from_vl(task->thread.sve_vl); + vl = thread_get_cur_vl(&task->thread); + vq = sve_vq_from_vl(vl); for (i = 0; i < SVE_NUM_ZREGS; ++i) { p = (__uint128_t const *)ZREG(sst, vq, i); fst->vregs[i] = arm64_le128_to_cpu(*p); @@ -473,6 +689,22 @@ static void sve_to_fpsimd(struct task_struct *task) } #ifdef CONFIG_ARM64_SVE +/* + * Call __sve_free() directly only if you know task can't be scheduled + * or preempted. + */ +static void __sve_free(struct task_struct *task) +{ + kfree(task->thread.sve_state); + task->thread.sve_state = NULL; +} + +static void sve_free(struct task_struct *task) +{ + WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + + __sve_free(task); +} /* * Return how many bytes of memory are required to store the full SVE @@ -480,7 +712,14 @@ static void sve_to_fpsimd(struct task_struct *task) */ size_t sve_state_size(struct task_struct const *task) { - return SVE_SIG_REGS_SIZE(sve_vq_from_vl(task->thread.sve_vl)); + unsigned int vl = 0; + + if (system_supports_sve()) + vl = task_get_sve_vl(task); + if (system_supports_sme()) + vl = max(vl, task_get_sme_vl(task)); + + return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); } /* @@ -493,26 +732,35 @@ size_t sve_state_size(struct task_struct const *task) * do_sve_acc() case, there is no ABI requirement to hide stale data * written previously be task. */ -void sve_alloc(struct task_struct *task) +void sve_alloc(struct task_struct *task, bool flush) { if (task->thread.sve_state) { - memset(task->thread.sve_state, 0, sve_state_size(current)); + if (flush) + memset(task->thread.sve_state, 0, + sve_state_size(task)); return; } /* This is a small allocation (maximum ~8KB) and Should Not Fail. */ task->thread.sve_state = kzalloc(sve_state_size(task), GFP_KERNEL); - - /* - * If future SVE revisions can have larger vectors though, - * this may cease to be true: - */ - BUG_ON(!task->thread.sve_state); } /* + * Force the FPSIMD state shared with SVE to be updated in the SVE state + * even if the SVE state is the current active state. + * + * This should only be called by ptrace. task must be non-runnable. + * task->thread.sve_state must point to at least sve_state_size(task) + * bytes of allocated kernel memory. + */ +void fpsimd_force_sync_to_sve(struct task_struct *task) +{ + fpsimd_to_sve(task); +} + +/* * Ensure that task->thread.sve_state is up to date with respect to * the user task, irrespective of when SVE is in use or not. * @@ -522,7 +770,8 @@ void sve_alloc(struct task_struct *task) */ void fpsimd_sync_to_sve(struct task_struct *task) { - if (!test_tsk_thread_flag(task, TIF_SVE)) + if (!test_tsk_thread_flag(task, TIF_SVE) && + !thread_sm_enabled(&task->thread)) fpsimd_to_sve(task); } @@ -536,7 +785,7 @@ void fpsimd_sync_to_sve(struct task_struct *task) */ void sve_sync_to_fpsimd(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_SVE)) + if (task->thread.fp_type == FP_STATE_SVE) sve_to_fpsimd(task); } @@ -558,18 +807,21 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task) void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - if (!test_tsk_thread_flag(task, TIF_SVE)) + if (!test_tsk_thread_flag(task, TIF_SVE) && + !thread_sm_enabled(&task->thread)) return; - vq = sve_vq_from_vl(task->thread.sve_vl); + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); memset(sst, 0, SVE_SIG_REGS_SIZE(vq)); __fpsimd_to_sve(sst, fst, vq); } -int sve_set_vector_length(struct task_struct *task, +int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags) { + bool free_sme = false; + if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT | PR_SVE_SET_VL_ONEXEC)) return -EINVAL; @@ -578,57 +830,82 @@ int sve_set_vector_length(struct task_struct *task, return -EINVAL; /* - * Clamp to the maximum vector length that VL-agnostic SVE code can - * work with. A flag may be assigned in the future to allow setting - * of larger vector lengths without confusing older software. + * Clamp to the maximum vector length that VL-agnostic code + * can work with. A flag may be assigned in the future to + * allow setting of larger vector lengths without confusing + * older software. */ - if (vl > SVE_VL_ARCH_MAX) - vl = SVE_VL_ARCH_MAX; + if (vl > VL_ARCH_MAX) + vl = VL_ARCH_MAX; - vl = find_supported_vector_length(vl); + vl = find_supported_vector_length(type, vl); if (flags & (PR_SVE_VL_INHERIT | PR_SVE_SET_VL_ONEXEC)) - task->thread.sve_vl_onexec = vl; + task_set_vl_onexec(task, type, vl); else /* Reset VL to system default on next exec: */ - task->thread.sve_vl_onexec = 0; + task_set_vl_onexec(task, type, 0); /* Only actually set the VL if not deferred: */ if (flags & PR_SVE_SET_VL_ONEXEC) goto out; - if (vl == task->thread.sve_vl) + if (vl == task_get_vl(task, type)) goto out; /* * To ensure the FPSIMD bits of the SVE vector registers are preserved, * write any live register state back to task_struct, and convert to a - * non-SVE thread. + * regular FPSIMD thread. */ if (task == current) { get_cpu_fpsimd_context(); - fpsimd_save(); + fpsimd_save_user_state(); } fpsimd_flush_task_state(task); - if (test_and_clear_tsk_thread_flag(task, TIF_SVE)) + if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || + thread_sm_enabled(&task->thread)) { sve_to_fpsimd(task); + task->thread.fp_type = FP_STATE_FPSIMD; + } + + if (system_supports_sme()) { + if (type == ARM64_VEC_SME || + !(task->thread.svcr & (SVCR_SM_MASK | SVCR_ZA_MASK))) { + /* + * We are changing the SME VL or weren't using + * SME anyway, discard the state and force a + * reallocation. + */ + task->thread.svcr &= ~(SVCR_SM_MASK | + SVCR_ZA_MASK); + clear_tsk_thread_flag(task, TIF_SME); + free_sme = true; + } + } if (task == current) put_cpu_fpsimd_context(); + task_set_vl(task, type, vl); + /* - * Force reallocation of task SVE state to the correct size - * on next use: + * Free the changed states if they are not in use, SME will be + * reallocated to the correct size on next use and we just + * allocate SVE now in case it is needed for use in streaming + * mode. */ sve_free(task); + sve_alloc(task, true); - task->thread.sve_vl = vl; + if (free_sme) + sme_free(task); out: - update_tsk_thread_flag(task, TIF_SVE_VL_INHERIT, + update_tsk_thread_flag(task, vec_vl_inherit_flag(type), flags & PR_SVE_VL_INHERIT); return 0; @@ -636,20 +913,21 @@ out: /* * Encode the current vector length and flags for return. - * This is only required for prctl(): ptrace has separate fields + * This is only required for prctl(): ptrace has separate fields. + * SVE and SME use the same bits for _ONEXEC and _INHERIT. * - * flags are as for sve_set_vector_length(). + * flags are as for vec_set_vector_length(). */ -static int sve_prctl_status(unsigned long flags) +static int vec_prctl_status(enum vec_type type, unsigned long flags) { int ret; if (flags & PR_SVE_SET_VL_ONEXEC) - ret = current->thread.sve_vl_onexec; + ret = task_get_vl_onexec(current, type); else - ret = current->thread.sve_vl; + ret = task_get_vl(current, type); - if (test_thread_flag(TIF_SVE_VL_INHERIT)) + if (test_thread_flag(vec_vl_inherit_flag(type))) ret |= PR_SVE_VL_INHERIT; return ret; @@ -664,38 +942,81 @@ int sve_set_current_vl(unsigned long arg) vl = arg & PR_SVE_VL_LEN_MASK; flags = arg & ~vl; - if (!system_supports_sve()) + if (!system_supports_sve() || is_compat_task()) return -EINVAL; - ret = sve_set_vector_length(current, vl, flags); + ret = vec_set_vector_length(current, ARM64_VEC_SVE, vl, flags); if (ret) return ret; - return sve_prctl_status(flags); + return vec_prctl_status(ARM64_VEC_SVE, flags); } /* PR_SVE_GET_VL */ int sve_get_current_vl(void) { - if (!system_supports_sve()) + if (!system_supports_sve() || is_compat_task()) + return -EINVAL; + + return vec_prctl_status(ARM64_VEC_SVE, 0); +} + +#ifdef CONFIG_ARM64_SME +/* PR_SME_SET_VL */ +int sme_set_current_vl(unsigned long arg) +{ + unsigned long vl, flags; + int ret; + + vl = arg & PR_SME_VL_LEN_MASK; + flags = arg & ~vl; + + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + ret = vec_set_vector_length(current, ARM64_VEC_SME, vl, flags); + if (ret) + return ret; + + return vec_prctl_status(ARM64_VEC_SME, flags); +} + +/* PR_SME_GET_VL */ +int sme_get_current_vl(void) +{ + if (!system_supports_sme() || is_compat_task()) return -EINVAL; - return sve_prctl_status(0); + return vec_prctl_status(ARM64_VEC_SME, 0); } +#endif /* CONFIG_ARM64_SME */ -static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) +static void vec_probe_vqs(struct vl_info *info, + DECLARE_BITMAP(map, SVE_VQ_MAX)) { unsigned int vq, vl; - unsigned long zcr; bitmap_zero(map, SVE_VQ_MAX); - zcr = ZCR_ELx_LEN_MASK; - zcr = read_sysreg_s(SYS_ZCR_EL1) & ~zcr; - for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) { - write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1); /* self-syncing */ - vl = sve_get_vl(); + write_vl(info->type, vq - 1); /* self-syncing */ + + switch (info->type) { + case ARM64_VEC_SVE: + vl = sve_get_vl(); + break; + case ARM64_VEC_SME: + vl = sme_get_vl(); + break; + default: + vl = 0; + break; + } + + /* Minimum VL identified? */ + if (sve_vq_from_vl(vl) > vq) + break; + vq = sve_vq_from_vl(vl); /* skip intervening lengths */ set_bit(__vq_to_bit(vq), map); } @@ -705,10 +1026,11 @@ static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) * Initialise the set of known supported VQs for the boot CPU. * This is called during kernel boot, before secondary CPUs are brought up. */ -void __init sve_init_vq_map(void) +void __init vec_init_vq_map(enum vec_type type) { - sve_probe_vqs(sve_vq_map); - bitmap_copy(sve_vq_partial_map, sve_vq_map, SVE_VQ_MAX); + struct vl_info *info = &vl_info[type]; + vec_probe_vqs(info, info->vq_map); + bitmap_copy(info->vq_partial_map, info->vq_map, SVE_VQ_MAX); } /* @@ -716,30 +1038,33 @@ void __init sve_init_vq_map(void) * those not supported by the current CPU. * This function is called during the bring-up of early secondary CPUs only. */ -void sve_update_vq_map(void) +void vec_update_vq_map(enum vec_type type) { + struct vl_info *info = &vl_info[type]; DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); - sve_probe_vqs(tmp_map); - bitmap_and(sve_vq_map, sve_vq_map, tmp_map, SVE_VQ_MAX); - bitmap_or(sve_vq_partial_map, sve_vq_partial_map, tmp_map, SVE_VQ_MAX); + vec_probe_vqs(info, tmp_map); + bitmap_and(info->vq_map, info->vq_map, tmp_map, SVE_VQ_MAX); + bitmap_or(info->vq_partial_map, info->vq_partial_map, tmp_map, + SVE_VQ_MAX); } /* * Check whether the current CPU supports all VQs in the committed set. * This function is called during the bring-up of late secondary CPUs only. */ -int sve_verify_vq_map(void) +int vec_verify_vq_map(enum vec_type type) { + struct vl_info *info = &vl_info[type]; DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); unsigned long b; - sve_probe_vqs(tmp_map); + vec_probe_vqs(info, tmp_map); bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX); - if (bitmap_intersects(tmp_map, sve_vq_map, SVE_VQ_MAX)) { - pr_warn("SVE: cpu%d: Required vector length(s) missing\n", - smp_processor_id()); + if (bitmap_intersects(tmp_map, info->vq_map, SVE_VQ_MAX)) { + pr_warn("%s: cpu%d: Required vector length(s) missing\n", + info->name, smp_processor_id()); return -EINVAL; } @@ -755,7 +1080,7 @@ int sve_verify_vq_map(void) /* Recover the set of supported VQs: */ bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX); /* Find VQs supported that are not globally supported: */ - bitmap_andnot(tmp_map, tmp_map, sve_vq_map, SVE_VQ_MAX); + bitmap_andnot(tmp_map, tmp_map, info->vq_map, SVE_VQ_MAX); /* Find the lowest such VQ, if any: */ b = find_last_bit(tmp_map, SVE_VQ_MAX); @@ -766,9 +1091,9 @@ int sve_verify_vq_map(void) * Mismatches above sve_max_virtualisable_vl are fine, since * no guest is allowed to configure ZCR_EL2.LEN to exceed this: */ - if (sve_vl_from_vq(__bit_to_vq(b)) <= sve_max_virtualisable_vl) { - pr_warn("SVE: cpu%d: Unsupported vector length(s) present\n", - smp_processor_id()); + if (sve_vl_from_vq(__bit_to_vq(b)) <= info->max_virtualisable_vl) { + pr_warn("%s: cpu%d: Unsupported vector length(s) present\n", + info->name, smp_processor_id()); return -EINVAL; } @@ -777,19 +1102,25 @@ int sve_verify_vq_map(void) static void __init sve_efi_setup(void) { + int max_vl = 0; + int i; + if (!IS_ENABLED(CONFIG_EFI)) return; + for (i = 0; i < ARRAY_SIZE(vl_info); i++) + max_vl = max(vl_info[i].max_vl, max_vl); + /* * alloc_percpu() warns and prints a backtrace if this goes wrong. * This is evidence of a crippled system and we are returning void, * so no attempt is made to handle this situation here. */ - if (!sve_vl_valid(sve_max_vl)) + if (!sve_vl_valid(max_vl)) goto fail; efi_sve_state = __alloc_percpu( - SVE_SIG_REGS_SIZE(sve_vq_from_vl(sve_max_vl)), SVE_VQ_BYTES); + SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), SVE_VQ_BYTES); if (!efi_sve_state) goto fail; @@ -799,48 +1130,18 @@ fail: panic("Cannot allocate percpu memory for EFI SVE save/restore"); } -/* - * Enable SVE for EL1. - * Intended for use by the cpufeatures code during CPU boot. - */ -void sve_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p) { write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1); isb(); } -/* - * Read the pseudo-ZCR used by cpufeatures to identify the supported SVE - * vector length. - * - * Use only if SVE is present. - * This function clobbers the SVE vector length. - */ -u64 read_zcr_features(void) -{ - u64 zcr; - unsigned int vq_max; - - /* - * Set the maximum possible VL, and write zeroes to all other - * bits to see if they stick. - */ - sve_kernel_enable(NULL); - write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL1); - - zcr = read_sysreg_s(SYS_ZCR_EL1); - zcr &= ~(u64)ZCR_ELx_LEN_MASK; /* find sticky 1s outside LEN field */ - vq_max = sve_vq_from_vl(sve_get_vl()); - zcr |= vq_max - 1; /* set LEN field to maximum effective value */ - - return zcr; -} - void __init sve_setup(void) { - u64 zcr; + struct vl_info *info = &vl_info[ARM64_VEC_SVE]; DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); unsigned long b; + int max_bit; if (!system_supports_sve()) return; @@ -850,49 +1151,43 @@ void __init sve_setup(void) * so sve_vq_map must have at least SVE_VQ_MIN set. * If something went wrong, at least try to patch it up: */ - if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), sve_vq_map))) - set_bit(__vq_to_bit(SVE_VQ_MIN), sve_vq_map); - - zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); - sve_max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1); + if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map))) + set_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map); - /* - * Sanity-check that the max VL we determined through CPU features - * corresponds properly to sve_vq_map. If not, do our best: - */ - if (WARN_ON(sve_max_vl != find_supported_vector_length(sve_max_vl))) - sve_max_vl = find_supported_vector_length(sve_max_vl); + max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX); + info->max_vl = sve_vl_from_vq(__bit_to_vq(max_bit)); /* * For the default VL, pick the maximum supported value <= 64. * VL == 64 is guaranteed not to grow the signal frame. */ - sve_default_vl = find_supported_vector_length(64); + set_sve_default_vl(find_supported_vector_length(ARM64_VEC_SVE, 64)); - bitmap_andnot(tmp_map, sve_vq_partial_map, sve_vq_map, + bitmap_andnot(tmp_map, info->vq_partial_map, info->vq_map, SVE_VQ_MAX); b = find_last_bit(tmp_map, SVE_VQ_MAX); if (b >= SVE_VQ_MAX) /* No non-virtualisable VLs found */ - sve_max_virtualisable_vl = SVE_VQ_MAX; + info->max_virtualisable_vl = SVE_VQ_MAX; else if (WARN_ON(b == SVE_VQ_MAX - 1)) /* No virtualisable VLs? This is architecturally forbidden. */ - sve_max_virtualisable_vl = SVE_VQ_MIN; + info->max_virtualisable_vl = SVE_VQ_MIN; else /* b + 1 < SVE_VQ_MAX */ - sve_max_virtualisable_vl = sve_vl_from_vq(__bit_to_vq(b + 1)); + info->max_virtualisable_vl = sve_vl_from_vq(__bit_to_vq(b + 1)); - if (sve_max_virtualisable_vl > sve_max_vl) - sve_max_virtualisable_vl = sve_max_vl; + if (info->max_virtualisable_vl > info->max_vl) + info->max_virtualisable_vl = info->max_vl; - pr_info("SVE: maximum available vector length %u bytes per vector\n", - sve_max_vl); - pr_info("SVE: default vector length %u bytes per vector\n", - sve_default_vl); + pr_info("%s: maximum available vector length %u bytes per vector\n", + info->name, info->max_vl); + pr_info("%s: default vector length %u bytes per vector\n", + info->name, get_sve_default_vl()); /* KVM decides whether to support mismatched systems. Just warn here: */ - if (sve_max_virtualisable_vl < sve_max_vl) - pr_warn("SVE: unvirtualisable vector lengths present\n"); + if (sve_max_virtualisable_vl() < sve_max_vl()) + pr_warn("%s: unvirtualisable vector lengths present\n", + info->name); sve_efi_setup(); } @@ -904,59 +1199,260 @@ void __init sve_setup(void) void fpsimd_release_task(struct task_struct *dead_task) { __sve_free(dead_task); + sme_free(dead_task); } #endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +/* + * Ensure that task->thread.sme_state is allocated and sufficiently large. + * + * This function should be used only in preparation for replacing + * task->thread.sme_state with new data. The memory is always zeroed + * here to prevent stale data from showing through: this is done in + * the interest of testability and predictability, the architecture + * guarantees that when ZA is enabled it will be zeroed. + */ +void sme_alloc(struct task_struct *task, bool flush) +{ + if (task->thread.sme_state) { + if (flush) + memset(task->thread.sme_state, 0, + sme_state_size(task)); + return; + } + + /* This could potentially be up to 64K. */ + task->thread.sme_state = + kzalloc(sme_state_size(task), GFP_KERNEL); +} + +static void sme_free(struct task_struct *task) +{ + kfree(task->thread.sme_state); + task->thread.sme_state = NULL; +} + +void cpu_enable_sme(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* Set priority for all PEs to architecturally defined minimum */ + write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK, + SYS_SMPRI_EL1); + + /* Allow SME in kernel */ + write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1); + isb(); + + /* Allow EL0 to access TPIDR2 */ + write_sysreg(read_sysreg(SCTLR_EL1) | SCTLR_ELx_ENTP2, SCTLR_EL1); + isb(); +} + +void cpu_enable_sme2(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* This must be enabled after SME */ + BUILD_BUG_ON(ARM64_SME2 <= ARM64_SME); + + /* Allow use of ZT0 */ + write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_EZT0_MASK, + SYS_SMCR_EL1); +} + +void cpu_enable_fa64(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* This must be enabled after SME */ + BUILD_BUG_ON(ARM64_SME_FA64 <= ARM64_SME); + + /* Allow use of FA64 */ + write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK, + SYS_SMCR_EL1); +} + +void __init sme_setup(void) +{ + struct vl_info *info = &vl_info[ARM64_VEC_SME]; + int min_bit, max_bit; + + if (!system_supports_sme()) + return; + + /* + * SME doesn't require any particular vector length be + * supported but it does require at least one. We should have + * disabled the feature entirely while bringing up CPUs but + * let's double check here. The bitmap is SVE_VQ_MAP sized for + * sharing with SVE. + */ + WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX)); + + min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); + info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit)); + + max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX); + info->max_vl = sve_vl_from_vq(__bit_to_vq(max_bit)); + + WARN_ON(info->min_vl > info->max_vl); + + /* + * For the default VL, pick the maximum supported value <= 32 + * (256 bits) if there is one since this is guaranteed not to + * grow the signal frame when in streaming mode, otherwise the + * minimum available VL will be used. + */ + set_sme_default_vl(find_supported_vector_length(ARM64_VEC_SME, 32)); + + pr_info("SME: minimum available vector length %u bytes per vector\n", + info->min_vl); + pr_info("SME: maximum available vector length %u bytes per vector\n", + info->max_vl); + pr_info("SME: default vector length %u bytes per vector\n", + get_sme_default_vl()); +} + +#endif /* CONFIG_ARM64_SME */ + +static void sve_init_regs(void) +{ + /* + * Convert the FPSIMD state to SVE, zeroing all the state that + * is not shared with FPSIMD. If (as is likely) the current + * state is live in the registers then do this there and + * update our metadata for the current task including + * disabling the trap, otherwise update our in-memory copy. + * We are guaranteed to not be in streaming mode, we can only + * take a SVE trap when not in streaming mode and we can't be + * in streaming mode when taking a SME trap. + */ + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { + unsigned long vq_minus_one = + sve_vq_from_vl(task_get_sve_vl(current)) - 1; + sve_set_vq(vq_minus_one); + sve_flush_live(true, vq_minus_one); + fpsimd_bind_task_to_cpu(); + } else { + fpsimd_to_sve(current); + current->thread.fp_type = FP_STATE_SVE; + } +} + /* * Trapped SVE access * * Storage is allocated for the full SVE state, the current FPSIMD - * register contents are migrated across, and TIF_SVE is set so that - * the SVE access trap will be disabled the next time this task - * reaches ret_to_user. + * register contents are migrated across, and the access trap is + * disabled. * - * TIF_SVE should be clear on entry: otherwise, task_fpsimd_load() + * TIF_SVE should be clear on entry: otherwise, fpsimd_restore_current_state() * would have disabled the SVE access trap for userspace during * ret_to_user, making an SVE access trap impossible in that case. */ -void do_sve_acc(unsigned int esr, struct pt_regs *regs) +void do_sve_acc(unsigned long esr, struct pt_regs *regs) { /* Even if we chose not to use SVE, the hardware could still trap: */ if (unlikely(!system_supports_sve()) || WARN_ON(is_compat_task())) { - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } - sve_alloc(current); + sve_alloc(current, true); + if (!current->thread.sve_state) { + force_sig(SIGKILL); + return; + } get_cpu_fpsimd_context(); - fpsimd_save(); - - /* Force ret_to_user to reload the registers: */ - fpsimd_flush_task_state(current); - - fpsimd_to_sve(current); if (test_and_set_thread_flag(TIF_SVE)) WARN_ON(1); /* SVE access shouldn't have trapped */ + /* + * Even if the task can have used streaming mode we can only + * generate SVE access traps in normal SVE mode and + * transitioning out of streaming mode may discard any + * streaming mode state. Always clear the high bits to avoid + * any potential errors tracking what is properly initialised. + */ + sve_init_regs(); + + put_cpu_fpsimd_context(); +} + +/* + * Trapped SME access + * + * Storage is allocated for the full SVE and SME state, the current + * FPSIMD register contents are migrated to SVE if SVE is not already + * active, and the access trap is disabled. + * + * TIF_SME should be clear on entry: otherwise, fpsimd_restore_current_state() + * would have disabled the SME access trap for userspace during + * ret_to_user, making an SME access trap impossible in that case. + */ +void do_sme_acc(unsigned long esr, struct pt_regs *regs) +{ + /* Even if we chose not to use SME, the hardware could still trap: */ + if (unlikely(!system_supports_sme()) || WARN_ON(is_compat_task())) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + /* + * If this not a trap due to SME being disabled then something + * is being used in the wrong mode, report as SIGILL. + */ + if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + sve_alloc(current, false); + sme_alloc(current, true); + if (!current->thread.sve_state || !current->thread.sme_state) { + force_sig(SIGKILL); + return; + } + + get_cpu_fpsimd_context(); + + /* With TIF_SME userspace shouldn't generate any traps */ + if (test_and_set_thread_flag(TIF_SME)) + WARN_ON(1); + + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { + unsigned long vq_minus_one = + sve_vq_from_vl(task_get_sme_vl(current)) - 1; + sme_set_vq(vq_minus_one); + + fpsimd_bind_task_to_cpu(); + } + put_cpu_fpsimd_context(); } /* * Trapped FP/ASIMD access. */ -void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) +void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs) { - /* TODO: implement lazy context saving/restoring */ - WARN_ON(1); + /* Even if we chose not to use FPSIMD, the hardware could still trap: */ + if (!system_supports_fpsimd()) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + /* + * When FPSIMD is enabled, we should never take a trap unless something + * has gone very wrong. + */ + BUG(); } /* * Raise a SIGFPE for the current process. */ -void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) +void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs) { unsigned int si_code = FPE_FLTUNK; @@ -978,6 +1474,34 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) current); } +static void fpsimd_load_kernel_state(struct task_struct *task) +{ + struct cpu_fp_state *last = this_cpu_ptr(&fpsimd_last_state); + + /* + * Elide the load if this CPU holds the most recent kernel mode + * FPSIMD context of the current task. + */ + if (last->st == &task->thread.kernel_fpsimd_state && + task->thread.kernel_fpsimd_cpu == smp_processor_id()) + return; + + fpsimd_load_state(&task->thread.kernel_fpsimd_state); +} + +static void fpsimd_save_kernel_state(struct task_struct *task) +{ + struct cpu_fp_state cpu_fp_state = { + .st = &task->thread.kernel_fpsimd_state, + .to_save = FP_STATE_FPSIMD, + }; + + fpsimd_save_state(&task->thread.kernel_fpsimd_state); + fpsimd_bind_state_to_cpu(&cpu_fp_state); + + task->thread.kernel_fpsimd_cpu = smp_processor_id(); +} + void fpsimd_thread_switch(struct task_struct *next) { bool wrong_task, wrong_cpu; @@ -985,29 +1509,72 @@ void fpsimd_thread_switch(struct task_struct *next) if (!system_supports_fpsimd()) return; - __get_cpu_fpsimd_context(); + WARN_ON_ONCE(!irqs_disabled()); /* Save unsaved fpsimd state, if any: */ - fpsimd_save(); + if (test_thread_flag(TIF_KERNEL_FPSTATE)) + fpsimd_save_kernel_state(current); + else + fpsimd_save_user_state(); + + if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) { + fpsimd_load_kernel_state(next); + set_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE); + } else { + /* + * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's + * state. For kernel threads, FPSIMD registers are never + * loaded with user mode FPSIMD state and so wrong_task and + * wrong_cpu will always be true. + */ + wrong_task = __this_cpu_read(fpsimd_last_state.st) != + &next->thread.uw.fpsimd_state; + wrong_cpu = next->thread.fpsimd_cpu != smp_processor_id(); + + update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, + wrong_task || wrong_cpu); + } +} + +static void fpsimd_flush_thread_vl(enum vec_type type) +{ + int vl, supported_vl; /* - * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's - * state. For kernel threads, FPSIMD registers are never loaded - * and wrong_task and wrong_cpu will always be true. + * Reset the task vector length as required. This is where we + * ensure that all user tasks have a valid vector length + * configured: no kernel task can become a user task without + * an exec and hence a call to this function. By the time the + * first call to this function is made, all early hardware + * probing is complete, so __sve_default_vl should be valid. + * If a bug causes this to go wrong, we make some noise and + * try to fudge thread.sve_vl to a safe value here. */ - wrong_task = __this_cpu_read(fpsimd_last_state.st) != - &next->thread.uw.fpsimd_state; - wrong_cpu = next->thread.fpsimd_cpu != smp_processor_id(); + vl = task_get_vl_onexec(current, type); + if (!vl) + vl = get_default_vl(type); + + if (WARN_ON(!sve_vl_valid(vl))) + vl = vl_info[type].min_vl; + + supported_vl = find_supported_vector_length(type, vl); + if (WARN_ON(supported_vl != vl)) + vl = supported_vl; - update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, - wrong_task || wrong_cpu); + task_set_vl(current, type, vl); - __put_cpu_fpsimd_context(); + /* + * If the task is not set to inherit, ensure that the vector + * length will be reset by a subsequent exec: + */ + if (!test_thread_flag(vec_vl_inherit_flag(type))) + task_set_vl_onexec(current, type, 0); } void fpsimd_flush_thread(void) { - int vl, supported_vl; + void *sve_state = NULL; + void *sme_state = NULL; if (!system_supports_fpsimd()) return; @@ -1020,40 +1587,30 @@ void fpsimd_flush_thread(void) if (system_supports_sve()) { clear_thread_flag(TIF_SVE); - sve_free(current); - /* - * Reset the task vector length as required. - * This is where we ensure that all user tasks have a valid - * vector length configured: no kernel task can become a user - * task without an exec and hence a call to this function. - * By the time the first call to this function is made, all - * early hardware probing is complete, so sve_default_vl - * should be valid. - * If a bug causes this to go wrong, we make some noise and - * try to fudge thread.sve_vl to a safe value here. - */ - vl = current->thread.sve_vl_onexec ? - current->thread.sve_vl_onexec : sve_default_vl; + /* Defer kfree() while in atomic context */ + sve_state = current->thread.sve_state; + current->thread.sve_state = NULL; - if (WARN_ON(!sve_vl_valid(vl))) - vl = SVE_VL_MIN; + fpsimd_flush_thread_vl(ARM64_VEC_SVE); + } - supported_vl = find_supported_vector_length(vl); - if (WARN_ON(supported_vl != vl)) - vl = supported_vl; + if (system_supports_sme()) { + clear_thread_flag(TIF_SME); - current->thread.sve_vl = vl; + /* Defer kfree() while in atomic context */ + sme_state = current->thread.sme_state; + current->thread.sme_state = NULL; - /* - * If the task is not set to inherit, ensure that the vector - * length will be reset by a subsequent exec: - */ - if (!test_thread_flag(TIF_SVE_VL_INHERIT)) - current->thread.sve_vl_onexec = 0; + fpsimd_flush_thread_vl(ARM64_VEC_SME); + current->thread.svcr = 0; } + current->thread.fp_type = FP_STATE_FPSIMD; + put_cpu_fpsimd_context(); + kfree(sve_state); + kfree(sme_state); } /* @@ -1066,7 +1623,7 @@ void fpsimd_preserve_current_state(void) return; get_cpu_fpsimd_context(); - fpsimd_save(); + fpsimd_save_user_state(); put_cpu_fpsimd_context(); } @@ -1078,58 +1635,115 @@ void fpsimd_preserve_current_state(void) void fpsimd_signal_preserve_current_state(void) { fpsimd_preserve_current_state(); - if (system_supports_sve() && test_thread_flag(TIF_SVE)) + if (test_thread_flag(TIF_SVE)) sve_to_fpsimd(current); } /* + * Called by KVM when entering the guest. + */ +void fpsimd_kvm_prepare(void) +{ + if (!system_supports_sve()) + return; + + /* + * KVM does not save host SVE state since we can only enter + * the guest from a syscall so the ABI means that only the + * non-saved SVE state needs to be saved. If we have left + * SVE enabled for performance reasons then update the task + * state to be FPSIMD only. + */ + get_cpu_fpsimd_context(); + + if (test_and_clear_thread_flag(TIF_SVE)) { + sve_to_fpsimd(current); + current->thread.fp_type = FP_STATE_FPSIMD; + } + + put_cpu_fpsimd_context(); +} + +/* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling * this function. */ -void fpsimd_bind_task_to_cpu(void) +static void fpsimd_bind_task_to_cpu(void) { - struct fpsimd_last_state_struct *last = - this_cpu_ptr(&fpsimd_last_state); + struct cpu_fp_state *last = this_cpu_ptr(&fpsimd_last_state); + WARN_ON(!system_supports_fpsimd()); last->st = ¤t->thread.uw.fpsimd_state; last->sve_state = current->thread.sve_state; - last->sve_vl = current->thread.sve_vl; + last->sme_state = current->thread.sme_state; + last->sve_vl = task_get_sve_vl(current); + last->sme_vl = task_get_sme_vl(current); + last->svcr = ¤t->thread.svcr; + last->fp_type = ¤t->thread.fp_type; + last->to_save = FP_STATE_CURRENT; current->thread.fpsimd_cpu = smp_processor_id(); + /* + * Toggle SVE and SME trapping for userspace if needed, these + * are serialsied by ret_to_user(). + */ + if (system_supports_sme()) { + if (test_thread_flag(TIF_SME)) + sme_user_enable(); + else + sme_user_disable(); + } + if (system_supports_sve()) { - /* Toggle SVE trapping for userspace if needed */ if (test_thread_flag(TIF_SVE)) sve_user_enable(); else sve_user_disable(); - - /* Serialised by exception return to user */ } } -void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state, - unsigned int sve_vl) +void fpsimd_bind_state_to_cpu(struct cpu_fp_state *state) { - struct fpsimd_last_state_struct *last = - this_cpu_ptr(&fpsimd_last_state); + struct cpu_fp_state *last = this_cpu_ptr(&fpsimd_last_state); + WARN_ON(!system_supports_fpsimd()); WARN_ON(!in_softirq() && !irqs_disabled()); - last->st = st; - last->sve_state = sve_state; - last->sve_vl = sve_vl; + *last = *state; } /* * Load the userland FPSIMD state of 'current' from memory, but only if the * FPSIMD state already held in the registers is /not/ the most recent FPSIMD - * state of 'current' + * state of 'current'. This is called when we are preparing to return to + * userspace to ensure that userspace sees a good register state. */ void fpsimd_restore_current_state(void) { - if (!system_supports_fpsimd()) + /* + * TIF_FOREIGN_FPSTATE is set on the init task and copied by + * arch_dup_task_struct() regardless of whether FP/SIMD is detected. + * Thus user threads can have this set even when FP/SIMD hasn't been + * detected. + * + * When FP/SIMD is detected, begin_new_exec() will set + * TIF_FOREIGN_FPSTATE via flush_thread() -> fpsimd_flush_thread(), + * and fpsimd_thread_switch() will set TIF_FOREIGN_FPSTATE when + * switching tasks. We detect FP/SIMD before we exec the first user + * process, ensuring this has TIF_FOREIGN_FPSTATE set and + * do_notify_resume() will call fpsimd_restore_current_state() to + * install the user FP/SIMD context. + * + * When FP/SIMD is not detected, nothing else will clear or set + * TIF_FOREIGN_FPSTATE prior to the first return to userspace, and + * we must clear TIF_FOREIGN_FPSTATE to avoid do_notify_resume() + * looping forever calling fpsimd_restore_current_state(). + */ + if (!system_supports_fpsimd()) { + clear_thread_flag(TIF_FOREIGN_FPSTATE); return; + } get_cpu_fpsimd_context(); @@ -1144,17 +1758,19 @@ void fpsimd_restore_current_state(void) /* * Load an updated userland FPSIMD state for 'current' from memory and set the * flag that indicates that the FPSIMD register contents are the most recent - * FPSIMD state of 'current' + * FPSIMD state of 'current'. This is used by the signal code to restore the + * register state when returning from a signal handler in FPSIMD only cases, + * any SVE context will be discarded. */ void fpsimd_update_current_state(struct user_fpsimd_state const *state) { - if (!system_supports_fpsimd()) + if (WARN_ON(!system_supports_fpsimd())) return; get_cpu_fpsimd_context(); current->thread.uw.fpsimd_state = *state; - if (system_supports_sve() && test_thread_flag(TIF_SVE)) + if (test_thread_flag(TIF_SVE)) fpsimd_to_sve(current); task_fpsimd_load(); @@ -1179,7 +1795,13 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) void fpsimd_flush_task_state(struct task_struct *t) { t->thread.fpsimd_cpu = NR_CPUS; - + /* + * If we don't support fpsimd, bail out after we have + * reset the fpsimd_cpu for this task and clear the + * FPSTATE. + */ + if (!system_supports_fpsimd()) + return; barrier(); set_tsk_thread_flag(t, TIF_FOREIGN_FPSTATE); @@ -1193,7 +1815,17 @@ void fpsimd_flush_task_state(struct task_struct *t) */ static void fpsimd_flush_cpu_state(void) { + WARN_ON(!system_supports_fpsimd()); __this_cpu_write(fpsimd_last_state.st, NULL); + + /* + * Leaving streaming mode enabled will cause issues for any kernel + * NEON and leaving streaming mode or ZA enabled may increase power + * consumption. + */ + if (system_supports_sme()) + sme_smstop(); + set_thread_flag(TIF_FOREIGN_FPSTATE); } @@ -1203,11 +1835,15 @@ static void fpsimd_flush_cpu_state(void) */ void fpsimd_save_and_flush_cpu_state(void) { + unsigned long flags; + + if (!system_supports_fpsimd()) + return; WARN_ON(preemptible()); - __get_cpu_fpsimd_context(); - fpsimd_save(); + local_irq_save(flags); + fpsimd_save_user_state(); fpsimd_flush_cpu_state(); - __put_cpu_fpsimd_context(); + local_irq_restore(flags); } #ifdef CONFIG_KERNEL_MODE_NEON @@ -1239,12 +1875,39 @@ void kernel_neon_begin(void) get_cpu_fpsimd_context(); /* Save unsaved fpsimd state, if any: */ - fpsimd_save(); + if (test_thread_flag(TIF_KERNEL_FPSTATE)) { + BUG_ON(IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()); + fpsimd_save_kernel_state(current); + } else { + fpsimd_save_user_state(); + + /* + * Set the thread flag so that the kernel mode FPSIMD state + * will be context switched along with the rest of the task + * state. + * + * On non-PREEMPT_RT, softirqs may interrupt task level kernel + * mode FPSIMD, but the task will not be preemptible so setting + * TIF_KERNEL_FPSTATE for those would be both wrong (as it + * would mark the task context FPSIMD state as requiring a + * context switch) and unnecessary. + * + * On PREEMPT_RT, softirqs are serviced from a separate thread, + * which is scheduled as usual, and this guarantees that these + * softirqs are not interrupting use of the FPSIMD in kernel + * mode in task context. So in this case, setting the flag here + * is always appropriate. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()) + set_thread_flag(TIF_KERNEL_FPSTATE); + } /* Invalidate any task state remaining in the fpsimd regs: */ fpsimd_flush_cpu_state(); + + put_cpu_fpsimd_context(); } -EXPORT_SYMBOL(kernel_neon_begin); +EXPORT_SYMBOL_GPL(kernel_neon_begin); /* * kernel_neon_end(): give the CPU FPSIMD registers back to the current task @@ -1260,15 +1923,25 @@ void kernel_neon_end(void) if (!system_supports_fpsimd()) return; - put_cpu_fpsimd_context(); + /* + * If we are returning from a nested use of kernel mode FPSIMD, restore + * the task context kernel mode FPSIMD state. This can only happen when + * running in softirq context on non-PREEMPT_RT. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && in_serving_softirq() && + test_thread_flag(TIF_KERNEL_FPSTATE)) + fpsimd_load_kernel_state(current); + else + clear_thread_flag(TIF_KERNEL_FPSTATE); } -EXPORT_SYMBOL(kernel_neon_end); +EXPORT_SYMBOL_GPL(kernel_neon_end); #ifdef CONFIG_EFI static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); static DEFINE_PER_CPU(bool, efi_sve_state_used); +static DEFINE_PER_CPU(bool, efi_sm_state); /* * EFI runtime services support functions @@ -1303,11 +1976,33 @@ void __efi_fpsimd_begin(void) */ if (system_supports_sve() && likely(efi_sve_state)) { char *sve_state = this_cpu_ptr(efi_sve_state); + bool ffr = true; + u64 svcr; __this_cpu_write(efi_sve_state_used, true); - sve_save_state(sve_state + sve_ffr_offset(sve_max_vl), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr); + if (system_supports_sme()) { + svcr = read_sysreg_s(SYS_SVCR); + + __this_cpu_write(efi_sm_state, + svcr & SVCR_SM_MASK); + + /* + * Unless we have FA64 FFR does not + * exist in streaming mode. + */ + if (!system_supports_fa64()) + ffr = !(svcr & SVCR_SM_MASK); + } + + sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()), + &this_cpu_ptr(&efi_fpsimd_state)->fpsr, + ffr); + + if (system_supports_sme()) + sysreg_clear_set_s(SYS_SVCR, + SVCR_SM_MASK, 0); + } else { fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); } @@ -1330,10 +2025,31 @@ void __efi_fpsimd_end(void) if (system_supports_sve() && likely(__this_cpu_read(efi_sve_state_used))) { char const *sve_state = this_cpu_ptr(efi_sve_state); + bool ffr = true; + + /* + * Restore streaming mode; EFI calls are + * normal function calls so should not return in + * streaming mode. + */ + if (system_supports_sme()) { + if (__this_cpu_read(efi_sm_state)) { + sysreg_clear_set_s(SYS_SVCR, + 0, + SVCR_SM_MASK); + + /* + * Unless we have FA64 FFR does not + * exist in streaming mode. + */ + if (!system_supports_fa64()) + ffr = false; + } + } - sve_load_state(sve_state + sve_ffr_offset(sve_max_vl), + sve_load_state(sve_state + sve_ffr_offset(sve_max_vl()), &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - sve_vq_from_vl(sve_get_vl()) - 1); + ffr); __this_cpu_write(efi_sve_state_used, false); } else { @@ -1393,6 +2109,13 @@ static inline void fpsimd_hotplug_init(void) static inline void fpsimd_hotplug_init(void) { } #endif +void cpu_enable_fpsimd(const struct arm64_cpu_capabilities *__always_unused p) +{ + unsigned long enable = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN; + write_sysreg(read_sysreg(CPACR_EL1) | enable, CPACR_EL1); + isb(); +} + /* * FP/SIMD support code initialisation. */ @@ -1408,6 +2131,10 @@ static int __init fpsimd_init(void) if (!cpu_have_named_feature(ASIMD)) pr_notice("Advanced SIMD is not implemented\n"); - return sve_sysctl_init(); + + sve_sysctl_init(); + sme_sysctl_init(); + + return 0; } core_initcall(fpsimd_init); diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 8618faa82e6d..a650f5e11fc5 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -15,8 +15,134 @@ #include <asm/debug-monitors.h> #include <asm/ftrace.h> #include <asm/insn.h> +#include <asm/patching.h> + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS +struct fregs_offset { + const char *name; + int offset; +}; + +#define FREGS_OFFSET(n, field) \ +{ \ + .name = n, \ + .offset = offsetof(struct ftrace_regs, field), \ +} + +static const struct fregs_offset fregs_offsets[] = { + FREGS_OFFSET("x0", regs[0]), + FREGS_OFFSET("x1", regs[1]), + FREGS_OFFSET("x2", regs[2]), + FREGS_OFFSET("x3", regs[3]), + FREGS_OFFSET("x4", regs[4]), + FREGS_OFFSET("x5", regs[5]), + FREGS_OFFSET("x6", regs[6]), + FREGS_OFFSET("x7", regs[7]), + FREGS_OFFSET("x8", regs[8]), + + FREGS_OFFSET("x29", fp), + FREGS_OFFSET("x30", lr), + FREGS_OFFSET("lr", lr), + + FREGS_OFFSET("sp", sp), + FREGS_OFFSET("pc", pc), +}; + +int ftrace_regs_query_register_offset(const char *name) +{ + for (int i = 0; i < ARRAY_SIZE(fregs_offsets); i++) { + const struct fregs_offset *roff = &fregs_offsets[i]; + if (!strcmp(roff->name, name)) + return roff->offset; + } + + return -EINVAL; +} +#endif + +unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* + * When using mcount, addr is the address of the mcount call + * instruction, and no adjustment is necessary. + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS)) + return addr; + + /* + * When using patchable-function-entry without pre-function NOPS, addr + * is the address of the first NOP after the function entry point. + * + * The compiler has either generated: + * + * addr+00: func: NOP // To be patched to MOV X9, LR + * addr+04: NOP // To be patched to BL <caller> + * + * Or: + * + * addr-04: BTI C + * addr+00: func: NOP // To be patched to MOV X9, LR + * addr+04: NOP // To be patched to BL <caller> + * + * We must adjust addr to the address of the NOP which will be patched + * to `BL <caller>`, which is at `addr + 4` bytes in either case. + * + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return addr + AARCH64_INSN_SIZE; + + /* + * When using patchable-function-entry with pre-function NOPs, addr is + * the address of the first pre-function NOP. + * + * Starting from an 8-byte aligned base, the compiler has either + * generated: + * + * addr+00: NOP // Literal (first 32 bits) + * addr+04: NOP // Literal (last 32 bits) + * addr+08: func: NOP // To be patched to MOV X9, LR + * addr+12: NOP // To be patched to BL <caller> + * + * Or: + * + * addr+00: NOP // Literal (first 32 bits) + * addr+04: NOP // Literal (last 32 bits) + * addr+08: func: BTI C + * addr+12: NOP // To be patched to MOV X9, LR + * addr+16: NOP // To be patched to BL <caller> + * + * We must adjust addr to the address of the NOP which will be patched + * to `BL <caller>`, which is at either addr+12 or addr+16 depending on + * whether there is a BTI. + */ + + if (!IS_ALIGNED(addr, sizeof(unsigned long))) { + WARN_RATELIMIT(1, "Misaligned patch-site %pS\n", + (void *)(addr + 8)); + return 0; + } + + /* Skip the NOPs placed before the function entry point */ + addr += 2 * AARCH64_INSN_SIZE; + + /* Skip any BTI */ + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) { + u32 insn = le32_to_cpu(*(__le32 *)addr); + + if (aarch64_insn_is_bti(insn)) { + addr += AARCH64_INSN_SIZE; + } else if (insn != aarch64_insn_gen_nop()) { + WARN_RATELIMIT(1, "unexpected insn in patch-site %pS: 0x%08x\n", + (void *)addr, insn); + } + } + + /* Skip the first NOP after function entry */ + addr += AARCH64_INSN_SIZE; + + return addr; +} -#ifdef CONFIG_DYNAMIC_FTRACE /* * Replace a single instruction, which may be a branch or NOP. * If @validate == true, a replaced instruction is checked against 'old'. @@ -55,80 +181,181 @@ int ftrace_update_ftrace_func(ftrace_func_t func) unsigned long pc; u32 new; - pc = (unsigned long)&ftrace_call; + /* + * When using CALL_OPS, the function to call is associated with the + * call site, and we don't have a global function pointer to update. + */ + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return 0; + + pc = (unsigned long)ftrace_call; new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, AARCH64_INSN_BRANCH_LINK); return ftrace_modify_code(pc, 0, new, false); } -static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr) +static struct plt_entry *get_ftrace_plt(struct module *mod) { -#ifdef CONFIG_ARM64_MODULE_PLTS +#ifdef CONFIG_MODULES struct plt_entry *plt = mod->arch.ftrace_trampolines; - if (addr == FTRACE_ADDR) - return &plt[FTRACE_PLT_IDX]; - if (addr == FTRACE_REGS_ADDR && IS_ENABLED(CONFIG_FTRACE_WITH_REGS)) - return &plt[FTRACE_REGS_PLT_IDX]; -#endif + return &plt[FTRACE_PLT_IDX]; +#else return NULL; +#endif +} + +static bool reachable_by_bl(unsigned long addr, unsigned long pc) +{ + long offset = (long)addr - (long)pc; + + return offset >= -SZ_128M && offset < SZ_128M; } /* - * Turn on the call to ftrace_caller() in instrumented function + * Find the address the callsite must branch to in order to reach '*addr'. + * + * Due to the limited range of 'BL' instructions, modules may be placed too far + * away to branch directly and must use a PLT. + * + * Returns true when '*addr' contains a reachable target address, or has been + * modified to contain a PLT address. Returns false otherwise. */ -int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, + struct module *mod, + unsigned long *addr) { unsigned long pc = rec->ip; - u32 old, new; - long offset = (long)pc - (long)addr; + struct plt_entry *plt; - if (offset < -SZ_128M || offset >= SZ_128M) { - struct module *mod; - struct plt_entry *plt; + /* + * If a custom trampoline is unreachable, rely on the ftrace_caller + * trampoline which knows how to indirectly reach that trampoline + * through ops->direct_call. + */ + if (*addr != FTRACE_ADDR && !reachable_by_bl(*addr, pc)) + *addr = FTRACE_ADDR; - if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) - return -EINVAL; + /* + * When the target is within range of the 'BL' instruction, use 'addr' + * as-is and branch to that directly. + */ + if (reachable_by_bl(*addr, pc)) + return true; - /* - * On kernels that support module PLTs, the offset between the - * branch instruction and its target may legally exceed the - * range of an ordinary relative 'bl' opcode. In this case, we - * need to branch via a trampoline in the module. - * - * NOTE: __module_text_address() must be called with preemption - * disabled, but we can rely on ftrace_lock to ensure that 'mod' - * retains its validity throughout the remainder of this code. - */ + /* + * When the target is outside of the range of a 'BL' instruction, we + * must use a PLT to reach it. We can only place PLTs for modules, and + * only when module PLT support is built-in. + */ + if (!IS_ENABLED(CONFIG_MODULES)) + return false; + + /* + * 'mod' is only set at module load time, but if we end up + * dealing with an out-of-range condition, we can assume it + * is due to a module being loaded far away from the kernel. + * + * NOTE: __module_text_address() must be called with preemption + * disabled, but we can rely on ftrace_lock to ensure that 'mod' + * retains its validity throughout the remainder of this code. + */ + if (!mod) { preempt_disable(); mod = __module_text_address(pc); preempt_enable(); + } - if (WARN_ON(!mod)) - return -EINVAL; + if (WARN_ON(!mod)) + return false; - plt = get_ftrace_plt(mod, addr); - if (!plt) { - pr_err("ftrace: no module PLT for %ps\n", (void *)addr); - return -EINVAL; - } + plt = get_ftrace_plt(mod); + if (!plt) { + pr_err("ftrace: no module PLT for %ps\n", (void *)*addr); + return false; + } + + *addr = (unsigned long)plt; + return true; +} - addr = (unsigned long)plt; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS +static const struct ftrace_ops *arm64_rec_get_ops(struct dyn_ftrace *rec) +{ + const struct ftrace_ops *ops = NULL; + + if (rec->flags & FTRACE_FL_CALL_OPS_EN) { + ops = ftrace_find_unique_ops(rec); + WARN_ON_ONCE(!ops); } + if (!ops) + ops = &ftrace_list_ops; + + return ops; +} + +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec, + const struct ftrace_ops *ops) +{ + unsigned long literal = ALIGN_DOWN(rec->ip - 12, 8); + return aarch64_insn_write_literal_u64((void *)literal, + (unsigned long)ops); +} + +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, &ftrace_nop_ops); +} + +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, arm64_rec_get_ops(rec)); +} +#else +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; } +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; } +#endif + +/* + * Turn on the call to ftrace_caller() in instrumented function + */ +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long pc = rec->ip; + u32 old, new; + int ret; + + ret = ftrace_rec_update_ops(rec); + if (ret) + return ret; + + if (!ftrace_find_callable_addr(rec, NULL, &addr)) + return -EINVAL; + old = aarch64_insn_gen_nop(); new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK); return ftrace_modify_code(pc, old, new, true); } -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, - unsigned long addr) + unsigned long addr) { unsigned long pc = rec->ip; u32 old, new; + int ret; + + ret = ftrace_rec_set_ops(rec, arm64_rec_get_ops(rec)); + if (ret) + return ret; + + if (!ftrace_find_callable_addr(rec, NULL, &old_addr)) + return -EINVAL; + if (!ftrace_find_callable_addr(rec, NULL, &addr)) + return -EINVAL; old = aarch64_insn_gen_branch_imm(pc, old_addr, AARCH64_INSN_BRANCH_LINK); @@ -136,7 +363,9 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, return ftrace_modify_code(pc, old, new, true); } +#endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS /* * The compiler has inserted two NOPs before the regular function prologue. * All instrumented functions follow the AAPCS, so x0-x8 and x19-x30 are live, @@ -152,7 +381,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, * | NOP | MOV X9, LR | MOV X9, LR | * | NOP | NOP | BL <entry> | * - * The LR value will be recovered by ftrace_regs_entry, and restored into LR + * The LR value will be recovered by ftrace_caller, and restored into LR * before returning to the regular function prologue. When a function is not * being traced, the MOV is not harmful given x9 is not live per the AAPCS. * @@ -163,6 +392,11 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) { unsigned long pc = rec->ip - AARCH64_INSN_SIZE; u32 old, new; + int ret; + + ret = ftrace_rec_set_nop_ops(rec); + if (ret) + return ret; old = aarch64_insn_gen_nop(); new = aarch64_insn_gen_move_reg(AARCH64_INSN_REG_9, @@ -179,54 +413,33 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { unsigned long pc = rec->ip; - bool validate = true; u32 old = 0, new; - long offset = (long)pc - (long)addr; - - if (offset < -SZ_128M || offset >= SZ_128M) { - u32 replaced; + int ret; - if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) - return -EINVAL; + new = aarch64_insn_gen_nop(); - /* - * 'mod' is only set at module load time, but if we end up - * dealing with an out-of-range condition, we can assume it - * is due to a module being loaded far away from the kernel. - */ - if (!mod) { - preempt_disable(); - mod = __module_text_address(pc); - preempt_enable(); - - if (WARN_ON(!mod)) - return -EINVAL; - } + ret = ftrace_rec_set_nop_ops(rec); + if (ret) + return ret; - /* - * The instruction we are about to patch may be a branch and - * link instruction that was redirected via a PLT entry. In - * this case, the normal validation will fail, but we can at - * least check that we are dealing with a branch and link - * instruction that points into the right module. - */ - if (aarch64_insn_read((void *)pc, &replaced)) - return -EFAULT; - - if (!aarch64_insn_is_bl(replaced) || - !within_module(pc + aarch64_get_branch_offset(replaced), - mod)) - return -EINVAL; + /* + * When using mcount, callsites in modules may have been initalized to + * call an arbitrary module PLT (which redirects to the _mcount stub) + * rather than the ftrace PLT we'll use at runtime (which redirects to + * the ftrace trampoline). We can ignore the old PLT when initializing + * the callsite. + * + * Note: 'mod' is only set at module load time. + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) && mod) + return aarch64_insn_patch_text_nosync((void *)pc, new); - validate = false; - } else { - old = aarch64_insn_gen_branch_imm(pc, addr, - AARCH64_INSN_BRANCH_LINK); - } + if (!ftrace_find_callable_addr(rec, mod, &addr)) + return -EINVAL; - new = aarch64_insn_gen_nop(); + old = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK); - return ftrace_modify_code(pc, old, new, validate); + return ftrace_modify_code(pc, old, new, true); } void arch_ftrace_update_code(int command) @@ -235,20 +448,12 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} -#endif /* CONFIG_DYNAMIC_FTRACE */ - #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* * function_graph tracer expects ftrace_return_to_handler() to be called * on the way back to parent. For this purpose, this function is called * in _mcount() or ftrace_caller() to replace return address (*parent) on * the call stack to return_to_handler. - * - * Note that @frame_pointer is used only for sanity check later. */ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long frame_pointer) @@ -266,11 +471,19 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, */ old = *parent; - if (!function_graph_enter(old, self_addr, frame_pointer, NULL)) + if (!function_graph_enter(old, self_addr, frame_pointer, + (void *)frame_pointer)) { *parent = return_hooker; + } } -#ifdef CONFIG_DYNAMIC_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + prepare_ftrace_return(ip, &fregs->lr, fregs->fp); +} +#else /* * Turn on/off the call to ftrace_graph_caller() in ftrace_caller() * depending on @enable. @@ -300,5 +513,5 @@ int ftrace_disable_ftrace_graph_caller(void) { return ftrace_modify_graph_caller(false); } -#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 989b1944cb71..cab7f91949d8 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -11,22 +11,25 @@ #include <linux/linkage.h> #include <linux/init.h> -#include <linux/irqchip/arm-gic-v3.h> +#include <linux/pgtable.h> +#include <asm/asm_pointer_auth.h> #include <asm/assembler.h> #include <asm/boot.h> +#include <asm/bug.h> #include <asm/ptrace.h> #include <asm/asm-offsets.h> #include <asm/cache.h> #include <asm/cputype.h> +#include <asm/el2_setup.h> #include <asm/elf.h> #include <asm/image.h> #include <asm/kernel-pgtable.h> #include <asm/kvm_arm.h> #include <asm/memory.h> #include <asm/pgtable-hwdef.h> -#include <asm/pgtable.h> #include <asm/page.h> +#include <asm/scs.h> #include <asm/smp.h> #include <asm/sysreg.h> #include <asm/thread_info.h> @@ -34,14 +37,8 @@ #include "efi-header.S" -#define __PHYS_OFFSET (KERNEL_START - TEXT_OFFSET) - -#if (TEXT_OFFSET & 0xfff) != 0 -#error TEXT_OFFSET must be at least 4KB aligned -#elif (PAGE_OFFSET & 0x1fffff) != 0 +#if (PAGE_OFFSET & 0x1fffff) != 0 #error PAGE_OFFSET must be at least 2MB aligned -#elif TEXT_OFFSET > 0x1fffff -#error TEXT_OFFSET must be less than 2MB #endif /* @@ -52,115 +49,144 @@ * MMU = off, D-cache = off, I-cache = on or off, * x0 = physical address to the FDT blob. * - * This code is mostly position independent so you call this at - * __pa(PAGE_OFFSET + TEXT_OFFSET). - * * Note that the callee-saved registers are used for storing variables * that are useful before the MMU is enabled. The allocations are described * in the entry routines. */ __HEAD -_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ -#ifdef CONFIG_EFI - /* - * This add instruction has no meaningful effect except that - * its opcode forms the magic "MZ" signature required by UEFI. - */ - add x13, x18, #0x16 - b stext -#else - b stext // branch to kernel start, magic - .long 0 // reserved -#endif - le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian + efi_signature_nop // special NOP to identity as PE/COFF executable + b primary_entry // branch to kernel start, magic + .quad 0 // Image load offset from start of RAM, little-endian le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved .quad 0 // reserved .quad 0 // reserved .ascii ARM64_IMAGE_MAGIC // Magic number -#ifdef CONFIG_EFI - .long pe_header - _head // Offset to the PE header. + .long .Lpe_header_offset // Offset to the PE header. -pe_header: __EFI_PE_HEADER -#else - .long 0 // reserved -#endif - __INIT + .section ".idmap.text","a" /* * The following callee saved general purpose registers are used on the * primary lowlevel boot path: * * Register Scope Purpose - * x21 stext() .. start_kernel() FDT pointer passed at boot in x0 - * x23 stext() .. start_kernel() physical misalignment/KASLR offset - * x28 __create_page_tables() callee preserved temp register - * x19/x20 __primary_switch() callee preserved temp registers - * x24 __primary_switch() .. relocate_kernel() - * current RELR displacement + * x19 primary_entry() .. start_kernel() whether we entered with the MMU on + * x20 primary_entry() .. __primary_switch() CPU boot mode + * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 + * x22 create_idmap() .. start_kernel() ID map VA of the DT blob + * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset + * x24 __primary_switch() linear map KASLR seed + * x25 primary_entry() .. start_kernel() supported VA size + * x28 create_idmap() callee preserved temp register */ -ENTRY(stext) +SYM_CODE_START(primary_entry) + bl record_mmu_state bl preserve_boot_args - bl el2_setup // Drop to EL1, w0=cpu_boot_mode - adrp x23, __PHYS_OFFSET - and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 - bl set_cpu_boot_mode_flag - bl __create_page_tables + bl create_idmap + + /* + * If we entered with the MMU and caches on, clean the ID mapped part + * of the primary boot code to the PoC so we can safely execute it with + * the MMU off. + */ + cbz x19, 0f + adrp x0, __idmap_text_start + adr_l x1, __idmap_text_end + adr_l x2, dcache_clean_poc + blr x2 +0: mov x0, x19 + bl init_kernel_el // w0=cpu_boot_mode + mov x20, x0 + /* * The following calls CPU setup code, see arch/arm64/mm/proc.S for * details. * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ +#if VA_BITS > 48 + mrs_s x0, SYS_ID_AA64MMFR2_EL1 + tst x0, ID_AA64MMFR2_EL1_VARange_MASK + mov x0, #VA_BITS + mov x25, #VA_BITS_MIN + csel x25, x25, x0, eq + mov x0, x25 +#endif bl __cpu_setup // initialise processor b __primary_switch -ENDPROC(stext) +SYM_CODE_END(primary_entry) + + __INIT +SYM_CODE_START_LOCAL(record_mmu_state) + mrs x19, CurrentEL + cmp x19, #CurrentEL_EL2 + mrs x19, sctlr_el1 + b.ne 0f + mrs x19, sctlr_el2 +0: +CPU_LE( tbnz x19, #SCTLR_ELx_EE_SHIFT, 1f ) +CPU_BE( tbz x19, #SCTLR_ELx_EE_SHIFT, 1f ) + tst x19, #SCTLR_ELx_C // Z := (C == 0) + and x19, x19, #SCTLR_ELx_M // isolate M bit + csel x19, xzr, x19, eq // clear x19 if Z + ret + + /* + * Set the correct endianness early so all memory accesses issued + * before init_kernel_el() occur in the correct byte order. Note that + * this means the MMU must be disabled, or the active ID map will end + * up getting interpreted with the wrong byte order. + */ +1: eor x19, x19, #SCTLR_ELx_EE + bic x19, x19, #SCTLR_ELx_M + b.ne 2f + pre_disable_mmu_workaround + msr sctlr_el2, x19 + b 3f +2: pre_disable_mmu_workaround + msr sctlr_el1, x19 +3: isb + mov x19, xzr + ret +SYM_CODE_END(record_mmu_state) /* * Preserve the arguments passed by the bootloader in x0 .. x3 */ -preserve_boot_args: +SYM_CODE_START_LOCAL(preserve_boot_args) mov x21, x0 // x21=FDT adr_l x0, boot_args // record the contents of stp x21, x1, [x0] // x0 .. x3 at kernel entry stp x2, x3, [x0, #16] + cbnz x19, 0f // skip cache invalidation if MMU is on dmb sy // needed before dc ivac with // MMU off - mov x1, #0x20 // 4 x 8 bytes - b __inval_dcache_area // tail call -ENDPROC(preserve_boot_args) + add x1, x0, #0x20 // 4 x 8 bytes + b dcache_inval_poc // tail call +0: str_l x19, mmu_enabled_at_boot, x0 + ret +SYM_CODE_END(preserve_boot_args) -/* - * Macro to create a table entry to the next page. - * - * tbl: page table address - * virt: virtual address - * shift: #imm page table shift - * ptrs: #imm pointers per table page - * - * Preserves: virt - * Corrupts: ptrs, tmp1, tmp2 - * Returns: tbl -> next level table page address - */ - .macro create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2 - add \tmp1, \tbl, #PAGE_SIZE - phys_to_pte \tmp2, \tmp1 - orr \tmp2, \tmp2, #PMD_TYPE_TABLE // address of next table and entry type - lsr \tmp1, \virt, #\shift - sub \ptrs, \ptrs, #1 - and \tmp1, \tmp1, \ptrs // table index - str \tmp2, [\tbl, \tmp1, lsl #3] - add \tbl, \tbl, #PAGE_SIZE // next level table page - .endm +SYM_FUNC_START_LOCAL(clear_page_tables) + /* + * Clear the init page tables. + */ + adrp x0, init_pg_dir + adrp x1, init_pg_end + sub x2, x1, x0 + mov x1, xzr + b __pi_memset // tail call +SYM_FUNC_END(clear_page_tables) /* * Macro to populate page table entries, these entries can be pointers to the next level @@ -194,33 +220,22 @@ ENDPROC(preserve_boot_args) * to be composed of multiple pages. (This effectively scales the end index). * * vstart: virtual address of start of range - * vend: virtual address of end of range + * vend: virtual address of end of range - we map [vstart, vend] * shift: shift used to transform virtual address into index - * ptrs: number of entries in page table + * order: #imm 2log(number of entries in page table) * istart: index in table corresponding to vstart * iend: index in table corresponding to vend * count: On entry: how many extra entries were required in previous level, scales * our end index. * On exit: returns how many extra entries required for next page table level * - * Preserves: vstart, vend, shift, ptrs + * Preserves: vstart, vend * Returns: istart, iend, count */ - .macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count - lsr \iend, \vend, \shift - mov \istart, \ptrs - sub \istart, \istart, #1 - and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1) - mov \istart, \ptrs - mul \istart, \istart, \count - add \iend, \iend, \istart // iend += (count - 1) * ptrs - // our entries span multiple tables - - lsr \istart, \vstart, \shift - mov \count, \ptrs - sub \count, \count, #1 - and \istart, \istart, \count - + .macro compute_indices, vstart, vend, shift, order, istart, iend, count + ubfx \istart, \vstart, \shift, \order + ubfx \iend, \vend, \shift, \order + add \iend, \iend, \count, lsl \order sub \count, \iend, \istart .endm @@ -231,123 +246,120 @@ ENDPROC(preserve_boot_args) * * tbl: location of page table * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) - * vstart: start address to map - * vend: end address to map - we map [vstart, vend] + * vstart: virtual address of start of range + * vend: virtual address of end of range - we map [vstart, vend - 1] * flags: flags to use to map last level entries * phys: physical address corresponding to vstart - physical memory is contiguous - * pgds: the number of pgd entries + * order: #imm 2log(number of entries in PGD table) + * + * If extra_shift is set, an extra level will be populated if the end address does + * not fit in 'extra_shift' bits. This assumes vend is in the TTBR0 range. * * Temporaries: istart, iend, tmp, count, sv - these need to be different registers - * Preserves: vstart, vend, flags - * Corrupts: tbl, rtbl, istart, iend, tmp, count, sv + * Preserves: vstart, flags + * Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv */ - .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv + .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv, extra_shift + sub \vend, \vend, #1 add \rtbl, \tbl, #PAGE_SIZE - mov \sv, \rtbl mov \count, #0 - compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count + + .ifnb \extra_shift + tst \vend, #~((1 << (\extra_shift)) - 1) + b.eq .L_\@ + compute_indices \vstart, \vend, #\extra_shift, #(PAGE_SHIFT - 3), \istart, \iend, \count + mov \sv, \rtbl populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp mov \tbl, \sv + .endif +.L_\@: + compute_indices \vstart, \vend, #PGDIR_SHIFT, #\order, \istart, \iend, \count mov \sv, \rtbl + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv #if SWAPPER_PGTABLE_LEVELS > 3 - compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count + compute_indices \vstart, \vend, #PUD_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + mov \sv, \rtbl populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp mov \tbl, \sv - mov \sv, \rtbl #endif #if SWAPPER_PGTABLE_LEVELS > 2 - compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count + compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + mov \sv, \rtbl populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp mov \tbl, \sv #endif - compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count - bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1 - populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp + compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + bic \rtbl, \phys, #SWAPPER_BLOCK_SIZE - 1 + populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp .endm /* - * Setup the initial page tables. We only setup the barest amount which is - * required to get the kernel running. The following sections are required: - * - identity mapping to enable the MMU (low address, TTBR0) - * - first few MB of the kernel linear mapping to jump to once the MMU has - * been enabled + * Remap a subregion created with the map_memory macro with modified attributes + * or output address. The entire remapped region must have been covered in the + * invocation of map_memory. + * + * x0: last level table address (returned in first argument to map_memory) + * x1: start VA of the existing mapping + * x2: start VA of the region to update + * x3: end VA of the region to update (exclusive) + * x4: start PA associated with the region to update + * x5: attributes to set on the updated region + * x6: order of the last level mappings */ -__create_page_tables: - mov x28, lr +SYM_FUNC_START_LOCAL(remap_region) + sub x3, x3, #1 // make end inclusive - /* - * Invalidate the init page tables to avoid potential dirty cache lines - * being evicted. Other page tables are allocated in rodata as part of - * the kernel image, and thus are clean to the PoC per the boot - * protocol. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - sub x1, x1, x0 - bl __inval_dcache_area + // Get the index offset for the start of the last level table + lsr x1, x1, x6 + bfi x1, xzr, #0, #PAGE_SHIFT - 3 - /* - * Clear the init page tables. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - sub x1, x1, x0 -1: stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - subs x1, x1, #64 - b.ne 1b + // Derive the start and end indexes into the last level table + // associated with the provided region + lsr x2, x2, x6 + lsr x3, x3, x6 + sub x2, x2, x1 + sub x3, x3, x1 - mov x7, SWAPPER_MM_MMUFLAGS + mov x1, #1 + lsl x6, x1, x6 // block size at this level - /* - * Create the identity mapping. - */ - adrp x0, idmap_pg_dir - adrp x3, __idmap_text_start // __pa(__idmap_text_start) - -#ifdef CONFIG_ARM64_VA_BITS_52 - mrs_s x6, SYS_ID_AA64MMFR2_EL1 - and x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT) - mov x5, #52 - cbnz x6, 1f -#endif - mov x5, #VA_BITS_MIN -1: - adr_l x6, vabits_actual - str x5, [x6] - dmb sy - dc ivac, x6 // Invalidate potentially stale cache line + populate_entries x0, x4, x2, x3, x5, x6, x7 + ret +SYM_FUNC_END(remap_region) +SYM_FUNC_START_LOCAL(create_idmap) + mov x28, lr /* - * VA_BITS may be too small to allow for an ID mapping to be created - * that covers system RAM if that is located sufficiently high in the - * physical address space. So for the ID map, use an extended virtual - * range in that case, and configure an additional translation level - * if needed. + * The ID map carries a 1:1 mapping of the physical address range + * covered by the loaded image, which could be anywhere in DRAM. This + * means that the required size of the VA (== PA) space is decided at + * boot time, and could be more than the configured size of the VA + * space for ordinary kernel and user space mappings. + * + * There are three cases to consider here: + * - 39 <= VA_BITS < 48, and the ID map needs up to 48 VA bits to cover + * the placement of the image. In this case, we configure one extra + * level of translation on the fly for the ID map only. (This case + * also covers 42-bit VA/52-bit PA on 64k pages). + * + * - VA_BITS == 48, and the ID map needs more than 48 VA bits. This can + * only happen when using 64k pages, in which case we need to extend + * the root level table rather than add a level. Note that we can + * treat this case as 'always extended' as long as we take care not + * to program an unsupported T0SZ value into the TCR register. * - * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the - * entire ID map region can be mapped. As T0SZ == (64 - #bits used), - * this number conveniently equals the number of leading zeroes in - * the physical address of __idmap_text_end. + * - Combinations that would require two additional levels of + * translation are not supported, e.g., VA_BITS==36 on 16k pages, or + * VA_BITS==39/4k pages with 5-level paging, where the input address + * requires more than 47 or 48 bits, respectively. */ - adrp x5, __idmap_text_end - clz x5, x5 - cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough? - b.ge 1f // .. then skip VA range extension - - adr_l x6, idmap_t0sz - str x5, [x6] - dmb sy - dc ivac, x6 // Invalidate potentially stale cache line - #if (VA_BITS < 48) +#define IDMAP_PGD_ORDER (VA_BITS - PGDIR_SHIFT) #define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) -#define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT)) /* * If VA_BITS < 48, we have to configure an additional table level. @@ -359,77 +371,124 @@ __create_page_tables: #if VA_BITS != EXTRA_SHIFT #error "Mismatch between VA_BITS and page size/number of translation levels" #endif - - mov x4, EXTRA_PTRS - create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6 #else +#define IDMAP_PGD_ORDER (PHYS_MASK_SHIFT - PGDIR_SHIFT) +#define EXTRA_SHIFT /* * If VA_BITS == 48, we don't have to configure an additional * translation level, but the top-level table has more entries. */ - mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT) - str_l x4, idmap_ptrs_per_pgd, x5 #endif -1: - ldr_l x4, idmap_ptrs_per_pgd - mov x5, x3 // __pa(__idmap_text_start) - adr_l x6, __idmap_text_end // __pa(__idmap_text_end) - - map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14 + adrp x0, init_idmap_pg_dir + adrp x3, _text + adrp x6, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE + mov_q x7, SWAPPER_RX_MMUFLAGS + + map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT + + /* Remap the kernel page tables r/w in the ID map */ + adrp x1, _text + adrp x2, init_pg_dir + adrp x3, init_pg_end + bic x4, x2, #SWAPPER_BLOCK_SIZE - 1 + mov_q x5, SWAPPER_RW_MMUFLAGS + mov x6, #SWAPPER_BLOCK_SHIFT + bl remap_region + + /* Remap the FDT after the kernel image */ + adrp x1, _text + adrp x22, _end + SWAPPER_BLOCK_SIZE + bic x2, x22, #SWAPPER_BLOCK_SIZE - 1 + bfi x22, x21, #0, #SWAPPER_BLOCK_SHIFT // remapped FDT address + add x3, x2, #MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE + bic x4, x21, #SWAPPER_BLOCK_SIZE - 1 + mov_q x5, SWAPPER_RW_MMUFLAGS + mov x6, #SWAPPER_BLOCK_SHIFT + bl remap_region /* - * Map the kernel image (starting with PHYS_OFFSET). + * Since the page tables have been populated with non-cacheable + * accesses (MMU disabled), invalidate those tables again to + * remove any speculatively loaded cache lines. */ + cbnz x19, 0f // skip cache invalidation if MMU is on + dmb sy + + adrp x0, init_idmap_pg_dir + adrp x1, init_idmap_pg_end + bl dcache_inval_poc +0: ret x28 +SYM_FUNC_END(create_idmap) + +SYM_FUNC_START_LOCAL(create_kernel_mapping) adrp x0, init_pg_dir - mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text) + mov_q x5, KIMAGE_VADDR // compile time __va(_text) +#ifdef CONFIG_RELOCATABLE add x5, x5, x23 // add KASLR displacement - mov x4, PTRS_PER_PGD +#endif adrp x6, _end // runtime __pa(_end) adrp x3, _text // runtime __pa(_text) sub x6, x6, x3 // _end - _text add x6, x6, x5 // runtime __va(_end) + mov_q x7, SWAPPER_RW_MMUFLAGS + + map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 - map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14 + dsb ishst // sync with page table walker + ret +SYM_FUNC_END(create_kernel_mapping) /* - * Since the page tables have been populated with non-cacheable - * accesses (MMU disabled), invalidate the idmap and swapper page - * tables again to remove any speculatively loaded cache lines. + * Initialize CPU registers with task-specific and cpu-specific context. + * + * Create a final frame record at task_pt_regs(current)->stackframe, so + * that the unwinder can identify the final frame record of any task by + * its location in the task stack. We reserve the entire pt_regs space + * for consistency with user tasks and kthreads. */ - adrp x0, idmap_pg_dir - adrp x1, init_pg_end - sub x1, x1, x0 - dmb sy - bl __inval_dcache_area + .macro init_cpu_task tsk, tmp1, tmp2 + msr sp_el0, \tsk + + ldr \tmp1, [\tsk, #TSK_STACK] + add sp, \tmp1, #THREAD_SIZE + sub sp, sp, #PT_REGS_SIZE + + stp xzr, xzr, [sp, #S_STACKFRAME] + add x29, sp, #S_STACKFRAME + + scs_load_current - ret x28 -ENDPROC(__create_page_tables) - .ltorg + adr_l \tmp1, __per_cpu_offset + ldr w\tmp2, [\tsk, #TSK_TI_CPU] + ldr \tmp1, [\tmp1, \tmp2, lsl #3] + set_this_cpu_offset \tmp1 + .endm /* * The following fragment of code is executed with the MMU enabled. * - * x0 = __PHYS_OFFSET + * x0 = __pa(KERNEL_START) */ -__primary_switched: - adrp x4, init_thread_union - add sp, x4, #THREAD_SIZE - adr_l x5, init_task - msr sp_el0, x5 // Save thread_info +SYM_FUNC_START_LOCAL(__primary_switched) + adr_l x4, init_task + init_cpu_task x4, x5, x6 adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb - stp xzr, x30, [sp, #-16]! + stp x29, x30, [sp, #-16]! mov x29, sp str_l x21, __fdt_pointer, x5 // Save FDT pointer - ldr_l x4, kimage_vaddr // Save the offset between + adrp x4, _text // Save the offset between sub x4, x4, x0 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings + mov x0, x20 + bl set_cpu_boot_mode_flag + // Clear BSS adr_l x0, __bss_start mov x1, xzr @@ -438,309 +497,209 @@ __primary_switched: bl __pi_memset dsb ishst // Make zero page visible to PTW -#ifdef CONFIG_KASAN - bl kasan_early_init +#if VA_BITS > 48 + adr_l x8, vabits_actual // Set this early so KASAN early init + str x25, [x8] // ... observes the correct value + dc civac, x8 // Make visible to booting secondaries #endif + #ifdef CONFIG_RANDOMIZE_BASE - tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? - b.ne 0f + adrp x5, memstart_offset_seed // Save KASLR linear map seed + strh w24, [x5, :lo12:memstart_offset_seed] +#endif +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + bl kasan_early_init +#endif mov x0, x21 // pass FDT address in x0 - bl kaslr_early_init // parse FDT for KASLR options - cbz x0, 0f // KASLR disabled? just proceed - orr x23, x23, x0 // record KASLR offset - ldp x29, x30, [sp], #16 // we must enable KASLR, return - ret // to __primary_switch() -0: + bl early_fdt_map // Try mapping the FDT early + mov x0, x20 // pass the full boot status + bl init_feature_override // Parse cpu feature overrides +#ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS + bl scs_patch_vmlinux #endif - add sp, sp, #16 - mov x29, #0 - mov x30, #0 - b start_kernel -ENDPROC(__primary_switched) + mov x0, x20 + bl finalise_el2 // Prefer VHE if possible + ldp x29, x30, [sp], #16 + bl start_kernel + ASM_BUG() +SYM_FUNC_END(__primary_switched) /* * end early head section, begin head code that is also used for * hotplug and needs to have the same protections as the text region */ - .section ".idmap.text","awx" - -ENTRY(kimage_vaddr) - .quad _text - TEXT_OFFSET -EXPORT_SYMBOL(kimage_vaddr) + .section ".idmap.text","a" /* - * If we're fortunate enough to boot at EL2, ensure that the world is - * sane before dropping to EL1. + * Starting from EL2 or EL1, configure the CPU to execute at the highest + * reachable EL supported by the kernel in a chosen default state. If dropping + * from EL2 to EL1, configure EL2 before configuring EL1. * - * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if - * booted in EL1 or EL2 respectively. + * Since we cannot always rely on ERET synchronizing writes to sysregs (e.g. if + * SCTLR_ELx.EOS is clear), we place an ISB prior to ERET. + * + * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in x0 if + * booted in EL1 or EL2 respectively, with the top 32 bits containing + * potential context flags. These flags are *not* stored in __boot_cpu_mode. + * + * x0: whether we are being called from the primary boot path with the MMU on */ -ENTRY(el2_setup) - msr SPsel, #1 // We want to use SP_EL{1,2} - mrs x0, CurrentEL - cmp x0, #CurrentEL_EL2 - b.eq 1f - mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) +SYM_FUNC_START(init_kernel_el) + mrs x1, CurrentEL + cmp x1, #CurrentEL_EL2 + b.eq init_el2 + +SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) + mov_q x0, INIT_SCTLR_EL1_MMU_OFF + pre_disable_mmu_workaround msr sctlr_el1, x0 - mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1 isb - ret - -1: mov_q x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) - msr sctlr_el2, x0 + mov_q x0, INIT_PSTATE_EL1 + msr spsr_el1, x0 + msr elr_el1, lr + mov w0, #BOOT_CPU_MODE_EL1 + eret -#ifdef CONFIG_ARM64_VHE - /* - * Check for VHE being present. For the rest of the EL2 setup, - * x2 being non-zero indicates that we do have VHE, and that the - * kernel is intended to run at EL2. - */ - mrs x2, id_aa64mmfr1_el1 - ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4 -#else - mov x2, xzr -#endif +SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) + msr elr_el2, lr - /* Hyp configuration. */ + // clean all HYP code to the PoC if we booted at EL2 with the MMU on + cbz x0, 0f + adrp x0, __hyp_idmap_text_start + adr_l x1, __hyp_text_end + adr_l x2, dcache_clean_poc + blr x2 +0: mov_q x0, HCR_HOST_NVHE_FLAGS - cbz x2, set_hcr - mov_q x0, HCR_HOST_VHE_FLAGS -set_hcr: msr hcr_el2, x0 isb - /* - * Allow Non-secure EL1 and EL0 to access physical timer and counter. - * This is not necessary for VHE, since the host kernel runs in EL2, - * and EL0 accesses are configured in the later stage of boot process. - * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout - * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined - * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1 - * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in - * EL2. - */ - cbnz x2, 1f - mrs x0, cnthctl_el2 - orr x0, x0, #3 // Enable EL1 physical timers - msr cnthctl_el2, x0 -1: - msr cntvoff_el2, xzr // Clear virtual offset - -#ifdef CONFIG_ARM_GIC_V3 - /* GICv3 system register access */ - mrs x0, id_aa64pfr0_el1 - ubfx x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4 - cbz x0, 3f - - mrs_s x0, SYS_ICC_SRE_EL2 - orr x0, x0, #ICC_SRE_EL2_SRE // Set ICC_SRE_EL2.SRE==1 - orr x0, x0, #ICC_SRE_EL2_ENABLE // Set ICC_SRE_EL2.Enable==1 - msr_s SYS_ICC_SRE_EL2, x0 - isb // Make sure SRE is now set - mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back, - tbz x0, #0, 3f // and check that it sticks - msr_s SYS_ICH_HCR_EL2, xzr // Reset ICC_HCR_EL2 to defaults - -3: -#endif + init_el2_state - /* Populate ID registers. */ - mrs x0, midr_el1 - mrs x1, mpidr_el1 - msr vpidr_el2, x0 - msr vmpidr_el2, x1 - -#ifdef CONFIG_COMPAT - msr hstr_el2, xzr // Disable CP15 traps to EL2 -#endif - - /* EL2 debug */ - mrs x1, id_aa64dfr0_el1 - sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4 - cmp x0, #1 - b.lt 4f // Skip if no PMU present - mrs x0, pmcr_el0 // Disable debug access traps - ubfx x0, x0, #11, #5 // to EL2 and allow access to -4: - csel x3, xzr, x0, lt // all PMU counters from EL1 - - /* Statistical profiling */ - ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 - cbz x0, 7f // Skip if SPE not present - cbnz x2, 6f // VHE? - mrs_s x4, SYS_PMBIDR_EL1 // If SPE available at EL2, - and x4, x4, #(1 << SYS_PMBIDR_EL1_P_SHIFT) - cbnz x4, 5f // then permit sampling of physical - mov x4, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \ - 1 << SYS_PMSCR_EL2_PA_SHIFT) - msr_s SYS_PMSCR_EL2, x4 // addresses and physical counter -5: - mov x1, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) - orr x3, x3, x1 // If we don't have VHE, then - b 7f // use EL1&0 translation. -6: // For VHE, use EL2 translation - orr x3, x3, #MDCR_EL2_TPMS // and disable access from EL1 -7: - msr mdcr_el2, x3 // Configure debug traps - - /* LORegions */ - mrs x1, id_aa64mmfr1_el1 - ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4 - cbz x0, 1f - msr_s SYS_LORC_EL1, xzr -1: - - /* Stage-2 translation */ - msr vttbr_el2, xzr - - cbz x2, install_el2_stub - - mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 + /* Hypervisor stub */ + adr_l x0, __hyp_stub_vectors + msr vbar_el2, x0 isb - ret -install_el2_stub: + mov_q x1, INIT_SCTLR_EL1_MMU_OFF + /* - * When VHE is not in use, early init of EL2 and EL1 needs to be - * done here. - * When VHE _is_ in use, EL1 will not be used in the host and - * requires no configuration, and all non-hyp-specific EL2 setup - * will be done via the _EL1 system register aliases in __cpu_setup. + * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, + * making it impossible to start in nVHE mode. Is that + * compliant with the architecture? Absolutely not! */ - mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) - msr sctlr_el1, x0 - - /* Coprocessor traps. */ - mov x0, #0x33ff - msr cptr_el2, x0 // Disable copro. traps to EL2 - - /* SVE register access */ - mrs x1, id_aa64pfr0_el1 - ubfx x1, x1, #ID_AA64PFR0_SVE_SHIFT, #4 - cbz x1, 7f + mrs x0, hcr_el2 + and x0, x0, #HCR_E2H + cbz x0, 1f - bic x0, x0, #CPTR_EL2_TZ // Also disable SVE traps - msr cptr_el2, x0 // Disable copro. traps to EL2 - isb - mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector - msr_s SYS_ZCR_EL2, x1 // length for EL1. + /* Set a sane SCTLR_EL1, the VHE way */ + pre_disable_mmu_workaround + msr_s SYS_SCTLR_EL12, x1 + mov x2, #BOOT_CPU_FLAG_E2H + b 2f - /* Hypervisor stub */ -7: adr_l x0, __hyp_stub_vectors - msr vbar_el2, x0 +1: + pre_disable_mmu_workaround + msr sctlr_el1, x1 + mov x2, xzr +2: + __init_el2_nvhe_prepare_eret - /* spsr */ - mov x0, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ - PSR_MODE_EL1h) - msr spsr_el2, x0 - msr elr_el2, lr - mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 + mov w0, #BOOT_CPU_MODE_EL2 + orr x0, x0, x2 eret -ENDPROC(el2_setup) - -/* - * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed - * in w0. See arch/arm64/include/asm/virt.h for more info. - */ -set_cpu_boot_mode_flag: - adr_l x1, __boot_cpu_mode - cmp w0, #BOOT_CPU_MODE_EL2 - b.ne 1f - add x1, x1, #4 -1: str w0, [x1] // This CPU has booted in EL1 - dmb sy - dc ivac, x1 // Invalidate potentially stale cache line - ret -ENDPROC(set_cpu_boot_mode_flag) - -/* - * These values are written with the MMU off, but read with the MMU on. - * Writers will invalidate the corresponding address, discarding up to a - * 'Cache Writeback Granule' (CWG) worth of data. The linker script ensures - * sufficient alignment that the CWG doesn't overlap another section. - */ - .pushsection ".mmuoff.data.write", "aw" -/* - * We need to find out the CPU boot mode long after boot, so we need to - * store it in a writable variable. - * - * This is not in .bss, because we set it sufficiently early that the boot-time - * zeroing of .bss would clobber it. - */ -ENTRY(__boot_cpu_mode) - .long BOOT_CPU_MODE_EL2 - .long BOOT_CPU_MODE_EL1 -/* - * The booting CPU updates the failed status @__early_cpu_boot_status, - * with MMU turned off. - */ -ENTRY(__early_cpu_boot_status) - .quad 0 - - .popsection +SYM_FUNC_END(init_kernel_el) /* * This provides a "holding pen" for platforms to hold all secondary * cores are held until we're ready for them to initialise. */ -ENTRY(secondary_holding_pen) - bl el2_setup // Drop to EL1, w0=cpu_boot_mode - bl set_cpu_boot_mode_flag - mrs x0, mpidr_el1 +SYM_FUNC_START(secondary_holding_pen) + mov x0, xzr + bl init_kernel_el // w0=cpu_boot_mode + mrs x2, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK - and x0, x0, x1 + and x2, x2, x1 adr_l x3, secondary_holding_pen_release pen: ldr x4, [x3] - cmp x4, x0 + cmp x4, x2 b.eq secondary_startup wfe b pen -ENDPROC(secondary_holding_pen) +SYM_FUNC_END(secondary_holding_pen) /* * Secondary entry point that jumps straight into the kernel. Only to * be used where CPUs are brought online dynamically by the kernel. */ -ENTRY(secondary_entry) - bl el2_setup // Drop to EL1 - bl set_cpu_boot_mode_flag +SYM_FUNC_START(secondary_entry) + mov x0, xzr + bl init_kernel_el // w0=cpu_boot_mode b secondary_startup -ENDPROC(secondary_entry) +SYM_FUNC_END(secondary_entry) -secondary_startup: +SYM_FUNC_START_LOCAL(secondary_startup) /* * Common entry point for secondary CPUs. */ + mov x20, x0 // preserve boot mode bl __cpu_secondary_check52bitva +#if VA_BITS > 48 + ldr_l x0, vabits_actual +#endif bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir + adrp x2, idmap_pg_dir bl __enable_mmu ldr x8, =__secondary_switched br x8 -ENDPROC(secondary_startup) +SYM_FUNC_END(secondary_startup) + + .text +SYM_FUNC_START_LOCAL(__secondary_switched) + mov x0, x20 + bl set_cpu_boot_mode_flag + + mov x0, x20 + bl finalise_el2 -__secondary_switched: + str_l xzr, __early_cpu_boot_status, x3 adr_l x5, vectors msr vbar_el1, x5 isb adr_l x0, secondary_data - ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack - cbz x1, __secondary_too_slow - mov sp, x1 ldr x2, [x0, #CPU_BOOT_TASK] cbz x2, __secondary_too_slow - msr sp_el0, x2 - mov x29, #0 - mov x30, #0 - b secondary_start_kernel -ENDPROC(__secondary_switched) -__secondary_too_slow: + init_cpu_task x2, x1, x3 + +#ifdef CONFIG_ARM64_PTR_AUTH + ptrauth_keys_init_cpu x2, x3, x4, x5 +#endif + + bl secondary_start_kernel + ASM_BUG() +SYM_FUNC_END(__secondary_switched) + +SYM_FUNC_START_LOCAL(__secondary_too_slow) wfe wfi b __secondary_too_slow -ENDPROC(__secondary_too_slow) +SYM_FUNC_END(__secondary_too_slow) + +/* + * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed + * in w0. See arch/arm64/include/asm/virt.h for more info. + */ +SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) + adr_l x1, __boot_cpu_mode + cmp w0, #BOOT_CPU_MODE_EL2 + b.ne 1f + add x1, x1, #4 +1: str w0, [x1] // Save CPU boot mode + ret +SYM_FUNC_END(set_cpu_boot_mode_flag) /* * The booting CPU updates the failed status @__early_cpu_boot_status, @@ -765,6 +724,7 @@ ENDPROC(__secondary_too_slow) * * x0 = SCTLR_EL1 value for turning on the MMU. * x1 = TTBR1_EL1 value + * x2 = ID map root table address * * Returns to the caller via x30/lr. This requires the caller to be covered * by the .idmap.text section. @@ -772,40 +732,31 @@ ENDPROC(__secondary_too_slow) * Checks if the selected granule size is supported by the CPU. * If it isn't, park the CPU */ -ENTRY(__enable_mmu) - mrs x2, ID_AA64MMFR0_EL1 - ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 - cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED - b.ne __no_granule_support - update_early_cpu_boot_status 0, x2, x3 - adrp x2, idmap_pg_dir - phys_to_ttbr x1, x1 + .section ".idmap.text","a" +SYM_FUNC_START(__enable_mmu) + mrs x3, ID_AA64MMFR0_EL1 + ubfx x3, x3, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 + cmp x3, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN + b.lt __no_granule_support + cmp x3, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX + b.gt __no_granule_support phys_to_ttbr x2, x2 msr ttbr0_el1, x2 // load TTBR0 - offset_ttbr1 x1, x3 - msr ttbr1_el1, x1 // load TTBR1 - isb - msr sctlr_el1, x0 - isb - /* - * Invalidate the local I-cache so that any instructions fetched - * speculatively from the PoC are discarded, since they may have - * been dynamically patched at the PoU. - */ - ic iallu - dsb nsh - isb + load_ttbr1 x1, x1, x3 + + set_sctlr_el1 x0 + ret -ENDPROC(__enable_mmu) +SYM_FUNC_END(__enable_mmu) -ENTRY(__cpu_secondary_check52bitva) -#ifdef CONFIG_ARM64_VA_BITS_52 +SYM_FUNC_START(__cpu_secondary_check52bitva) +#if VA_BITS > 48 ldr_l x0, vabits_actual cmp x0, #52 b.ne 2f mrs_s x0, SYS_ID_AA64MMFR2_EL1 - and x0, x0, #(0xf << ID_AA64MMFR2_LVA_SHIFT) + and x0, x0, ID_AA64MMFR2_EL1_VARange_MASK cbnz x0, 2f update_early_cpu_boot_status \ @@ -816,9 +767,9 @@ ENTRY(__cpu_secondary_check52bitva) #endif 2: ret -ENDPROC(__cpu_secondary_check52bitva) +SYM_FUNC_END(__cpu_secondary_check52bitva) -__no_granule_support: +SYM_FUNC_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_NO_GRAN, x1, x2 @@ -826,21 +777,18 @@ __no_granule_support: wfe wfi b 1b -ENDPROC(__no_granule_support) +SYM_FUNC_END(__no_granule_support) #ifdef CONFIG_RELOCATABLE -__relocate_kernel: +SYM_FUNC_START_LOCAL(__relocate_kernel) /* * Iterate over each entry in the relocation table, and apply the * relocations in place. */ - ldr w9, =__rela_offset // offset to reloc table - ldr w10, =__rela_size // size of reloc table - + adr_l x9, __rela_start + adr_l x10, __rela_end mov_q x11, KIMAGE_VADDR // default virtual offset add x11, x11, x23 // actual virtual offset - add x9, x9, x11 // __va(.rela) - add x10, x9, x10 // __va(.rela) + sizeof(.rela) 0: cmp x9, x10 b.hs 1f @@ -883,21 +831,9 @@ __relocate_kernel: * entry in x9, the address being relocated by the current address or * bitmap entry in x13 and the address being relocated by the current * bit in x14. - * - * Because addends are stored in place in the binary, RELR relocations - * cannot be applied idempotently. We use x24 to keep track of the - * currently applied displacement so that we can correctly relocate if - * __relocate_kernel is called twice with non-zero displacements (i.e. - * if there is both a physical misalignment and a KASLR displacement). */ - ldr w9, =__relr_offset // offset to reloc table - ldr w10, =__relr_size // size of reloc table - add x9, x9, x11 // __va(.relr) - add x10, x9, x10 // __va(.relr) + sizeof(.relr) - - sub x15, x23, x24 // delta from previous offset - cbz x15, 7f // nothing to do if unchanged - mov x24, x23 // save new offset + adr_l x9, __relr_start + adr_l x10, __relr_end 2: cmp x9, x10 b.hs 7f @@ -905,7 +841,7 @@ __relocate_kernel: tbnz x11, #0, 3f // branch to handle bitmaps add x13, x11, x23 ldr x12, [x13] // relocate address entry - add x12, x12, x15 + add x12, x12, x23 str x12, [x13], #8 // adjust to start of bitmap b 2b @@ -914,7 +850,7 @@ __relocate_kernel: cbz x11, 6f tbz x11, #0, 5f // skip bit if not set ldr x12, [x14] // relocate bit - add x12, x12, x15 + add x12, x12, x23 str x12, [x14] 5: add x14, x14, #8 // move to next bit's address @@ -931,50 +867,36 @@ __relocate_kernel: #endif ret -ENDPROC(__relocate_kernel) +SYM_FUNC_END(__relocate_kernel) #endif -__primary_switch: +SYM_FUNC_START_LOCAL(__primary_switch) + adrp x1, reserved_pg_dir + adrp x2, init_idmap_pg_dir + bl __enable_mmu +#ifdef CONFIG_RELOCATABLE + adrp x23, KERNEL_START + and x23, x23, MIN_KIMG_ALIGN - 1 #ifdef CONFIG_RANDOMIZE_BASE - mov x19, x0 // preserve new SCTLR_EL1 value - mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value + mov x0, x22 + adrp x1, init_pg_end + mov sp, x1 + mov x29, xzr + bl __pi_kaslr_early_init + and x24, x0, #SZ_2M - 1 // capture memstart offset seed + bic x0, x0, #SZ_2M - 1 + orr x23, x23, x0 // record kernel offset +#endif #endif + bl clear_page_tables + bl create_kernel_mapping adrp x1, init_pg_dir - bl __enable_mmu + load_ttbr1 x1, x1, x2 #ifdef CONFIG_RELOCATABLE -#ifdef CONFIG_RELR - mov x24, #0 // no RELR displacement yet -#endif bl __relocate_kernel -#ifdef CONFIG_RANDOMIZE_BASE - ldr x8, =__primary_switched - adrp x0, __PHYS_OFFSET - blr x8 - - /* - * If we return here, we have a KASLR displacement in x23 which we need - * to take into account by discarding the current kernel mapping and - * creating a new one. - */ - pre_disable_mmu_workaround - msr sctlr_el1, x20 // disable the MMU - isb - bl __create_page_tables // recreate kernel mapping - - tlbi vmalle1 // Remove any stale TLB entries - dsb nsh - - msr sctlr_el1, x19 // re-enable the MMU - isb - ic iallu // flush instructions fetched - dsb nsh // via old mapping - isb - - bl __relocate_kernel -#endif #endif ldr x8, =__primary_switched - adrp x0, __PHYS_OFFSET + adrp x0, KERNEL_START // __pa(KERNEL_START) br x8 -ENDPROC(__primary_switch) +SYM_FUNC_END(__primary_switch) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index 38bcd4d4e43b..0e1d9c3c6a93 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -16,26 +16,6 @@ #include <asm/virt.h> /* - * To prevent the possibility of old and new partial table walks being visible - * in the tlb, switch the ttbr to a zero page when we invalidate the old - * records. D4.7.1 'General TLB maintenance requirements' in ARM DDI 0487A.i - * Even switching to our copied tables will cause a changed output address at - * each stage of the walk. - */ -.macro break_before_make_ttbr_switch zero_page, page_table, tmp, tmp2 - phys_to_ttbr \tmp, \zero_page - msr ttbr1_el1, \tmp - isb - tlbi vmalle1 - dsb nsh - phys_to_ttbr \tmp, \page_table - offset_ttbr1 \tmp, \tmp2 - msr ttbr1_el1, \tmp - isb -.endm - - -/* * Resume from hibernate * * Loads temporary page tables then restores the memory image. @@ -45,7 +25,7 @@ * Because this code has to be copied to a 'safe' page, it can't call out to * other functions by PC-relative address. Also remember that it may be * mid-way through over-writing other functions. For this reason it contains - * code from flush_icache_range() and uses the copy_page() macro. + * code from caches_clean_inval_pou() and uses the copy_page() macro. * * This 'safe' page is mapped via ttbr0, and executed from there. This function * switches to a copy of the linear map in ttbr1, performs the restore, then @@ -65,7 +45,7 @@ * x5: physical address of a zero page that remains zero after resume */ .pushsection ".hibernate_exit.text", "ax" -ENTRY(swsusp_arch_suspend_exit) +SYM_CODE_START(swsusp_arch_suspend_exit) /* * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page @@ -87,11 +67,12 @@ ENTRY(swsusp_arch_suspend_exit) copy_page x0, x1, x2, x3, x4, x5, x6, x7, x8, x9 add x1, x10, #PAGE_SIZE - /* Clean the copied page to PoU - based on flush_icache_range() */ + /* Clean the copied page to PoU - based on caches_clean_inval_pou() */ raw_dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x10, x3 -2: dc cvau, x4 /* clean D line / unified line */ +2: /* clean D line / unified line */ +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE add x4, x4, x2 cmp x4, x1 b.lo 2b @@ -110,59 +91,5 @@ ENTRY(swsusp_arch_suspend_exit) cbz x24, 3f /* Do we need to re-initialise EL2? */ hvc #0 3: ret - - .ltorg -ENDPROC(swsusp_arch_suspend_exit) - -/* - * Restore the hyp stub. - * This must be done before the hibernate page is unmapped by _cpu_resume(), - * but happens before any of the hyp-stub's code is cleaned to PoC. - * - * x24: The physical address of __hyp_stub_vectors - */ -el1_sync: - msr vbar_el2, x24 - eret -ENDPROC(el1_sync) - -.macro invalid_vector label -\label: - b \label -ENDPROC(\label) -.endm - - invalid_vector el2_sync_invalid - invalid_vector el2_irq_invalid - invalid_vector el2_fiq_invalid - invalid_vector el2_error_invalid - invalid_vector el1_sync_invalid - invalid_vector el1_irq_invalid - invalid_vector el1_fiq_invalid - invalid_vector el1_error_invalid - -/* el2 vectors - switch el2 here while we restore the memory image. */ - .align 11 -ENTRY(hibernate_el2_vectors) - ventry el2_sync_invalid // Synchronous EL2t - ventry el2_irq_invalid // IRQ EL2t - ventry el2_fiq_invalid // FIQ EL2t - ventry el2_error_invalid // Error EL2t - - ventry el2_sync_invalid // Synchronous EL2h - ventry el2_irq_invalid // IRQ EL2h - ventry el2_fiq_invalid // FIQ EL2h - ventry el2_error_invalid // Error EL2h - - ventry el1_sync // Synchronous 64-bit EL1 - ventry el1_irq_invalid // IRQ 64-bit EL1 - ventry el1_fiq_invalid // FIQ 64-bit EL1 - ventry el1_error_invalid // Error 64-bit EL1 - - ventry el1_sync_invalid // Synchronous 32-bit EL1 - ventry el1_irq_invalid // IRQ 32-bit EL1 - ventry el1_fiq_invalid // FIQ 32-bit EL1 - ventry el1_error_invalid // Error 32-bit EL1 -END(hibernate_el2_vectors) - +SYM_CODE_END(swsusp_arch_suspend_exit) .popsection diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index a96b2921d22c..02870beb271e 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -7,21 +7,15 @@ * Ubuntu project, hibernation support for mach-dove * Copyright (C) 2010 Nokia Corporation (Hiroshi Doyu) * Copyright (C) 2010 Texas Instruments, Inc. (Teerth Reddy et al.) - * https://lkml.org/lkml/2010/6/18/4 - * https://lists.linux-foundation.org/pipermail/linux-pm/2010-June/027422.html - * https://patchwork.kernel.org/patch/96442/ - * * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> */ #define pr_fmt(x) "hibernate: " x #include <linux/cpu.h> #include <linux/kvm_host.h> -#include <linux/mm.h> #include <linux/pm.h> #include <linux/sched.h> #include <linux/suspend.h> #include <linux/utsname.h> -#include <linux/version.h> #include <asm/barrier.h> #include <asm/cacheflush.h> @@ -31,14 +25,13 @@ #include <asm/kexec.h> #include <asm/memory.h> #include <asm/mmu_context.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/pgtable-hwdef.h> +#include <asm/mte.h> #include <asm/sections.h> #include <asm/smp.h> #include <asm/smp_plat.h> #include <asm/suspend.h> #include <asm/sysreg.h> +#include <asm/trans_pgd.h> #include <asm/virt.h> /* @@ -52,10 +45,7 @@ extern int in_suspend; /* Do we need to reset el2? */ -#define el2_reset_needed() (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) - -/* temporary el2 vectors in the __hibernate_exit_text section. */ -extern char hibernate_el2_vectors[]; +#define el2_reset_needed() (is_hyp_nvhe()) /* hyp-stub vectors, used to restore el2 during resume from hibernate. */ extern char __hyp_stub_vectors[]; @@ -109,7 +99,6 @@ int pfn_is_nosave(unsigned long pfn) void notrace save_processor_state(void) { - WARN_ON(num_online_cpus() != 1); } void notrace restore_processor_state(void) @@ -166,14 +155,11 @@ int arch_hibernation_header_restore(void *addr) sleep_cpu = -EINVAL; return -EINVAL; } - if (!cpu_online(sleep_cpu)) { - pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n"); - ret = cpu_up(sleep_cpu); - if (ret) { - pr_err("Failed to bring hibernate-CPU up!\n"); - sleep_cpu = -EINVAL; - return ret; - } + + ret = bringup_hibernate_cpu(sleep_cpu); + if (ret) { + sleep_cpu = -EINVAL; + return ret; } resume_hdr = *hdr; @@ -182,9 +168,14 @@ int arch_hibernation_header_restore(void *addr) } EXPORT_SYMBOL(arch_hibernation_header_restore); +static void *hibernate_page_alloc(void *arg) +{ + return (void *)get_safe_page((__force gfp_t)(unsigned long)arg); +} + /* * Copies length bytes, starting at src_start into an new page, - * perform cache maintentance, then maps it at the specified address low + * perform cache maintenance, then maps it at the specified address low * address as executable. * * This is used by hibernate to copy the code it needs to execute when @@ -195,90 +186,143 @@ EXPORT_SYMBOL(arch_hibernation_header_restore); * page system. */ static int create_safe_exec_page(void *src_start, size_t length, - unsigned long dst_addr, - phys_addr_t *phys_dst_addr, - void *(*allocator)(gfp_t mask), - gfp_t mask) + phys_addr_t *phys_dst_addr) { - int rc = 0; - pgd_t *trans_pgd; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - unsigned long dst = (unsigned long)allocator(mask); - - if (!dst) { - rc = -ENOMEM; - goto out; - } + struct trans_pgd_info trans_info = { + .trans_alloc_page = hibernate_page_alloc, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, + }; + + void *page = (void *)get_safe_page(GFP_ATOMIC); + phys_addr_t trans_ttbr0; + unsigned long t0sz; + int rc; + + if (!page) + return -ENOMEM; + + memcpy(page, src_start, length); + caches_clean_inval_pou((unsigned long)page, (unsigned long)page + length); + rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page); + if (rc) + return rc; - memcpy((void *)dst, src_start, length); - __flush_icache_range(dst, dst + length); + cpu_install_ttbr0(trans_ttbr0, t0sz); + *phys_dst_addr = virt_to_phys(page); - trans_pgd = allocator(mask); - if (!trans_pgd) { - rc = -ENOMEM; - goto out; + return 0; +} + +#ifdef CONFIG_ARM64_MTE + +static DEFINE_XARRAY(mte_pages); + +static int save_tags(struct page *page, unsigned long pfn) +{ + void *tag_storage, *ret; + + tag_storage = mte_allocate_tag_storage(); + if (!tag_storage) + return -ENOMEM; + + mte_save_page_tags(page_address(page), tag_storage); + + ret = xa_store(&mte_pages, pfn, tag_storage, GFP_KERNEL); + if (WARN(xa_is_err(ret), "Failed to store MTE tags")) { + mte_free_tag_storage(tag_storage); + return xa_err(ret); + } else if (WARN(ret, "swsusp: %s: Duplicate entry", __func__)) { + mte_free_tag_storage(ret); } - pgdp = pgd_offset_raw(trans_pgd, dst_addr); - if (pgd_none(READ_ONCE(*pgdp))) { - pudp = allocator(mask); - if (!pudp) { - rc = -ENOMEM; - goto out; - } - pgd_populate(&init_mm, pgdp, pudp); + return 0; +} + +static void swsusp_mte_free_storage(void) +{ + XA_STATE(xa_state, &mte_pages, 0); + void *tags; + + xa_lock(&mte_pages); + xas_for_each(&xa_state, tags, ULONG_MAX) { + mte_free_tag_storage(tags); } + xa_unlock(&mte_pages); + + xa_destroy(&mte_pages); +} + +static int swsusp_mte_save_tags(void) +{ + struct zone *zone; + unsigned long pfn, max_zone_pfn; + int ret = 0; + int n = 0; + + if (!system_supports_mte()) + return 0; + + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { + struct page *page = pfn_to_online_page(pfn); - pudp = pud_offset(pgdp, dst_addr); - if (pud_none(READ_ONCE(*pudp))) { - pmdp = allocator(mask); - if (!pmdp) { - rc = -ENOMEM; - goto out; + if (!page) + continue; + + if (!page_mte_tagged(page)) + continue; + + ret = save_tags(page, pfn); + if (ret) { + swsusp_mte_free_storage(); + goto out; + } + + n++; } - pud_populate(&init_mm, pudp, pmdp); } + pr_info("Saved %d MTE pages\n", n); - pmdp = pmd_offset(pudp, dst_addr); - if (pmd_none(READ_ONCE(*pmdp))) { - ptep = allocator(mask); - if (!ptep) { - rc = -ENOMEM; - goto out; - } - pmd_populate_kernel(&init_mm, pmdp, ptep); +out: + return ret; +} + +static void swsusp_mte_restore_tags(void) +{ + XA_STATE(xa_state, &mte_pages, 0); + int n = 0; + void *tags; + + xa_lock(&mte_pages); + xas_for_each(&xa_state, tags, ULONG_MAX) { + unsigned long pfn = xa_state.xa_index; + struct page *page = pfn_to_online_page(pfn); + + mte_restore_page_tags(page_address(page), tags); + + mte_free_tag_storage(tags); + n++; } + xa_unlock(&mte_pages); - ptep = pte_offset_kernel(pmdp, dst_addr); - set_pte(ptep, pfn_pte(virt_to_pfn(dst), PAGE_KERNEL_EXEC)); + pr_info("Restored %d MTE pages\n", n); - /* - * Load our new page tables. A strict BBM approach requires that we - * ensure that TLBs are free of any entries that may overlap with the - * global mappings we are about to install. - * - * For a real hibernate/resume cycle TTBR0 currently points to a zero - * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI - * runtime services), while for a userspace-driven test_resume cycle it - * points to userspace page tables (and we must point it at a zero page - * ourselves). Elsewhere we only (un)install the idmap with preemption - * disabled, so T0SZ should be as required regardless. - */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - write_sysreg(phys_to_ttbr(virt_to_phys(pgdp)), ttbr0_el1); - isb(); + xa_destroy(&mte_pages); +} - *phys_dst_addr = virt_to_phys((void *)dst); +#else /* CONFIG_ARM64_MTE */ -out: - return rc; +static int swsusp_mte_save_tags(void) +{ + return 0; +} + +static void swsusp_mte_restore_tags(void) +{ } -#define dcache_clean_range(start, end) __flush_dcache_area(start, (end - start)) +#endif /* CONFIG_ARM64_MTE */ int swsusp_arch_suspend(void) { @@ -297,19 +341,30 @@ int swsusp_arch_suspend(void) /* make the crash dump kernel image visible/saveable */ crash_prepare_suspend(); + ret = swsusp_mte_save_tags(); + if (ret) + return ret; + sleep_cpu = smp_processor_id(); ret = swsusp_save(); } else { /* Clean kernel core startup/idle code to PoC*/ - dcache_clean_range(__mmuoff_data_start, __mmuoff_data_end); - dcache_clean_range(__idmap_text_start, __idmap_text_end); + dcache_clean_inval_poc((unsigned long)__mmuoff_data_start, + (unsigned long)__mmuoff_data_end); + dcache_clean_inval_poc((unsigned long)__idmap_text_start, + (unsigned long)__idmap_text_end); /* Clean kvm setup code to PoC? */ if (el2_reset_needed()) { - dcache_clean_range(__hyp_idmap_text_start, __hyp_idmap_text_end); - dcache_clean_range(__hyp_text_start, __hyp_text_end); + dcache_clean_inval_poc( + (unsigned long)__hyp_idmap_text_start, + (unsigned long)__hyp_idmap_text_end); + dcache_clean_inval_poc((unsigned long)__hyp_text_start, + (unsigned long)__hyp_text_end); } + swsusp_mte_restore_tags(); + /* make the crash dump kernel image protected again */ crash_post_resume(); @@ -327,11 +382,7 @@ int swsusp_arch_suspend(void) * mitigation off behind our back, let's set the state * to what we expect it to be. */ - switch (arm64_get_ssbd_state()) { - case ARM64_SSBD_FORCE_ENABLE: - case ARM64_SSBD_KERNEL: - arm64_set_ssbd_mitigation(true); - } + spectre_v4_enable_mitigation(NULL); } local_daif_restore(flags); @@ -339,143 +390,6 @@ int swsusp_arch_suspend(void) return ret; } -static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) -{ - pte_t pte = READ_ONCE(*src_ptep); - - if (pte_valid(pte)) { - /* - * Resume will overwrite areas that may be marked - * read only (code, rodata). Clear the RDONLY bit from - * the temporary mappings we use during restore. - */ - set_pte(dst_ptep, pte_mkwrite(pte)); - } else if (debug_pagealloc_enabled() && !pte_none(pte)) { - /* - * debug_pagealloc will removed the PTE_VALID bit if - * the page isn't in use by the resume kernel. It may have - * been in use by the original kernel, in which case we need - * to put it back in our copy to do the restore. - * - * Before marking this entry valid, check the pfn should - * be mapped. - */ - BUG_ON(!pfn_valid(pte_pfn(pte))); - - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte))); - } -} - -static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, - unsigned long end) -{ - pte_t *src_ptep; - pte_t *dst_ptep; - unsigned long addr = start; - - dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC); - if (!dst_ptep) - return -ENOMEM; - pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); - dst_ptep = pte_offset_kernel(dst_pmdp, start); - - src_ptep = pte_offset_kernel(src_pmdp, start); - do { - _copy_pte(dst_ptep, src_ptep, addr); - } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); - - return 0; -} - -static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, - unsigned long end) -{ - pmd_t *src_pmdp; - pmd_t *dst_pmdp; - unsigned long next; - unsigned long addr = start; - - if (pud_none(READ_ONCE(*dst_pudp))) { - dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pmdp) - return -ENOMEM; - pud_populate(&init_mm, dst_pudp, dst_pmdp); - } - dst_pmdp = pmd_offset(dst_pudp, start); - - src_pmdp = pmd_offset(src_pudp, start); - do { - pmd_t pmd = READ_ONCE(*src_pmdp); - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - continue; - if (pmd_table(pmd)) { - if (copy_pte(dst_pmdp, src_pmdp, addr, next)) - return -ENOMEM; - } else { - set_pmd(dst_pmdp, - __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); - } - } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); - - return 0; -} - -static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, - unsigned long end) -{ - pud_t *dst_pudp; - pud_t *src_pudp; - unsigned long next; - unsigned long addr = start; - - if (pgd_none(READ_ONCE(*dst_pgdp))) { - dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pudp) - return -ENOMEM; - pgd_populate(&init_mm, dst_pgdp, dst_pudp); - } - dst_pudp = pud_offset(dst_pgdp, start); - - src_pudp = pud_offset(src_pgdp, start); - do { - pud_t pud = READ_ONCE(*src_pudp); - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - continue; - if (pud_table(pud)) { - if (copy_pmd(dst_pudp, src_pudp, addr, next)) - return -ENOMEM; - } else { - set_pud(dst_pudp, - __pud(pud_val(pud) & ~PMD_SECT_RDONLY)); - } - } while (dst_pudp++, src_pudp++, addr = next, addr != end); - - return 0; -} - -static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, - unsigned long end) -{ - unsigned long next; - unsigned long addr = start; - pgd_t *src_pgdp = pgd_offset_k(start); - - dst_pgdp = pgd_offset_raw(dst_pgdp, start); - do { - next = pgd_addr_end(addr, end); - if (pgd_none(READ_ONCE(*src_pgdp))) - continue; - if (copy_pud(dst_pgdp, src_pgdp, addr, next)) - return -ENOMEM; - } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); - - return 0; -} - /* * Setup then Resume from the hibernate image using swsusp_arch_suspend_exit(). * @@ -484,85 +398,72 @@ static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, */ int swsusp_arch_resume(void) { - int rc = 0; + int rc; void *zero_page; size_t exit_size; pgd_t *tmp_pg_dir; - phys_addr_t phys_hibernate_exit; + phys_addr_t el2_vectors; void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, void *, phys_addr_t, phys_addr_t); + struct trans_pgd_info trans_info = { + .trans_alloc_page = hibernate_page_alloc, + .trans_alloc_arg = (void *)GFP_ATOMIC, + }; /* * Restoring the memory image will overwrite the ttbr1 page tables. * Create a second copy of just the linear map, and use this when * restoring. */ - tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!tmp_pg_dir) { - pr_err("Failed to allocate memory for temporary page tables.\n"); - rc = -ENOMEM; - goto out; - } - rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, PAGE_END); + rc = trans_pgd_create_copy(&trans_info, &tmp_pg_dir, PAGE_OFFSET, + PAGE_END); if (rc) - goto out; + return rc; /* - * We need a zero page that is zero before & after resume in order to + * We need a zero page that is zero before & after resume in order * to break before make on the ttbr1 page tables. */ zero_page = (void *)get_safe_page(GFP_ATOMIC); if (!zero_page) { pr_err("Failed to allocate zero page.\n"); - rc = -ENOMEM; - goto out; + return -ENOMEM; + } + + if (el2_reset_needed()) { + rc = trans_pgd_copy_el2_vectors(&trans_info, &el2_vectors); + if (rc) { + pr_err("Failed to setup el2 vectors\n"); + return rc; + } } - /* - * Locate the exit code in the bottom-but-one page, so that *NULL - * still has disastrous affects. - */ - hibernate_exit = (void *)PAGE_SIZE; exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start; /* * Copy swsusp_arch_suspend_exit() to a safe page. This will generate * a new set of ttbr0 page tables and load them. */ rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size, - (unsigned long)hibernate_exit, - &phys_hibernate_exit, - (void *)get_safe_page, GFP_ATOMIC); + (phys_addr_t *)&hibernate_exit); if (rc) { pr_err("Failed to create safe executable page for hibernate_exit code.\n"); - goto out; + return rc; } /* - * The hibernate exit text contains a set of el2 vectors, that will - * be executed at el2 with the mmu off in order to reload hyp-stub. - */ - __flush_dcache_area(hibernate_exit, exit_size); - - /* * KASLR will cause the el2 vectors to be in a different location in * the resumed kernel. Load hibernate's temporary copy into el2. * * We can skip this step if we booted at EL1, or are running with VHE. */ - if (el2_reset_needed()) { - phys_addr_t el2_vectors = phys_hibernate_exit; /* base */ - el2_vectors += hibernate_el2_vectors - - __hibernate_exit_text_start; /* offset */ - + if (el2_reset_needed()) __hyp_set_vectors(el2_vectors); - } hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1, resume_hdr.reenter_kernel, restore_pblist, resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page)); -out: - return rc; + return 0; } int hibernate_resume_nonboot_cpu_disable(void) diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 0b727edf4104..35225632d70a 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -257,7 +257,7 @@ static int hw_breakpoint_control(struct perf_event *bp, * level. */ enable_debug_monitors(dbg_el); - /* Fall through */ + fallthrough; case HW_BREAKPOINT_RESTORE: /* Setup the address register. */ write_wb_reg(val_reg, i, info->address); @@ -541,13 +541,13 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, if (hw->ctrl.len == ARM_BREAKPOINT_LEN_2) break; - /* Fallthrough */ + fallthrough; case 3: /* Allow single byte watchpoint. */ if (hw->ctrl.len == ARM_BREAKPOINT_LEN_1) break; - /* Fallthrough */ + fallthrough; default: return -EINVAL; } @@ -617,7 +617,7 @@ NOKPROBE_SYMBOL(toggle_bp_registers); /* * Debug exception handlers. */ -static int breakpoint_handler(unsigned long unused, unsigned int esr, +static int breakpoint_handler(unsigned long unused, unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step; @@ -654,7 +654,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr, perf_bp_event(bp, regs); /* Do we need to handle the stepping? */ - if (is_default_overflow_handler(bp)) + if (uses_default_overflow_handler(bp)) step = 1; unlock: rcu_read_unlock(); @@ -701,7 +701,7 @@ NOKPROBE_SYMBOL(breakpoint_handler); * addresses. There is no straight-forward way, short of disassembling the * offending instruction, to map that address back to the watchpoint. This * function computes the distance of the memory access from the watchpoint as a - * heuristic for the likelyhood that a given access triggered the watchpoint. + * heuristic for the likelihood that a given access triggered the watchpoint. * * See Section D2.10.5 "Determining the memory location that caused a Watchpoint * exception" of ARMv8 Architecture Reference Manual for details. @@ -730,7 +730,28 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, return 0; } -static int watchpoint_handler(unsigned long addr, unsigned int esr, +static int watchpoint_report(struct perf_event *wp, unsigned long addr, + struct pt_regs *regs) +{ + int step = uses_default_overflow_handler(wp); + struct arch_hw_breakpoint *info = counter_arch_bp(wp); + + info->trigger = addr; + + /* + * If we triggered a user watchpoint from a uaccess routine, then + * handle the stepping ourselves since userspace really can't help + * us with this. + */ + if (!user_mode(regs) && info->ctrl.privilege == AARCH64_BREAKPOINT_EL0) + step = 1; + else + perf_bp_event(wp, regs); + + return step; +} + +static int watchpoint_handler(unsigned long addr, unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step, access, closest_match = 0; @@ -739,7 +760,6 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; - struct arch_hw_breakpoint *info; struct arch_hw_breakpoint_ctrl ctrl; slots = this_cpu_ptr(wp_on_reg); @@ -777,25 +797,13 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, if (dist != 0) continue; - info = counter_arch_bp(wp); - info->trigger = addr; - perf_bp_event(wp, regs); - - /* Do we need to handle the stepping? */ - if (is_default_overflow_handler(wp)) - step = 1; + step = watchpoint_report(wp, addr, regs); } - if (min_dist > 0 && min_dist != -1) { - /* No exact match found. */ - wp = slots[closest_match]; - info = counter_arch_bp(wp); - info->trigger = addr; - perf_bp_event(wp, regs); - /* Do we need to handle the stepping? */ - if (is_default_overflow_handler(wp)) - step = 1; - } + /* No exact match found? */ + if (min_dist > 0 && min_dist != -1) + step = watchpoint_report(slots[closest_match], addr, regs); + rcu_read_unlock(); if (!step) @@ -965,14 +973,6 @@ static int hw_breakpoint_reset(unsigned int cpu) return 0; } -#ifdef CONFIG_CPU_PM -extern void cpu_suspend_set_dbg_restorer(int (*hw_bp_restore)(unsigned int)); -#else -static inline void cpu_suspend_set_dbg_restorer(int (*hw_bp_restore)(unsigned int)) -{ -} -#endif - /* * One-time initialisation. */ diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 73d46070b315..65f76064c86b 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -8,9 +8,9 @@ #include <linux/init.h> #include <linux/linkage.h> -#include <linux/irqchip/arm-gic-v3.h> #include <asm/assembler.h> +#include <asm/el2_setup.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/ptrace.h> @@ -21,18 +21,18 @@ .align 11 -ENTRY(__hyp_stub_vectors) +SYM_CODE_START(__hyp_stub_vectors) ventry el2_sync_invalid // Synchronous EL2t ventry el2_irq_invalid // IRQ EL2t ventry el2_fiq_invalid // FIQ EL2t ventry el2_error_invalid // Error EL2t - ventry el2_sync_invalid // Synchronous EL2h + ventry elx_sync // Synchronous EL2h ventry el2_irq_invalid // IRQ EL2h ventry el2_fiq_invalid // FIQ EL2h ventry el2_error_invalid // Error EL2h - ventry el1_sync // Synchronous 64-bit EL1 + ventry elx_sync // Synchronous 64-bit EL1 ventry el1_irq_invalid // IRQ 64-bit EL1 ventry el1_fiq_invalid // FIQ 64-bit EL1 ventry el1_error_invalid // Error 64-bit EL1 @@ -41,16 +41,19 @@ ENTRY(__hyp_stub_vectors) ventry el1_irq_invalid // IRQ 32-bit EL1 ventry el1_fiq_invalid // FIQ 32-bit EL1 ventry el1_error_invalid // Error 32-bit EL1 -ENDPROC(__hyp_stub_vectors) +SYM_CODE_END(__hyp_stub_vectors) .align 11 -el1_sync: +SYM_CODE_START_LOCAL(elx_sync) cmp x0, #HVC_SET_VECTORS - b.ne 2f + b.ne 1f msr vbar_el2, x1 b 9f +1: cmp x0, #HVC_FINALISE_EL2 + b.eq __finalise_el2 + 2: cmp x0, #HVC_SOFT_RESTART b.ne 3f mov x0, x2 @@ -63,17 +66,129 @@ el1_sync: beq 9f // Nothing to reset! /* Someone called kvm_call_hyp() against the hyp-stub... */ - ldr x0, =HVC_STUB_ERR + mov_q x0, HVC_STUB_ERR eret 9: mov x0, xzr eret -ENDPROC(el1_sync) +SYM_CODE_END(elx_sync) + +SYM_CODE_START_LOCAL(__finalise_el2) + finalise_el2_state + + // nVHE? No way! Give me the real thing! + // Sanity check: MMU *must* be off + mrs x1, sctlr_el2 + tbnz x1, #0, 1f + + // Needs to be VHE capable, obviously + check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 0f 1f x1 x2 + +0: // Check whether we only want the hypervisor to run VHE, not the kernel + adr_l x1, arm64_sw_feature_override + ldr x2, [x1, FTR_OVR_VAL_OFFSET] + ldr x1, [x1, FTR_OVR_MASK_OFFSET] + and x2, x2, x1 + ubfx x2, x2, #ARM64_SW_FEATURE_OVERRIDE_HVHE, #4 + cbz x2, 2f + +1: mov_q x0, HVC_STUB_ERR + eret +2: + // Engage the VHE magic! + mov_q x0, HCR_HOST_VHE_FLAGS + msr hcr_el2, x0 + isb + + // Use the EL1 allocated stack, per-cpu offset + mrs x0, sp_el1 + mov sp, x0 + mrs x0, tpidr_el1 + msr tpidr_el2, x0 + + // FP configuration, vectors + mrs_s x0, SYS_CPACR_EL12 + msr cpacr_el1, x0 + mrs_s x0, SYS_VBAR_EL12 + msr vbar_el1, x0 + + // Use EL2 translations for SPE & TRBE and disable access from EL1 + mrs x0, mdcr_el2 + bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) + bic x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) + msr mdcr_el2, x0 + + // Transfer the MM state from EL1 to EL2 + mrs_s x0, SYS_TCR_EL12 + msr tcr_el1, x0 + mrs_s x0, SYS_TTBR0_EL12 + msr ttbr0_el1, x0 + mrs_s x0, SYS_TTBR1_EL12 + msr ttbr1_el1, x0 + mrs_s x0, SYS_MAIR_EL12 + msr mair_el1, x0 + mrs x1, REG_ID_AA64MMFR3_EL1 + ubfx x1, x1, #ID_AA64MMFR3_EL1_TCRX_SHIFT, #4 + cbz x1, .Lskip_tcr2 + mrs x0, REG_TCR2_EL12 + msr REG_TCR2_EL1, x0 + + // Transfer permission indirection state + mrs x1, REG_ID_AA64MMFR3_EL1 + ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 + cbz x1, .Lskip_indirection + mrs x0, REG_PIRE0_EL12 + msr REG_PIRE0_EL1, x0 + mrs x0, REG_PIR_EL12 + msr REG_PIR_EL1, x0 + +.Lskip_indirection: +.Lskip_tcr2: + + isb + + // Hack the exception return to stay at EL2 + mrs x0, spsr_el1 + and x0, x0, #~PSR_MODE_MASK + mov x1, #PSR_MODE_EL2h + orr x0, x0, x1 + msr spsr_el1, x0 + + b enter_vhe +SYM_CODE_END(__finalise_el2) + + // At the point where we reach enter_vhe(), we run with + // the MMU off (which is enforced by __finalise_el2()). + // We thus need to be in the idmap, or everything will + // explode when enabling the MMU. + + .pushsection .idmap.text, "ax" + +SYM_CODE_START_LOCAL(enter_vhe) + // Invalidate TLBs before enabling the MMU + tlbi vmalle1 + dsb nsh + isb + + // Enable the EL2 S1 MMU, as set up from EL1 + mrs_s x0, SYS_SCTLR_EL12 + set_sctlr_el1 x0 + + // Disable the EL1 S1 MMU for a good measure + mov_q x0, INIT_SCTLR_EL1_MMU_OFF + msr_s SYS_SCTLR_EL12, x0 + + mov x0, xzr + + eret +SYM_CODE_END(enter_vhe) + + .popsection .macro invalid_vector label -\label: +SYM_CODE_START_LOCAL(\label) b \label -ENDPROC(\label) +SYM_CODE_END(\label) .endm invalid_vector el2_sync_invalid @@ -85,6 +200,8 @@ ENDPROC(\label) invalid_vector el1_fiq_invalid invalid_vector el1_error_invalid + .popsection + /* * __hyp_set_vectors: Call this after boot to set the initial hypervisor * vectors as part of hypervisor installation. On an SMP system, this should @@ -106,15 +223,36 @@ ENDPROC(\label) * initialisation entry point. */ -ENTRY(__hyp_set_vectors) +SYM_FUNC_START(__hyp_set_vectors) mov x1, x0 mov x0, #HVC_SET_VECTORS hvc #0 ret -ENDPROC(__hyp_set_vectors) +SYM_FUNC_END(__hyp_set_vectors) -ENTRY(__hyp_reset_vectors) +SYM_FUNC_START(__hyp_reset_vectors) mov x0, #HVC_RESET_VECTORS hvc #0 ret -ENDPROC(__hyp_reset_vectors) +SYM_FUNC_END(__hyp_reset_vectors) + +/* + * Entry point to finalise EL2 and switch to VHE if deemed capable + * + * w0: boot mode, as returned by init_kernel_el() + */ +SYM_FUNC_START(finalise_el2) + // Need to have booted at EL2 + cmp w0, #BOOT_CPU_MODE_EL2 + b.ne 1f + + // and still be at EL1 + mrs x0, CurrentEL + cmp x0, #CurrentEL_EL1 + b.ne 1f + + mov x0, #HVC_FINALISE_EL2 + hvc #0 +1: + ret +SYM_FUNC_END(finalise_el2) diff --git a/arch/arm64/kernel/idle.c b/arch/arm64/kernel/idle.c new file mode 100644 index 000000000000..05cfb347ec26 --- /dev/null +++ b/arch/arm64/kernel/idle.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Low-level idle sequences + */ + +#include <linux/cpu.h> +#include <linux/irqflags.h> + +#include <asm/barrier.h> +#include <asm/cpuidle.h> +#include <asm/cpufeature.h> +#include <asm/sysreg.h> + +/* + * cpu_do_idle() + * + * Idle the processor (wait for interrupt). + * + * If the CPU supports priority masking we must do additional work to + * ensure that interrupts are not masked at the PMR (because the core will + * not wake up if we block the wake up signal in the interrupt controller). + */ +void __cpuidle cpu_do_idle(void) +{ + struct arm_cpuidle_irq_context context; + + arm_cpuidle_save_irq_context(&context); + + dsb(sy); + wfi(); + + arm_cpuidle_restore_irq_context(&context); +} + +/* + * This is our default idle handler. + */ +void __cpuidle arch_cpu_idle(void) +{ + /* + * This should do all the clock switching and wait for interrupt + * tricks + */ + cpu_do_idle(); +} diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c new file mode 100644 index 000000000000..e30fd9e32ef3 --- /dev/null +++ b/arch/arm64/kernel/idreg-override.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Early cpufeature override framework + * + * Copyright (C) 2020 Google LLC + * Author: Marc Zyngier <maz@kernel.org> + */ + +#include <linux/ctype.h> +#include <linux/kernel.h> +#include <linux/libfdt.h> + +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/setup.h> + +#define FTR_DESC_NAME_LEN 20 +#define FTR_DESC_FIELD_LEN 10 +#define FTR_ALIAS_NAME_LEN 30 +#define FTR_ALIAS_OPTION_LEN 116 + +static u64 __boot_status __initdata; + +// temporary __prel64 related definitions +// to be removed when this code is moved under pi/ + +#define __prel64_initconst __initconst + +#define PREL64(type, name) union { type *name; } + +#define prel64_pointer(__d) (__d) + +typedef bool filter_t(u64 val); + +struct ftr_set_desc { + char name[FTR_DESC_NAME_LEN]; + PREL64(struct arm64_ftr_override, override); + struct { + char name[FTR_DESC_FIELD_LEN]; + u8 shift; + u8 width; + PREL64(filter_t, filter); + } fields[]; +}; + +#define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } + +static bool __init mmfr1_vh_filter(u64 val) +{ + /* + * If we ever reach this point while running VHE, we're + * guaranteed to be on one of these funky, VHE-stuck CPUs. If + * the user was trying to force nVHE on us, proceed with + * attitude adjustment. + */ + return !(__boot_status == (BOOT_CPU_FLAG_E2H | BOOT_CPU_MODE_EL2) && + val == 0); +} + +static const struct ftr_set_desc mmfr1 __prel64_initconst = { + .name = "id_aa64mmfr1", + .override = &id_aa64mmfr1_override, + .fields = { + FIELD("vh", ID_AA64MMFR1_EL1_VH_SHIFT, mmfr1_vh_filter), + {} + }, +}; + +static bool __init pfr0_sve_filter(u64 val) +{ + /* + * Disabling SVE also means disabling all the features that + * are associated with it. The easiest way to do it is just to + * override id_aa64zfr0_el1 to be 0. + */ + if (!val) { + id_aa64zfr0_override.val = 0; + id_aa64zfr0_override.mask = GENMASK(63, 0); + } + + return true; +} + +static const struct ftr_set_desc pfr0 __prel64_initconst = { + .name = "id_aa64pfr0", + .override = &id_aa64pfr0_override, + .fields = { + FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), + {} + }, +}; + +static bool __init pfr1_sme_filter(u64 val) +{ + /* + * Similarly to SVE, disabling SME also means disabling all + * the features that are associated with it. Just set + * id_aa64smfr0_el1 to 0 and don't look back. + */ + if (!val) { + id_aa64smfr0_override.val = 0; + id_aa64smfr0_override.mask = GENMASK(63, 0); + } + + return true; +} + +static const struct ftr_set_desc pfr1 __prel64_initconst = { + .name = "id_aa64pfr1", + .override = &id_aa64pfr1_override, + .fields = { + FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), + FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), + FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), + {} + }, +}; + +static const struct ftr_set_desc isar1 __prel64_initconst = { + .name = "id_aa64isar1", + .override = &id_aa64isar1_override, + .fields = { + FIELD("gpi", ID_AA64ISAR1_EL1_GPI_SHIFT, NULL), + FIELD("gpa", ID_AA64ISAR1_EL1_GPA_SHIFT, NULL), + FIELD("api", ID_AA64ISAR1_EL1_API_SHIFT, NULL), + FIELD("apa", ID_AA64ISAR1_EL1_APA_SHIFT, NULL), + {} + }, +}; + +static const struct ftr_set_desc isar2 __prel64_initconst = { + .name = "id_aa64isar2", + .override = &id_aa64isar2_override, + .fields = { + FIELD("gpa3", ID_AA64ISAR2_EL1_GPA3_SHIFT, NULL), + FIELD("apa3", ID_AA64ISAR2_EL1_APA3_SHIFT, NULL), + FIELD("mops", ID_AA64ISAR2_EL1_MOPS_SHIFT, NULL), + {} + }, +}; + +static const struct ftr_set_desc smfr0 __prel64_initconst = { + .name = "id_aa64smfr0", + .override = &id_aa64smfr0_override, + .fields = { + FIELD("smever", ID_AA64SMFR0_EL1_SMEver_SHIFT, NULL), + /* FA64 is a one bit field... :-/ */ + { "fa64", ID_AA64SMFR0_EL1_FA64_SHIFT, 1, }, + {} + }, +}; + +static bool __init hvhe_filter(u64 val) +{ + u64 mmfr1 = read_sysreg(id_aa64mmfr1_el1); + + return (val == 1 && + lower_32_bits(__boot_status) == BOOT_CPU_MODE_EL2 && + cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_EL1_VH_SHIFT)); +} + +static const struct ftr_set_desc sw_features __prel64_initconst = { + .name = "arm64_sw", + .override = &arm64_sw_feature_override, + .fields = { + FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), + FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), + {} + }, +}; + +static const +PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { + { &mmfr1 }, + { &pfr0 }, + { &pfr1 }, + { &isar1 }, + { &isar2 }, + { &smfr0 }, + { &sw_features }, +}; + +static const struct { + char alias[FTR_ALIAS_NAME_LEN]; + char feature[FTR_ALIAS_OPTION_LEN]; +} aliases[] __initconst = { + { "kvm_arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, + { "kvm_arm.mode=protected", "id_aa64mmfr1.vh=0" }, + { "arm64.nosve", "id_aa64pfr0.sve=0" }, + { "arm64.nosme", "id_aa64pfr1.sme=0" }, + { "arm64.nobti", "id_aa64pfr1.bt=0" }, + { "arm64.nopauth", + "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " + "id_aa64isar1.api=0 id_aa64isar1.apa=0 " + "id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0" }, + { "arm64.nomops", "id_aa64isar2.mops=0" }, + { "arm64.nomte", "id_aa64pfr1.mte=0" }, + { "nokaslr", "arm64_sw.nokaslr=1" }, +}; + +static int __init parse_hexdigit(const char *p, u64 *v) +{ + // skip "0x" if it comes next + if (p[0] == '0' && tolower(p[1]) == 'x') + p += 2; + + // check whether the RHS is a single hex digit + if (!isxdigit(p[0]) || (p[1] && !isspace(p[1]))) + return -EINVAL; + + *v = tolower(*p) - (isdigit(*p) ? '0' : 'a' - 10); + return 0; +} + +static int __init find_field(const char *cmdline, char *opt, int len, + const struct ftr_set_desc *reg, int f, u64 *v) +{ + int flen = strlen(reg->fields[f].name); + + // append '<fieldname>=' to obtain '<name>.<fieldname>=' + memcpy(opt + len, reg->fields[f].name, flen); + len += flen; + opt[len++] = '='; + + if (memcmp(cmdline, opt, len)) + return -1; + + return parse_hexdigit(cmdline + len, v); +} + +static void __init match_options(const char *cmdline) +{ + char opt[FTR_DESC_NAME_LEN + FTR_DESC_FIELD_LEN + 2]; + int i; + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + const struct ftr_set_desc *reg = prel64_pointer(regs[i].reg); + struct arm64_ftr_override *override; + int len = strlen(reg->name); + int f; + + override = prel64_pointer(reg->override); + + // set opt[] to '<name>.' + memcpy(opt, reg->name, len); + opt[len++] = '.'; + + for (f = 0; reg->fields[f].name[0] != '\0'; f++) { + u64 shift = reg->fields[f].shift; + u64 width = reg->fields[f].width ?: 4; + u64 mask = GENMASK_ULL(shift + width - 1, shift); + bool (*filter)(u64 val); + u64 v; + + if (find_field(cmdline, opt, len, reg, f, &v)) + continue; + + /* + * If an override gets filtered out, advertise + * it by setting the value to the all-ones while + * clearing the mask... Yes, this is fragile. + */ + filter = prel64_pointer(reg->fields[f].filter); + if (filter && !filter(v)) { + override->val |= mask; + override->mask &= ~mask; + continue; + } + + override->val &= ~mask; + override->val |= (v << shift) & mask; + override->mask |= mask; + + return; + } + } +} + +static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) +{ + do { + char buf[256]; + size_t len; + int i; + + cmdline = skip_spaces(cmdline); + + /* terminate on "--" appearing on the command line by itself */ + if (cmdline[0] == '-' && cmdline[1] == '-' && isspace(cmdline[2])) + return; + + for (len = 0; cmdline[len] && !isspace(cmdline[len]); len++) { + if (len >= sizeof(buf) - 1) + break; + if (cmdline[len] == '-') + buf[len] = '_'; + else + buf[len] = cmdline[len]; + } + if (!len) + return; + + buf[len] = 0; + + cmdline += len; + + match_options(buf); + + for (i = 0; parse_aliases && i < ARRAY_SIZE(aliases); i++) + if (!memcmp(buf, aliases[i].alias, len + 1)) + __parse_cmdline(aliases[i].feature, false); + } while (1); +} + +static __init const u8 *get_bootargs_cmdline(void) +{ + const u8 *prop; + void *fdt; + int node; + + fdt = get_early_fdt_ptr(); + if (!fdt) + return NULL; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return NULL; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + return NULL; + + return strlen(prop) ? prop : NULL; +} + +static __init void parse_cmdline(void) +{ + const u8 *prop = get_bootargs_cmdline(); + + if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) + __parse_cmdline(CONFIG_CMDLINE, true); + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) + __parse_cmdline(prop, true); +} + +/* Keep checkers quiet */ +void init_feature_override(u64 boot_status); + +asmlinkage void __init init_feature_override(u64 boot_status) +{ + struct arm64_ftr_override *override; + const struct ftr_set_desc *reg; + int i; + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + reg = prel64_pointer(regs[i].reg); + override = prel64_pointer(reg->override); + + override->val = 0; + override->mask = 0; + } + + __boot_status = boot_status; + + parse_cmdline(); + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + reg = prel64_pointer(regs[i].reg); + override = prel64_pointer(reg->override); + dcache_clean_inval_poc((unsigned long)override, + (unsigned long)(override + 1)); + } +} diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 25a2a9b479c2..5e4dc72ab1bd 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -10,9 +10,7 @@ #error This file should only be included in vmlinux.lds.S #endif -#ifdef CONFIG_EFI - -__efistub_stext_offset = stext - _text; +PROVIDE(__efistub_primary_entry = primary_entry); /* * The EFI stub has its own symbol namespace prefixed by __efistub_, to @@ -23,29 +21,97 @@ __efistub_stext_offset = stext - _text; * linked at. The routines below are all implemented in assembler in a * position independent manner */ -__efistub_memcmp = __pi_memcmp; -__efistub_memchr = __pi_memchr; -__efistub_memcpy = __pi_memcpy; -__efistub_memmove = __pi_memmove; -__efistub_memset = __pi_memset; -__efistub_strlen = __pi_strlen; -__efistub_strnlen = __pi_strnlen; -__efistub_strcmp = __pi_strcmp; -__efistub_strncmp = __pi_strncmp; -__efistub_strrchr = __pi_strrchr; -__efistub___flush_dcache_area = __pi___flush_dcache_area; +PROVIDE(__efistub_caches_clean_inval_pou = __pi_caches_clean_inval_pou); + +PROVIDE(__efistub__text = _text); +PROVIDE(__efistub__end = _end); +PROVIDE(__efistub___inittext_end = __inittext_end); +PROVIDE(__efistub__edata = _edata); +#if defined(CONFIG_EFI_EARLYCON) || defined(CONFIG_SYSFB) +PROVIDE(__efistub_screen_info = screen_info); +#endif +PROVIDE(__efistub__ctype = _ctype); + +PROVIDE(__pi___memcpy = __pi_memcpy); +PROVIDE(__pi___memmove = __pi_memmove); +PROVIDE(__pi___memset = __pi_memset); + +#ifdef CONFIG_KVM + +/* + * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_, to + * separate it from the kernel proper. The following symbols are legally + * accessed by it, therefore provide aliases to make them linkable. + * Do not include symbols which may not be safely accessed under hypervisor + * memory mappings. + */ + +/* Alternative callbacks for init-time patching of nVHE hyp code. */ +KVM_NVHE_ALIAS(kvm_patch_vector_branch); +KVM_NVHE_ALIAS(kvm_update_va_mask); +KVM_NVHE_ALIAS(kvm_get_kimage_voffset); +KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0); +KVM_NVHE_ALIAS(spectre_bhb_patch_loop_iter); +KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); +KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); +KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); +KVM_NVHE_ALIAS(alt_cb_patch_nops); + +/* Global kernel state accessed by nVHE hyp code. */ +KVM_NVHE_ALIAS(kvm_vgic_global_state); + +/* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */ +KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); + +/* Vectors installed by hyp-init on reset HVC. */ +KVM_NVHE_ALIAS(__hyp_stub_vectors); + +/* Static keys which are set if a vGIC trap should be handled in hyp. */ +KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); +KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); + +#ifdef CONFIG_ARM64_PSEUDO_NMI +/* Static key checked in GIC_PRIO_IRQOFF. */ +KVM_NVHE_ALIAS(gic_nonsecure_priorities); +#endif + +/* EL2 exception handling */ +KVM_NVHE_ALIAS(__start___kvm_ex_table); +KVM_NVHE_ALIAS(__stop___kvm_ex_table); + +/* PMU available static key */ +#ifdef CONFIG_HW_PERF_EVENTS +KVM_NVHE_ALIAS(kvm_arm_pmu_available); +#endif + +/* Position-independent library routines */ +KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); +KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page); +KVM_NVHE_ALIAS_HYP(memcpy, __pi_memcpy); +KVM_NVHE_ALIAS_HYP(memset, __pi_memset); #ifdef CONFIG_KASAN -__efistub___memcpy = __pi_memcpy; -__efistub___memmove = __pi_memmove; -__efistub___memset = __pi_memset; +KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy); +KVM_NVHE_ALIAS_HYP(__memset, __pi_memset); #endif -__efistub__text = _text; -__efistub__end = _end; -__efistub__edata = _edata; -__efistub_screen_info = screen_info; +/* Hyp memory sections */ +KVM_NVHE_ALIAS(__hyp_idmap_text_start); +KVM_NVHE_ALIAS(__hyp_idmap_text_end); +KVM_NVHE_ALIAS(__hyp_text_start); +KVM_NVHE_ALIAS(__hyp_text_end); +KVM_NVHE_ALIAS(__hyp_bss_start); +KVM_NVHE_ALIAS(__hyp_bss_end); +KVM_NVHE_ALIAS(__hyp_rodata_start); +KVM_NVHE_ALIAS(__hyp_rodata_end); + +/* pKVM static key */ +KVM_NVHE_ALIAS(kvm_protected_mode_initialized); + +#endif /* CONFIG_KVM */ +#ifdef CONFIG_EFI_ZBOOT +_kernel_codesize = ABSOLUTE(__inittext_end - _text); #endif #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index c7d38c660372..7bc3ba897901 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -62,7 +62,6 @@ */ #define HEAD_SYMBOLS \ DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \ - DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \ DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS); #endif /* __ARM64_KERNEL_IMAGE_H */ diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c deleted file mode 100644 index 4a9e773a177f..000000000000 --- a/arch/arm64/kernel/insn.c +++ /dev/null @@ -1,1690 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2013 Huawei Ltd. - * Author: Jiang Liu <liuj97@gmail.com> - * - * Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com> - */ -#include <linux/bitops.h> -#include <linux/bug.h> -#include <linux/compiler.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/spinlock.h> -#include <linux/stop_machine.h> -#include <linux/types.h> -#include <linux/uaccess.h> - -#include <asm/cacheflush.h> -#include <asm/debug-monitors.h> -#include <asm/fixmap.h> -#include <asm/insn.h> -#include <asm/kprobes.h> -#include <asm/sections.h> - -#define AARCH64_INSN_SF_BIT BIT(31) -#define AARCH64_INSN_N_BIT BIT(22) -#define AARCH64_INSN_LSL_12 BIT(22) - -static const int aarch64_insn_encoding_class[] = { - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_REG, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_FPSIMD, - AARCH64_INSN_CLS_DP_IMM, - AARCH64_INSN_CLS_DP_IMM, - AARCH64_INSN_CLS_BR_SYS, - AARCH64_INSN_CLS_BR_SYS, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_REG, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_FPSIMD, -}; - -enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn) -{ - return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; -} - -/* NOP is an alias of HINT */ -bool __kprobes aarch64_insn_is_nop(u32 insn) -{ - if (!aarch64_insn_is_hint(insn)) - return false; - - switch (insn & 0xFE0) { - case AARCH64_INSN_HINT_YIELD: - case AARCH64_INSN_HINT_WFE: - case AARCH64_INSN_HINT_WFI: - case AARCH64_INSN_HINT_SEV: - case AARCH64_INSN_HINT_SEVL: - return false; - default: - return true; - } -} - -bool aarch64_insn_is_branch_imm(u32 insn) -{ - return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) || - aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) || - aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)); -} - -static DEFINE_RAW_SPINLOCK(patch_lock); - -static bool is_exit_text(unsigned long addr) -{ - /* discarded with init text/data */ - return system_state < SYSTEM_RUNNING && - addr >= (unsigned long)__exittext_begin && - addr < (unsigned long)__exittext_end; -} - -static bool is_image_text(unsigned long addr) -{ - return core_kernel_text(addr) || is_exit_text(addr); -} - -static void __kprobes *patch_map(void *addr, int fixmap) -{ - unsigned long uintaddr = (uintptr_t) addr; - bool image = is_image_text(uintaddr); - struct page *page; - - if (image) - page = phys_to_page(__pa_symbol(addr)); - else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - page = vmalloc_to_page(addr); - else - return addr; - - BUG_ON(!page); - return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + - (uintaddr & ~PAGE_MASK)); -} - -static void __kprobes patch_unmap(int fixmap) -{ - clear_fixmap(fixmap); -} -/* - * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always - * little-endian. - */ -int __kprobes aarch64_insn_read(void *addr, u32 *insnp) -{ - int ret; - __le32 val; - - ret = probe_kernel_read(&val, addr, AARCH64_INSN_SIZE); - if (!ret) - *insnp = le32_to_cpu(val); - - return ret; -} - -static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) -{ - void *waddr = addr; - unsigned long flags = 0; - int ret; - - raw_spin_lock_irqsave(&patch_lock, flags); - waddr = patch_map(addr, FIX_TEXT_POKE0); - - ret = probe_kernel_write(waddr, &insn, AARCH64_INSN_SIZE); - - patch_unmap(FIX_TEXT_POKE0); - raw_spin_unlock_irqrestore(&patch_lock, flags); - - return ret; -} - -int __kprobes aarch64_insn_write(void *addr, u32 insn) -{ - return __aarch64_insn_write(addr, cpu_to_le32(insn)); -} - -bool __kprobes aarch64_insn_uses_literal(u32 insn) -{ - /* ldr/ldrsw (literal), prfm */ - - return aarch64_insn_is_ldr_lit(insn) || - aarch64_insn_is_ldrsw_lit(insn) || - aarch64_insn_is_adr_adrp(insn) || - aarch64_insn_is_prfm_lit(insn); -} - -bool __kprobes aarch64_insn_is_branch(u32 insn) -{ - /* b, bl, cb*, tb*, b.cond, br, blr */ - - return aarch64_insn_is_b(insn) || - aarch64_insn_is_bl(insn) || - aarch64_insn_is_cbz(insn) || - aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_tbz(insn) || - aarch64_insn_is_tbnz(insn) || - aarch64_insn_is_ret(insn) || - aarch64_insn_is_br(insn) || - aarch64_insn_is_blr(insn) || - aarch64_insn_is_bcond(insn); -} - -int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) -{ - u32 *tp = addr; - int ret; - - /* A64 instructions must be word aligned */ - if ((uintptr_t)tp & 0x3) - return -EINVAL; - - ret = aarch64_insn_write(tp, insn); - if (ret == 0) - __flush_icache_range((uintptr_t)tp, - (uintptr_t)tp + AARCH64_INSN_SIZE); - - return ret; -} - -struct aarch64_insn_patch { - void **text_addrs; - u32 *new_insns; - int insn_cnt; - atomic_t cpu_count; -}; - -static int __kprobes aarch64_insn_patch_text_cb(void *arg) -{ - int i, ret = 0; - struct aarch64_insn_patch *pp = arg; - - /* The first CPU becomes master */ - if (atomic_inc_return(&pp->cpu_count) == 1) { - for (i = 0; ret == 0 && i < pp->insn_cnt; i++) - ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], - pp->new_insns[i]); - /* Notify other processors with an additional increment. */ - atomic_inc(&pp->cpu_count); - } else { - while (atomic_read(&pp->cpu_count) <= num_online_cpus()) - cpu_relax(); - isb(); - } - - return ret; -} - -int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) -{ - struct aarch64_insn_patch patch = { - .text_addrs = addrs, - .new_insns = insns, - .insn_cnt = cnt, - .cpu_count = ATOMIC_INIT(0), - }; - - if (cnt <= 0) - return -EINVAL; - - return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, - cpu_online_mask); -} - -static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, - u32 *maskp, int *shiftp) -{ - u32 mask; - int shift; - - switch (type) { - case AARCH64_INSN_IMM_26: - mask = BIT(26) - 1; - shift = 0; - break; - case AARCH64_INSN_IMM_19: - mask = BIT(19) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_16: - mask = BIT(16) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_14: - mask = BIT(14) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_12: - mask = BIT(12) - 1; - shift = 10; - break; - case AARCH64_INSN_IMM_9: - mask = BIT(9) - 1; - shift = 12; - break; - case AARCH64_INSN_IMM_7: - mask = BIT(7) - 1; - shift = 15; - break; - case AARCH64_INSN_IMM_6: - case AARCH64_INSN_IMM_S: - mask = BIT(6) - 1; - shift = 10; - break; - case AARCH64_INSN_IMM_R: - mask = BIT(6) - 1; - shift = 16; - break; - case AARCH64_INSN_IMM_N: - mask = 1; - shift = 22; - break; - default: - return -EINVAL; - } - - *maskp = mask; - *shiftp = shift; - - return 0; -} - -#define ADR_IMM_HILOSPLIT 2 -#define ADR_IMM_SIZE SZ_2M -#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1) -#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1) -#define ADR_IMM_LOSHIFT 29 -#define ADR_IMM_HISHIFT 5 - -u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn) -{ - u32 immlo, immhi, mask; - int shift; - - switch (type) { - case AARCH64_INSN_IMM_ADR: - shift = 0; - immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK; - immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK; - insn = (immhi << ADR_IMM_HILOSPLIT) | immlo; - mask = ADR_IMM_SIZE - 1; - break; - default: - if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { - pr_err("aarch64_insn_decode_immediate: unknown immediate encoding %d\n", - type); - return 0; - } - } - - return (insn >> shift) & mask; -} - -u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, - u32 insn, u64 imm) -{ - u32 immlo, immhi, mask; - int shift; - - if (insn == AARCH64_BREAK_FAULT) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_IMM_ADR: - shift = 0; - immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT; - imm >>= ADR_IMM_HILOSPLIT; - immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT; - imm = immlo | immhi; - mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) | - (ADR_IMM_HIMASK << ADR_IMM_HISHIFT)); - break; - default: - if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { - pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n", - type); - return AARCH64_BREAK_FAULT; - } - } - - /* Update the immediate field. */ - insn &= ~(mask << shift); - insn |= (imm & mask) << shift; - - return insn; -} - -u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, - u32 insn) -{ - int shift; - - switch (type) { - case AARCH64_INSN_REGTYPE_RT: - case AARCH64_INSN_REGTYPE_RD: - shift = 0; - break; - case AARCH64_INSN_REGTYPE_RN: - shift = 5; - break; - case AARCH64_INSN_REGTYPE_RT2: - case AARCH64_INSN_REGTYPE_RA: - shift = 10; - break; - case AARCH64_INSN_REGTYPE_RM: - shift = 16; - break; - default: - pr_err("%s: unknown register type encoding %d\n", __func__, - type); - return 0; - } - - return (insn >> shift) & GENMASK(4, 0); -} - -static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, - u32 insn, - enum aarch64_insn_register reg) -{ - int shift; - - if (insn == AARCH64_BREAK_FAULT) - return AARCH64_BREAK_FAULT; - - if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) { - pr_err("%s: unknown register encoding %d\n", __func__, reg); - return AARCH64_BREAK_FAULT; - } - - switch (type) { - case AARCH64_INSN_REGTYPE_RT: - case AARCH64_INSN_REGTYPE_RD: - shift = 0; - break; - case AARCH64_INSN_REGTYPE_RN: - shift = 5; - break; - case AARCH64_INSN_REGTYPE_RT2: - case AARCH64_INSN_REGTYPE_RA: - shift = 10; - break; - case AARCH64_INSN_REGTYPE_RM: - case AARCH64_INSN_REGTYPE_RS: - shift = 16; - break; - default: - pr_err("%s: unknown register type encoding %d\n", __func__, - type); - return AARCH64_BREAK_FAULT; - } - - insn &= ~(GENMASK(4, 0) << shift); - insn |= reg << shift; - - return insn; -} - -static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type, - u32 insn) -{ - u32 size; - - switch (type) { - case AARCH64_INSN_SIZE_8: - size = 0; - break; - case AARCH64_INSN_SIZE_16: - size = 1; - break; - case AARCH64_INSN_SIZE_32: - size = 2; - break; - case AARCH64_INSN_SIZE_64: - size = 3; - break; - default: - pr_err("%s: unknown size encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn &= ~GENMASK(31, 30); - insn |= size << 30; - - return insn; -} - -static inline long branch_imm_common(unsigned long pc, unsigned long addr, - long range) -{ - long offset; - - if ((pc & 0x3) || (addr & 0x3)) { - pr_err("%s: A64 instructions must be word aligned\n", __func__); - return range; - } - - offset = ((long)addr - (long)pc); - - if (offset < -range || offset >= range) { - pr_err("%s: offset out of range\n", __func__); - return range; - } - - return offset; -} - -u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_branch_type type) -{ - u32 insn; - long offset; - - /* - * B/BL support [-128M, 128M) offset - * ARM64 virtual address arrangement guarantees all kernel and module - * texts are within +/-128M. - */ - offset = branch_imm_common(pc, addr, SZ_128M); - if (offset >= SZ_128M) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_BRANCH_LINK: - insn = aarch64_insn_get_bl_value(); - break; - case AARCH64_INSN_BRANCH_NOLINK: - insn = aarch64_insn_get_b_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, - offset >> 2); -} - -u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_register reg, - enum aarch64_insn_variant variant, - enum aarch64_insn_branch_type type) -{ - u32 insn; - long offset; - - offset = branch_imm_common(pc, addr, SZ_1M); - if (offset >= SZ_1M) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_BRANCH_COMP_ZERO: - insn = aarch64_insn_get_cbz_value(); - break; - case AARCH64_INSN_BRANCH_COMP_NONZERO: - insn = aarch64_insn_get_cbnz_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); -} - -u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_condition cond) -{ - u32 insn; - long offset; - - offset = branch_imm_common(pc, addr, SZ_1M); - - insn = aarch64_insn_get_bcond_value(); - - if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) { - pr_err("%s: unknown condition encoding %d\n", __func__, cond); - return AARCH64_BREAK_FAULT; - } - insn |= cond; - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); -} - -u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_op op) -{ - return aarch64_insn_get_hint_value() | op; -} - -u32 __kprobes aarch64_insn_gen_nop(void) -{ - return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); -} - -u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, - enum aarch64_insn_branch_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_BRANCH_NOLINK: - insn = aarch64_insn_get_br_value(); - break; - case AARCH64_INSN_BRANCH_LINK: - insn = aarch64_insn_get_blr_value(); - break; - case AARCH64_INSN_BRANCH_RETURN: - insn = aarch64_insn_get_ret_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg); -} - -u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, - enum aarch64_insn_register base, - enum aarch64_insn_register offset, - enum aarch64_insn_size_type size, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_REG_OFFSET: - insn = aarch64_insn_get_ldr_reg_value(); - break; - case AARCH64_INSN_LDST_STORE_REG_OFFSET: - insn = aarch64_insn_get_str_reg_value(); - break; - default: - pr_err("%s: unknown load/store encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_ldst_size(size, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, - offset); -} - -u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, - enum aarch64_insn_register reg2, - enum aarch64_insn_register base, - int offset, - enum aarch64_insn_variant variant, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - int shift; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX: - insn = aarch64_insn_get_ldp_pre_value(); - break; - case AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX: - insn = aarch64_insn_get_stp_pre_value(); - break; - case AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX: - insn = aarch64_insn_get_ldp_post_value(); - break; - case AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX: - insn = aarch64_insn_get_stp_post_value(); - break; - default: - pr_err("%s: unknown load/store encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if ((offset & 0x3) || (offset < -256) || (offset > 252)) { - pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n", - __func__, offset); - return AARCH64_BREAK_FAULT; - } - shift = 2; - break; - case AARCH64_INSN_VARIANT_64BIT: - if ((offset & 0x7) || (offset < -512) || (offset > 504)) { - pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n", - __func__, offset); - return AARCH64_BREAK_FAULT; - } - shift = 3; - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, - reg1); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, - reg2); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, insn, - offset >> shift); -} - -u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, - enum aarch64_insn_register base, - enum aarch64_insn_register state, - enum aarch64_insn_size_type size, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_EX: - insn = aarch64_insn_get_load_ex_value(); - break; - case AARCH64_INSN_LDST_STORE_EX: - insn = aarch64_insn_get_store_ex_value(); - break; - default: - pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_ldst_size(size, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, - reg); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, - AARCH64_INSN_REG_ZR); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, - state); -} - -u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result, - enum aarch64_insn_register address, - enum aarch64_insn_register value, - enum aarch64_insn_size_type size) -{ - u32 insn = aarch64_insn_get_ldadd_value(); - - switch (size) { - case AARCH64_INSN_SIZE_32: - case AARCH64_INSN_SIZE_64: - break; - default: - pr_err("%s: unimplemented size encoding %d\n", __func__, size); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_ldst_size(size, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, - result); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - address); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, - value); -} - -u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, - enum aarch64_insn_register value, - enum aarch64_insn_size_type size) -{ - /* - * STADD is simply encoded as an alias for LDADD with XZR as - * the destination register. - */ - return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address, - value, size); -} - -static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type, - enum aarch64_insn_prfm_target target, - enum aarch64_insn_prfm_policy policy, - u32 insn) -{ - u32 imm_type = 0, imm_target = 0, imm_policy = 0; - - switch (type) { - case AARCH64_INSN_PRFM_TYPE_PLD: - break; - case AARCH64_INSN_PRFM_TYPE_PLI: - imm_type = BIT(0); - break; - case AARCH64_INSN_PRFM_TYPE_PST: - imm_type = BIT(1); - break; - default: - pr_err("%s: unknown prfm type encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (target) { - case AARCH64_INSN_PRFM_TARGET_L1: - break; - case AARCH64_INSN_PRFM_TARGET_L2: - imm_target = BIT(0); - break; - case AARCH64_INSN_PRFM_TARGET_L3: - imm_target = BIT(1); - break; - default: - pr_err("%s: unknown prfm target encoding %d\n", __func__, target); - return AARCH64_BREAK_FAULT; - } - - switch (policy) { - case AARCH64_INSN_PRFM_POLICY_KEEP: - break; - case AARCH64_INSN_PRFM_POLICY_STRM: - imm_policy = BIT(0); - break; - default: - pr_err("%s: unknown prfm policy encoding %d\n", __func__, policy); - return AARCH64_BREAK_FAULT; - } - - /* In this case, imm5 is encoded into Rt field. */ - insn &= ~GENMASK(4, 0); - insn |= imm_policy | (imm_target << 1) | (imm_type << 3); - - return insn; -} - -u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, - enum aarch64_insn_prfm_type type, - enum aarch64_insn_prfm_target target, - enum aarch64_insn_prfm_policy policy) -{ - u32 insn = aarch64_insn_get_prfm_value(); - - insn = aarch64_insn_encode_ldst_size(AARCH64_INSN_SIZE_64, insn); - - insn = aarch64_insn_encode_prfm_imm(type, target, policy, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, 0); -} - -u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - int imm, enum aarch64_insn_variant variant, - enum aarch64_insn_adsb_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_ADSB_ADD: - insn = aarch64_insn_get_add_imm_value(); - break; - case AARCH64_INSN_ADSB_SUB: - insn = aarch64_insn_get_sub_imm_value(); - break; - case AARCH64_INSN_ADSB_ADD_SETFLAGS: - insn = aarch64_insn_get_adds_imm_value(); - break; - case AARCH64_INSN_ADSB_SUB_SETFLAGS: - insn = aarch64_insn_get_subs_imm_value(); - break; - default: - pr_err("%s: unknown add/sub encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - /* We can't encode more than a 24bit value (12bit + 12bit shift) */ - if (imm & ~(BIT(24) - 1)) - goto out; - - /* If we have something in the top 12 bits... */ - if (imm & ~(SZ_4K - 1)) { - /* ... and in the low 12 bits -> error */ - if (imm & (SZ_4K - 1)) - goto out; - - imm >>= 12; - insn |= AARCH64_INSN_LSL_12; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); - -out: - pr_err("%s: invalid immediate encoding %d\n", __func__, imm); - return AARCH64_BREAK_FAULT; -} - -u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - int immr, int imms, - enum aarch64_insn_variant variant, - enum aarch64_insn_bitfield_type type) -{ - u32 insn; - u32 mask; - - switch (type) { - case AARCH64_INSN_BITFIELD_MOVE: - insn = aarch64_insn_get_bfm_value(); - break; - case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED: - insn = aarch64_insn_get_ubfm_value(); - break; - case AARCH64_INSN_BITFIELD_MOVE_SIGNED: - insn = aarch64_insn_get_sbfm_value(); - break; - default: - pr_err("%s: unknown bitfield encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - mask = GENMASK(4, 0); - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT; - mask = GENMASK(5, 0); - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - if (immr & ~mask) { - pr_err("%s: invalid immr encoding %d\n", __func__, immr); - return AARCH64_BREAK_FAULT; - } - if (imms & ~mask) { - pr_err("%s: invalid imms encoding %d\n", __func__, imms); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); -} - -u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst, - int imm, int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_movewide_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_MOVEWIDE_ZERO: - insn = aarch64_insn_get_movz_value(); - break; - case AARCH64_INSN_MOVEWIDE_KEEP: - insn = aarch64_insn_get_movk_value(); - break; - case AARCH64_INSN_MOVEWIDE_INVERSE: - insn = aarch64_insn_get_movn_value(); - break; - default: - pr_err("%s: unknown movewide encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - if (imm & ~(SZ_64K - 1)) { - pr_err("%s: invalid immediate encoding %d\n", __func__, imm); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift != 0 && shift != 16) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift != 0 && shift != 16 && shift != 32 && shift != 48) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn |= (shift >> 4) << 21; - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); -} - -u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_adsb_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_ADSB_ADD: - insn = aarch64_insn_get_add_value(); - break; - case AARCH64_INSN_ADSB_SUB: - insn = aarch64_insn_get_sub_value(); - break; - case AARCH64_INSN_ADSB_ADD_SETFLAGS: - insn = aarch64_insn_get_adds_value(); - break; - case AARCH64_INSN_ADSB_SUB_SETFLAGS: - insn = aarch64_insn_get_subs_value(); - break; - default: - pr_err("%s: unknown add/sub encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift & ~(SZ_32 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift & ~(SZ_64 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); -} - -u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_variant variant, - enum aarch64_insn_data1_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA1_REVERSE_16: - insn = aarch64_insn_get_rev16_value(); - break; - case AARCH64_INSN_DATA1_REVERSE_32: - insn = aarch64_insn_get_rev32_value(); - break; - case AARCH64_INSN_DATA1_REVERSE_64: - if (variant != AARCH64_INSN_VARIANT_64BIT) { - pr_err("%s: invalid variant for reverse64 %d\n", - __func__, variant); - return AARCH64_BREAK_FAULT; - } - insn = aarch64_insn_get_rev64_value(); - break; - default: - pr_err("%s: unknown data1 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); -} - -u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - enum aarch64_insn_variant variant, - enum aarch64_insn_data2_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA2_UDIV: - insn = aarch64_insn_get_udiv_value(); - break; - case AARCH64_INSN_DATA2_SDIV: - insn = aarch64_insn_get_sdiv_value(); - break; - case AARCH64_INSN_DATA2_LSLV: - insn = aarch64_insn_get_lslv_value(); - break; - case AARCH64_INSN_DATA2_LSRV: - insn = aarch64_insn_get_lsrv_value(); - break; - case AARCH64_INSN_DATA2_ASRV: - insn = aarch64_insn_get_asrv_value(); - break; - case AARCH64_INSN_DATA2_RORV: - insn = aarch64_insn_get_rorv_value(); - break; - default: - pr_err("%s: unknown data2 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); -} - -u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg1, - enum aarch64_insn_register reg2, - enum aarch64_insn_variant variant, - enum aarch64_insn_data3_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA3_MADD: - insn = aarch64_insn_get_madd_value(); - break; - case AARCH64_INSN_DATA3_MSUB: - insn = aarch64_insn_get_msub_value(); - break; - default: - pr_err("%s: unknown data3 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - reg1); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, - reg2); -} - -u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_logic_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LOGIC_AND: - insn = aarch64_insn_get_and_value(); - break; - case AARCH64_INSN_LOGIC_BIC: - insn = aarch64_insn_get_bic_value(); - break; - case AARCH64_INSN_LOGIC_ORR: - insn = aarch64_insn_get_orr_value(); - break; - case AARCH64_INSN_LOGIC_ORN: - insn = aarch64_insn_get_orn_value(); - break; - case AARCH64_INSN_LOGIC_EOR: - insn = aarch64_insn_get_eor_value(); - break; - case AARCH64_INSN_LOGIC_EON: - insn = aarch64_insn_get_eon_value(); - break; - case AARCH64_INSN_LOGIC_AND_SETFLAGS: - insn = aarch64_insn_get_ands_value(); - break; - case AARCH64_INSN_LOGIC_BIC_SETFLAGS: - insn = aarch64_insn_get_bics_value(); - break; - default: - pr_err("%s: unknown logical encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift & ~(SZ_32 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift & ~(SZ_64 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); -} - -/* - * MOV (register) is architecturally an alias of ORR (shifted register) where - * MOV <*d>, <*m> is equivalent to ORR <*d>, <*ZR>, <*m> - */ -u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_variant variant) -{ - return aarch64_insn_gen_logical_shifted_reg(dst, AARCH64_INSN_REG_ZR, - src, 0, variant, - AARCH64_INSN_LOGIC_ORR); -} - -u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr, - enum aarch64_insn_register reg, - enum aarch64_insn_adr_type type) -{ - u32 insn; - s32 offset; - - switch (type) { - case AARCH64_INSN_ADR_TYPE_ADR: - insn = aarch64_insn_get_adr_value(); - offset = addr - pc; - break; - case AARCH64_INSN_ADR_TYPE_ADRP: - insn = aarch64_insn_get_adrp_value(); - offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12; - break; - default: - pr_err("%s: unknown adr encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - if (offset < -SZ_1M || offset >= SZ_1M) - return AARCH64_BREAK_FAULT; - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset); -} - -/* - * Decode the imm field of a branch, and return the byte offset as a - * signed value (so it can be used when computing a new branch - * target). - */ -s32 aarch64_get_branch_offset(u32 insn) -{ - s32 imm; - - if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn); - return (imm << 6) >> 4; - } - - if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn); - return (imm << 13) >> 11; - } - - if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn); - return (imm << 18) >> 16; - } - - /* Unhandled instruction */ - BUG(); -} - -/* - * Encode the displacement of a branch in the imm field and return the - * updated instruction. - */ -u32 aarch64_set_branch_offset(u32 insn, s32 offset) -{ - if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, - offset >> 2); - - if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); - - if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn, - offset >> 2); - - /* Unhandled instruction */ - BUG(); -} - -s32 aarch64_insn_adrp_get_offset(u32 insn) -{ - BUG_ON(!aarch64_insn_is_adrp(insn)); - return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12; -} - -u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset) -{ - BUG_ON(!aarch64_insn_is_adrp(insn)); - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, - offset >> 12); -} - -/* - * Extract the Op/CR data from a msr/mrs instruction. - */ -u32 aarch64_insn_extract_system_reg(u32 insn) -{ - return (insn & 0x1FFFE0) >> 5; -} - -bool aarch32_insn_is_wide(u32 insn) -{ - return insn >= 0xe800; -} - -/* - * Macros/defines for extracting register numbers from instruction. - */ -u32 aarch32_insn_extract_reg_num(u32 insn, int offset) -{ - return (insn & (0xf << offset)) >> offset; -} - -#define OPC2_MASK 0x7 -#define OPC2_OFFSET 5 -u32 aarch32_insn_mcr_extract_opc2(u32 insn) -{ - return (insn & (OPC2_MASK << OPC2_OFFSET)) >> OPC2_OFFSET; -} - -#define CRM_MASK 0xf -u32 aarch32_insn_mcr_extract_crm(u32 insn) -{ - return insn & CRM_MASK; -} - -static bool __kprobes __check_eq(unsigned long pstate) -{ - return (pstate & PSR_Z_BIT) != 0; -} - -static bool __kprobes __check_ne(unsigned long pstate) -{ - return (pstate & PSR_Z_BIT) == 0; -} - -static bool __kprobes __check_cs(unsigned long pstate) -{ - return (pstate & PSR_C_BIT) != 0; -} - -static bool __kprobes __check_cc(unsigned long pstate) -{ - return (pstate & PSR_C_BIT) == 0; -} - -static bool __kprobes __check_mi(unsigned long pstate) -{ - return (pstate & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_pl(unsigned long pstate) -{ - return (pstate & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_vs(unsigned long pstate) -{ - return (pstate & PSR_V_BIT) != 0; -} - -static bool __kprobes __check_vc(unsigned long pstate) -{ - return (pstate & PSR_V_BIT) == 0; -} - -static bool __kprobes __check_hi(unsigned long pstate) -{ - pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ - return (pstate & PSR_C_BIT) != 0; -} - -static bool __kprobes __check_ls(unsigned long pstate) -{ - pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ - return (pstate & PSR_C_BIT) == 0; -} - -static bool __kprobes __check_ge(unsigned long pstate) -{ - pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ - return (pstate & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_lt(unsigned long pstate) -{ - pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ - return (pstate & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_gt(unsigned long pstate) -{ - /*PSR_N_BIT ^= PSR_V_BIT */ - unsigned long temp = pstate ^ (pstate << 3); - - temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ - return (temp & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_le(unsigned long pstate) -{ - /*PSR_N_BIT ^= PSR_V_BIT */ - unsigned long temp = pstate ^ (pstate << 3); - - temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ - return (temp & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_al(unsigned long pstate) -{ - return true; -} - -/* - * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that - * it behaves identically to 0b1110 ("al"). - */ -pstate_check_t * const aarch32_opcode_cond_checks[16] = { - __check_eq, __check_ne, __check_cs, __check_cc, - __check_mi, __check_pl, __check_vs, __check_vc, - __check_hi, __check_ls, __check_ge, __check_lt, - __check_gt, __check_le, __check_al, __check_al -}; - -static bool range_of_ones(u64 val) -{ - /* Doesn't handle full ones or full zeroes */ - u64 sval = val >> __ffs64(val); - - /* One of Sean Eron Anderson's bithack tricks */ - return ((sval + 1) & (sval)) == 0; -} - -static u32 aarch64_encode_immediate(u64 imm, - enum aarch64_insn_variant variant, - u32 insn) -{ - unsigned int immr, imms, n, ones, ror, esz, tmp; - u64 mask = ~0UL; - - /* Can't encode full zeroes or full ones */ - if (!imm || !~imm) - return AARCH64_BREAK_FAULT; - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (upper_32_bits(imm)) - return AARCH64_BREAK_FAULT; - esz = 32; - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - esz = 64; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - /* - * Inverse of Replicate(). Try to spot a repeating pattern - * with a pow2 stride. - */ - for (tmp = esz / 2; tmp >= 2; tmp /= 2) { - u64 emask = BIT(tmp) - 1; - - if ((imm & emask) != ((imm >> tmp) & emask)) - break; - - esz = tmp; - mask = emask; - } - - /* N is only set if we're encoding a 64bit value */ - n = esz == 64; - - /* Trim imm to the element size */ - imm &= mask; - - /* That's how many ones we need to encode */ - ones = hweight64(imm); - - /* - * imms is set to (ones - 1), prefixed with a string of ones - * and a zero if they fit. Cap it to 6 bits. - */ - imms = ones - 1; - imms |= 0xf << ffs(esz); - imms &= BIT(6) - 1; - - /* Compute the rotation */ - if (range_of_ones(imm)) { - /* - * Pattern: 0..01..10..0 - * - * Compute how many rotate we need to align it right - */ - ror = __ffs64(imm); - } else { - /* - * Pattern: 0..01..10..01..1 - * - * Fill the unused top bits with ones, and check if - * the result is a valid immediate (all ones with a - * contiguous ranges of zeroes). - */ - imm |= ~mask; - if (!range_of_ones(~imm)) - return AARCH64_BREAK_FAULT; - - /* - * Compute the rotation to get a continuous set of - * ones, with the first bit set at position 0 - */ - ror = fls(~imm); - } - - /* - * immr is the number of bits we need to rotate back to the - * original set of ones. Note that this is relative to the - * element size... - */ - immr = (esz - ror) % esz; - - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n); - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); -} - -u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type, - enum aarch64_insn_variant variant, - enum aarch64_insn_register Rn, - enum aarch64_insn_register Rd, - u64 imm) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LOGIC_AND: - insn = aarch64_insn_get_and_imm_value(); - break; - case AARCH64_INSN_LOGIC_ORR: - insn = aarch64_insn_get_orr_imm_value(); - break; - case AARCH64_INSN_LOGIC_EOR: - insn = aarch64_insn_get_eor_imm_value(); - break; - case AARCH64_INSN_LOGIC_AND_SETFLAGS: - insn = aarch64_insn_get_ands_imm_value(); - break; - default: - pr_err("%s: unknown logical encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); - return aarch64_encode_immediate(imm, variant, insn); -} - -u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, - enum aarch64_insn_register Rm, - enum aarch64_insn_register Rn, - enum aarch64_insn_register Rd, - u8 lsb) -{ - u32 insn; - - insn = aarch64_insn_get_extr_value(); - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (lsb > 31) - return AARCH64_BREAK_FAULT; - break; - case AARCH64_INSN_VARIANT_64BIT: - if (lsb > 63) - return AARCH64_BREAK_FAULT; - insn |= AARCH64_INSN_SF_BIT; - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1); - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb); - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm); -} diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 04a327ccf84d..85087e2df564 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -10,40 +10,55 @@ * Copyright (C) 2012 ARM Ltd. */ -#include <linux/kernel_stat.h> -#include <linux/irq.h> -#include <linux/memory.h> -#include <linux/smp.h> +#include <linux/hardirq.h> #include <linux/init.h> +#include <linux/irq.h> #include <linux/irqchip.h> #include <linux/kprobes.h> +#include <linux/memory.h> +#include <linux/scs.h> #include <linux/seq_file.h> +#include <linux/smp.h> #include <linux/vmalloc.h> #include <asm/daifflags.h> +#include <asm/exception.h> +#include <asm/numa.h> +#include <asm/softirq_stack.h> +#include <asm/stacktrace.h> #include <asm/vmap_stack.h> -unsigned long irq_err_count; - /* Only access this in an NMI enter/exit */ DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts); DEFINE_PER_CPU(unsigned long *, irq_stack_ptr); -int arch_show_interrupts(struct seq_file *p, int prec) + +DECLARE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr); + +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr); +#endif + +static void init_irq_scs(void) { - show_ipi_list(p, prec); - seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count); - return 0; + int cpu; + + if (!scs_is_enabled()) + return; + + for_each_possible_cpu(cpu) + per_cpu(irq_shadow_call_stack_ptr, cpu) = + scs_alloc(early_cpu_to_node(cpu)); } #ifdef CONFIG_VMAP_STACK -static void init_irq_stacks(void) +static void __init init_irq_stacks(void) { int cpu; unsigned long *p; for_each_possible_cpu(cpu) { - p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu)); + p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, early_cpu_to_node(cpu)); per_cpu(irq_stack_ptr, cpu) = p; } } @@ -60,12 +75,56 @@ static void init_irq_stacks(void) } #endif +#ifndef CONFIG_PREEMPT_RT +static void ____do_softirq(struct pt_regs *regs) +{ + __do_softirq(); +} + +void do_softirq_own_stack(void) +{ + call_on_irq_stack(NULL, ____do_softirq); +} +#endif + +static void default_handle_irq(struct pt_regs *regs) +{ + panic("IRQ taken without a root IRQ handler\n"); +} + +static void default_handle_fiq(struct pt_regs *regs) +{ + panic("FIQ taken without a root FIQ handler\n"); +} + +void (*handle_arch_irq)(struct pt_regs *) __ro_after_init = default_handle_irq; +void (*handle_arch_fiq)(struct pt_regs *) __ro_after_init = default_handle_fiq; + +int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) +{ + if (handle_arch_irq != default_handle_irq) + return -EBUSY; + + handle_arch_irq = handle_irq; + pr_info("Root IRQ handler: %ps\n", handle_irq); + return 0; +} + +int __init set_handle_fiq(void (*handle_fiq)(struct pt_regs *)) +{ + if (handle_arch_fiq != default_handle_fiq) + return -EBUSY; + + handle_arch_fiq = handle_fiq; + pr_info("Root FIQ handler: %ps\n", handle_fiq); + return 0; +} + void __init init_IRQ(void) { init_irq_stacks(); + init_irq_scs(); irqchip_init(); - if (!handle_arch_irq) - panic("No interrupt controller found."); if (system_uses_irq_prio_masking()) { /* @@ -76,18 +135,3 @@ void __init init_IRQ(void) local_daif_restore(DAIF_PROCCTX_NOIRQ); } } - -/* - * Stubs to make nmi_enter/exit() code callable from ASM - */ -asmlinkage void notrace asm_nmi_enter(void) -{ - nmi_enter(); -} -NOKPROBE_SYMBOL(asm_nmi_enter); - -asmlinkage void notrace asm_nmi_exit(void) -{ - nmi_exit(); -} -NOKPROBE_SYMBOL(asm_nmi_exit); diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index 9a8a0ae1e75f..faf88ec9c48e 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/jump_label.h> #include <asm/insn.h> +#include <asm/patching.h> void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) @@ -25,14 +26,3 @@ void arch_jump_label_transform(struct jump_entry *entry, aarch64_insn_patch_text_nosync(addr, insn); } - -void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) -{ - /* - * We use the architected A64 NOP in arch_static_branch, so there's no - * need to patch an identical A64 NOP over the top of it here. The core - * will call arch_jump_label_transform from a module notifier if the - * NOP needs to be replaced by a branch. - */ -} diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 2a11a962e571..12c7f3c8ba76 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -4,205 +4,42 @@ */ #include <linux/cache.h> -#include <linux/crc32.h> #include <linux/init.h> -#include <linux/libfdt.h> -#include <linux/mm_types.h> -#include <linux/sched.h> -#include <linux/types.h> +#include <linux/printk.h> -#include <asm/cacheflush.h> -#include <asm/fixmap.h> -#include <asm/kernel-pgtable.h> +#include <asm/cpufeature.h> #include <asm/memory.h> -#include <asm/mmu.h> -#include <asm/pgtable.h> -#include <asm/sections.h> -enum kaslr_status { - KASLR_ENABLED, - KASLR_DISABLED_CMDLINE, - KASLR_DISABLED_NO_SEED, - KASLR_DISABLED_FDT_REMAP, -}; - -static enum kaslr_status __initdata kaslr_status; -u64 __ro_after_init module_alloc_base; u16 __initdata memstart_offset_seed; -static __init u64 get_kaslr_seed(void *fdt) -{ - int node, len; - fdt64_t *prop; - u64 ret; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - return 0; - - prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); - if (!prop || len != sizeof(u64)) - return 0; - - ret = fdt64_to_cpu(*prop); - *prop = 0; - return ret; -} +bool __ro_after_init __kaslr_is_enabled = false; -static __init const u8 *kaslr_get_cmdline(void *fdt) +void __init kaslr_init(void) { - static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE; - - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - int node; - const u8 *prop; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; - - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; - return prop; - } -out: - return default_cmdline; -} - -/* - * This routine will be executed with the kernel mapped at its default virtual - * address, and if it returns successfully, the kernel will be remapped, and - * start_kernel() will be executed from a randomized virtual offset. The - * relocation will result in all absolute references (e.g., static variables - * containing function pointers) to be reinitialized, and zero-initialized - * .bss variables will be reset to 0. - */ -u64 __init kaslr_early_init(u64 dt_phys) -{ - void *fdt; - u64 seed, offset, mask, module_range; - const u8 *cmdline, *str; - int size; - - /* - * Set a reasonable default for module_alloc_base in case - * we end up running with module randomization disabled. - */ - module_alloc_base = (u64)_etext - MODULES_VSIZE; - __flush_dcache_area(&module_alloc_base, sizeof(module_alloc_base)); - - /* - * Try to map the FDT early. If this fails, we simply bail, - * and proceed with KASLR disabled. We will make another - * attempt at mapping the FDT in setup_machine() - */ - early_fixmap_init(); - fdt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); - if (!fdt) { - kaslr_status = KASLR_DISABLED_FDT_REMAP; - return 0; - } - - /* - * Retrieve (and wipe) the seed from the FDT - */ - seed = get_kaslr_seed(fdt); - - /* - * Check if 'nokaslr' appears on the command line, and - * return 0 if that is the case. - */ - cmdline = kaslr_get_cmdline(fdt); - str = strstr(cmdline, "nokaslr"); - if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) { - kaslr_status = KASLR_DISABLED_CMDLINE; - return 0; - } - - if (!seed) { - kaslr_status = KASLR_DISABLED_NO_SEED; - return 0; + if (cpuid_feature_extract_unsigned_field(arm64_sw_feature_override.val & + arm64_sw_feature_override.mask, + ARM64_SW_FEATURE_OVERRIDE_NOKASLR)) { + pr_info("KASLR disabled on command line\n"); + return; } /* - * OK, so we are proceeding with KASLR enabled. Calculate a suitable - * kernel image offset from the seed. Let's place the kernel in the - * middle half of the VMALLOC area (VA_BITS_MIN - 2), and stay clear of - * the lower and upper quarters to avoid colliding with other - * allocations. - * Even if we could randomize at page granularity for 16k and 64k pages, - * let's always round to 2 MB so we don't interfere with the ability to - * map using contiguous PTEs + * The KASLR offset modulo MIN_KIMG_ALIGN is taken from the physical + * placement of the image rather than from the seed, so a displacement + * of less than MIN_KIMG_ALIGN means that no seed was provided. */ - mask = ((1UL << (VA_BITS_MIN - 2)) - 1) & ~(SZ_2M - 1); - offset = BIT(VA_BITS_MIN - 3) + (seed & mask); - - /* use the top 16 bits to randomize the linear region */ - memstart_offset_seed = seed >> 48; - - if (IS_ENABLED(CONFIG_KASAN)) - /* - * KASAN does not expect the module region to intersect the - * vmalloc region, since shadow memory is allocated for each - * module at load time, whereas the vmalloc region is shadowed - * by KASAN zero pages. So keep modules out of the vmalloc - * region if KASAN is enabled, and put the kernel well within - * 4 GB of the module region. - */ - return offset % SZ_2G; - - if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { - /* - * Randomize the module region over a 2 GB window covering the - * kernel. This reduces the risk of modules leaking information - * about the address of the kernel itself, but results in - * branches between modules and the core kernel that are - * resolved via PLTs. (Branches between modules will be - * resolved normally.) - */ - module_range = SZ_2G - (u64)(_end - _stext); - module_alloc_base = max((u64)_end + offset - SZ_2G, - (u64)MODULES_VADDR); - } else { - /* - * Randomize the module region by setting module_alloc_base to - * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE, - * _stext) . This guarantees that the resulting region still - * covers [_stext, _etext], and that all relative branches can - * be resolved without veneers. - */ - module_range = MODULES_VSIZE - (u64)(_etext - _stext); - module_alloc_base = (u64)_etext + offset - MODULES_VSIZE; + if (kaslr_offset() < MIN_KIMG_ALIGN) { + pr_warn("KASLR disabled due to lack of seed\n"); + return; } - /* use the lower 21 bits to randomize the base of the module region */ - module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; - module_alloc_base &= PAGE_MASK; - - __flush_dcache_area(&module_alloc_base, sizeof(module_alloc_base)); - __flush_dcache_area(&memstart_offset_seed, sizeof(memstart_offset_seed)); - - return offset; + pr_info("KASLR enabled\n"); + __kaslr_is_enabled = true; } -static int __init kaslr_init(void) +static int __init parse_nokaslr(char *unused) { - switch (kaslr_status) { - case KASLR_ENABLED: - pr_info("KASLR enabled\n"); - break; - case KASLR_DISABLED_CMDLINE: - pr_info("KASLR disabled on command line\n"); - break; - case KASLR_DISABLED_NO_SEED: - pr_warn("KASLR disabled due to lack of seed\n"); - break; - case KASLR_DISABLED_FDT_REMAP: - pr_warn("KASLR disabled due to FDT remapping failure\n"); - break; - } - + /* nokaslr param handling is done by early cpufeature code */ return 0; } -core_initcall(kaslr_init) +early_param("nokaslr", parse_nokaslr); diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c index 29a9428486a5..532d72ea42ee 100644 --- a/arch/arm64/kernel/kexec_image.c +++ b/arch/arm64/kernel/kexec_image.c @@ -14,7 +14,6 @@ #include <linux/kexec.h> #include <linux/pe.h> #include <linux/string.h> -#include <linux/verification.h> #include <asm/byteorder.h> #include <asm/cpufeature.h> #include <asm/image.h> @@ -43,17 +42,13 @@ static void *image_load(struct kimage *image, u64 flags, value; bool be_image, be_kernel; struct kexec_buf kbuf; - unsigned long text_offset; + unsigned long text_offset, kernel_segment_number; struct kexec_segment *kernel_segment; int ret; - /* We don't support crash kernels yet. */ - if (image->type == KEXEC_TYPE_CRASH) - return ERR_PTR(-EOPNOTSUPP); - /* * We require a kernel with an unambiguous Image header. Per - * Documentation/arm64/booting.rst, this is the case when image_size + * Documentation/arch/arm64/booting.rst, this is the case when image_size * is non-zero (practically speaking, since v3.17). */ h = (struct arm64_image_header *)kernel; @@ -92,39 +87,52 @@ static void *image_load(struct kimage *image, /* Adjust kernel segment with TEXT_OFFSET */ kbuf.memsz += text_offset; - ret = kexec_add_buffer(&kbuf); - if (ret) + kernel_segment_number = image->nr_segments; + + /* + * The location of the kernel segment may make it impossible to satisfy + * the other segment requirements, so we try repeatedly to find a + * location that will work. + */ + while ((ret = kexec_add_buffer(&kbuf)) == 0) { + /* Try to load additional data */ + kernel_segment = &image->segment[kernel_segment_number]; + ret = load_other_segments(image, kernel_segment->mem, + kernel_segment->memsz, initrd, + initrd_len, cmdline); + if (!ret) + break; + + /* + * We couldn't find space for the other segments; erase the + * kernel segment and try the next available hole. + */ + image->nr_segments -= 1; + kbuf.buf_min = kernel_segment->mem + kernel_segment->memsz; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + } + + if (ret) { + pr_err("Could not find any suitable kernel location!"); return ERR_PTR(ret); + } - kernel_segment = &image->segment[image->nr_segments - 1]; + kernel_segment = &image->segment[kernel_segment_number]; kernel_segment->mem += text_offset; kernel_segment->memsz -= text_offset; image->start = kernel_segment->mem; - pr_debug("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kernel_segment->mem, kbuf.bufsz, - kernel_segment->memsz); - - /* Load additional data */ - ret = load_other_segments(image, - kernel_segment->mem, kernel_segment->memsz, - initrd, initrd_len, cmdline); + kexec_dprintk("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_segment->mem, kbuf.bufsz, + kernel_segment->memsz); - return ERR_PTR(ret); + return NULL; } -#ifdef CONFIG_KEXEC_IMAGE_VERIFY_SIG -static int image_verify_sig(const char *kernel, unsigned long kernel_len) -{ - return verify_pefile_signature(kernel, kernel_len, NULL, - VERIFYING_KEXEC_PE_SIGNATURE); -} -#endif - const struct kexec_file_ops kexec_image_ops = { .probe = image_probe, .load = image_load, #ifdef CONFIG_KEXEC_IMAGE_VERIFY_SIG - .verify_sig = image_verify_sig, + .verify_sig = kexec_kernel_verify_pe_sig, #endif }; diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 43119922341f..4e1f983df3d1 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -17,6 +17,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> +#include <asm/patching.h> #include <asm/traps.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -223,6 +224,8 @@ int kgdb_arch_handle_exception(int exception_vector, int signo, */ if (!kernel_active_single_step()) kernel_enable_single_step(linux_regs); + else + kernel_rewind_single_step(linux_regs); err = 0; break; default: @@ -231,14 +234,14 @@ int kgdb_arch_handle_exception(int exception_vector, int signo, return err; } -static int kgdb_brk_fn(struct pt_regs *regs, unsigned int esr) +static int kgdb_brk_fn(struct pt_regs *regs, unsigned long esr) { kgdb_handle_exception(1, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } NOKPROBE_SYMBOL(kgdb_brk_fn) -static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr) +static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned long esr) { compiled_break = 1; kgdb_handle_exception(1, SIGTRAP, 0, regs); @@ -247,12 +250,12 @@ static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr) } NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); -static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) +static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned long esr) { if (!kgdb_single_step) return DBG_HOOK_ERROR; - kgdb_handle_exception(1, SIGTRAP, 0, regs); + kgdb_handle_exception(0, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } NOKPROBE_SYMBOL(kgdb_step_brk_fn); diff --git a/arch/arm64/kernel/kuser32.S b/arch/arm64/kernel/kuser32.S index 42bd8c0c60e0..af046ceac22d 100644 --- a/arch/arm64/kernel/kuser32.S +++ b/arch/arm64/kernel/kuser32.S @@ -10,11 +10,12 @@ * aarch32_setup_additional_pages() and are provided for compatibility * reasons with 32 bit (aarch32) applications that need them. * - * See Documentation/arm/kernel_user_helpers.rst for formal definitions. + * See Documentation/arch/arm/kernel_user_helpers.rst for formal definitions. */ #include <asm/unistd.h> + .section .rodata .align 5 .globl __kuser_helper_start __kuser_helper_start: diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 0df8493624e0..b38aae5b488d 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -11,6 +11,8 @@ #include <linux/kernel.h> #include <linux/kexec.h> #include <linux/page-flags.h> +#include <linux/reboot.h> +#include <linux/set_memory.h> #include <linux/smp.h> #include <asm/cacheflush.h> @@ -20,12 +22,8 @@ #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/page.h> - -#include "cpu-reset.h" - -/* Global variables for the arm64_relocate_new_kernel routine. */ -extern const unsigned char arm64_relocate_new_kernel[]; -extern const unsigned long arm64_relocate_new_kernel_size; +#include <asm/sections.h> +#include <asm/trans_pgd.h> /** * kexec_image_info - For debugging output. @@ -34,23 +32,12 @@ extern const unsigned long arm64_relocate_new_kernel_size; static void _kexec_image_info(const char *func, int line, const struct kimage *kimage) { - unsigned long i; - - pr_debug("%s:%d:\n", func, line); - pr_debug(" kexec kimage info:\n"); - pr_debug(" type: %d\n", kimage->type); - pr_debug(" start: %lx\n", kimage->start); - pr_debug(" head: %lx\n", kimage->head); - pr_debug(" nr_segments: %lu\n", kimage->nr_segments); - - for (i = 0; i < kimage->nr_segments; i++) { - pr_debug(" segment[%lu]: %016lx - %016lx, 0x%lx bytes, %lu pages\n", - i, - kimage->segment[i].mem, - kimage->segment[i].mem + kimage->segment[i].memsz, - kimage->segment[i].memsz, - kimage->segment[i].memsz / PAGE_SIZE); - } + kexec_dprintk("%s:%d:\n", func, line); + kexec_dprintk(" kexec kimage info:\n"); + kexec_dprintk(" type: %d\n", kimage->type); + kexec_dprintk(" head: %lx\n", kimage->head); + kexec_dprintk(" kern_reloc: %pa\n", &kimage->arch.kern_reloc); + kexec_dprintk(" el2_vectors: %pa\n", &kimage->arch.el2_vectors); } void machine_kexec_cleanup(struct kimage *kimage) @@ -67,8 +54,6 @@ void machine_kexec_cleanup(struct kimage *kimage) */ int machine_kexec_prepare(struct kimage *kimage) { - kexec_image_info(kimage); - if (kimage->type != KEXEC_TYPE_CRASH && cpus_are_stuck_in_kernel()) { pr_err("Can't kexec: CPUs are stuck in the kernel.\n"); return -EBUSY; @@ -78,43 +63,6 @@ int machine_kexec_prepare(struct kimage *kimage) } /** - * kexec_list_flush - Helper to flush the kimage list and source pages to PoC. - */ -static void kexec_list_flush(struct kimage *kimage) -{ - kimage_entry_t *entry; - - for (entry = &kimage->head; ; entry++) { - unsigned int flag; - void *addr; - - /* flush the list entries. */ - __flush_dcache_area(entry, sizeof(kimage_entry_t)); - - flag = *entry & IND_FLAGS; - if (flag == IND_DONE) - break; - - addr = phys_to_virt(*entry & PAGE_MASK); - - switch (flag) { - case IND_INDIRECTION: - /* Set entry point just before the new list page. */ - entry = (kimage_entry_t *)addr - 1; - break; - case IND_SOURCE: - /* flush the source pages. */ - __flush_dcache_area(addr, PAGE_SIZE); - break; - case IND_DESTINATION: - break; - default: - BUG(); - } - } -} - -/** * kexec_segment_flush - Helper to flush the kimage segments to PoC. */ static void kexec_segment_flush(const struct kimage *kimage) @@ -131,11 +79,84 @@ static void kexec_segment_flush(const struct kimage *kimage) kimage->segment[i].memsz, kimage->segment[i].memsz / PAGE_SIZE); - __flush_dcache_area(phys_to_virt(kimage->segment[i].mem), - kimage->segment[i].memsz); + dcache_clean_inval_poc( + (unsigned long)phys_to_virt(kimage->segment[i].mem), + (unsigned long)phys_to_virt(kimage->segment[i].mem) + + kimage->segment[i].memsz); } } +/* Allocates pages for kexec page table */ +static void *kexec_page_alloc(void *arg) +{ + struct kimage *kimage = arg; + struct page *page = kimage_alloc_control_pages(kimage, 0); + void *vaddr = NULL; + + if (!page) + return NULL; + + vaddr = page_address(page); + memset(vaddr, 0, PAGE_SIZE); + + return vaddr; +} + +int machine_kexec_post_load(struct kimage *kimage) +{ + int rc; + pgd_t *trans_pgd; + void *reloc_code = page_to_virt(kimage->control_code_page); + long reloc_size; + struct trans_pgd_info info = { + .trans_alloc_page = kexec_page_alloc, + .trans_alloc_arg = kimage, + }; + + /* If in place, relocation is not used, only flush next kernel */ + if (kimage->head & IND_DONE) { + kexec_segment_flush(kimage); + kexec_image_info(kimage); + return 0; + } + + kimage->arch.el2_vectors = 0; + if (is_hyp_nvhe()) { + rc = trans_pgd_copy_el2_vectors(&info, + &kimage->arch.el2_vectors); + if (rc) + return rc; + } + + /* Create a copy of the linear map */ + trans_pgd = kexec_page_alloc(kimage); + if (!trans_pgd) + return -ENOMEM; + rc = trans_pgd_create_copy(&info, &trans_pgd, PAGE_OFFSET, PAGE_END); + if (rc) + return rc; + kimage->arch.ttbr1 = __pa(trans_pgd); + kimage->arch.zero_page = __pa_symbol(empty_zero_page); + + reloc_size = __relocate_new_kernel_end - __relocate_new_kernel_start; + memcpy(reloc_code, __relocate_new_kernel_start, reloc_size); + kimage->arch.kern_reloc = __pa(reloc_code); + rc = trans_pgd_idmap_page(&info, &kimage->arch.ttbr0, + &kimage->arch.t0sz, reloc_code); + if (rc) + return rc; + kimage->arch.phys_offset = virt_to_phys(kimage) - (long)kimage; + + /* Flush the reloc_code in preparation for its execution. */ + dcache_clean_inval_poc((unsigned long)reloc_code, + (unsigned long)reloc_code + reloc_size); + icache_inval_pou((uintptr_t)reloc_code, + (uintptr_t)reloc_code + reloc_size); + kexec_image_info(kimage); + + return 0; +} + /** * machine_kexec - Do the kexec reboot. * @@ -143,8 +164,6 @@ static void kexec_segment_flush(const struct kimage *kimage) */ void machine_kexec(struct kimage *kimage) { - phys_addr_t reboot_code_buffer_phys; - void *reboot_code_buffer; bool in_kexec_crash = (kimage == kexec_crash_image); bool stuck_cpus = cpus_are_stuck_in_kernel(); @@ -155,71 +174,35 @@ void machine_kexec(struct kimage *kimage) WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()), "Some CPUs may be stale, kdump will be unreliable.\n"); - reboot_code_buffer_phys = page_to_phys(kimage->control_code_page); - reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys); - - kexec_image_info(kimage); - - pr_debug("%s:%d: control_code_page: %p\n", __func__, __LINE__, - kimage->control_code_page); - pr_debug("%s:%d: reboot_code_buffer_phys: %pa\n", __func__, __LINE__, - &reboot_code_buffer_phys); - pr_debug("%s:%d: reboot_code_buffer: %p\n", __func__, __LINE__, - reboot_code_buffer); - pr_debug("%s:%d: relocate_new_kernel: %p\n", __func__, __LINE__, - arm64_relocate_new_kernel); - pr_debug("%s:%d: relocate_new_kernel_size: 0x%lx(%lu) bytes\n", - __func__, __LINE__, arm64_relocate_new_kernel_size, - arm64_relocate_new_kernel_size); - - /* - * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use - * after the kernel is shut down. - */ - memcpy(reboot_code_buffer, arm64_relocate_new_kernel, - arm64_relocate_new_kernel_size); - - /* Flush the reboot_code_buffer in preparation for its execution. */ - __flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size); - - /* - * Although we've killed off the secondary CPUs, we don't update - * the online mask if we're handling a crash kernel and consequently - * need to avoid flush_icache_range(), which will attempt to IPI - * the offline CPUs. Therefore, we must use the __* variant here. - */ - __flush_icache_range((uintptr_t)reboot_code_buffer, - arm64_relocate_new_kernel_size); - - /* Flush the kimage list and its buffers. */ - kexec_list_flush(kimage); - - /* Flush the new image if already in place. */ - if ((kimage != kexec_crash_image) && (kimage->head & IND_DONE)) - kexec_segment_flush(kimage); - pr_info("Bye!\n"); local_daif_mask(); /* - * cpu_soft_restart will shutdown the MMU, disable data caches, then - * transfer control to the reboot_code_buffer which contains a copy of - * the arm64_relocate_new_kernel routine. arm64_relocate_new_kernel - * uses physical addressing to relocate the new image to its final - * position and transfers control to the image entry point when the - * relocation is complete. + * Both restart and kernel_reloc will shutdown the MMU, disable data + * caches. However, restart will start new kernel or purgatory directly, + * kernel_reloc contains the body of arm64_relocate_new_kernel * In kexec case, kimage->start points to purgatory assuming that * kernel entry and dtb address are embedded in purgatory by * userspace (kexec-tools). * In kexec_file case, the kernel starts directly without purgatory. */ - cpu_soft_restart(reboot_code_buffer_phys, kimage->head, kimage->start, -#ifdef CONFIG_KEXEC_FILE - kimage->arch.dtb_mem); -#else - 0); -#endif + if (kimage->head & IND_DONE) { + typeof(cpu_soft_restart) *restart; + + cpu_install_idmap(); + restart = (void *)__pa_symbol(cpu_soft_restart); + restart(is_hyp_nvhe(), kimage->start, kimage->arch.dtb_mem, + 0, 0); + } else { + void (*kernel_reloc)(struct kimage *kimage); + + if (is_hyp_nvhe()) + __hyp_set_vectors(kimage->arch.el2_vectors); + cpu_install_ttbr0(kimage->arch.ttbr0, kimage->arch.t0sz); + kernel_reloc = (void *)kimage->arch.kern_reloc; + kernel_reloc(kimage); + } BUG(); /* Should never get here. */ } @@ -272,28 +255,6 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Starting crashdump kernel...\n"); } -void arch_kexec_protect_crashkres(void) -{ - int i; - - kexec_segment_flush(kexec_crash_image); - - for (i = 0; i < kexec_crash_image->nr_segments; i++) - set_memory_valid( - __phys_to_virt(kexec_crash_image->segment[i].mem), - kexec_crash_image->segment[i].memsz >> PAGE_SHIFT, 0); -} - -void arch_kexec_unprotect_crashkres(void) -{ - int i; - - for (i = 0; i < kexec_crash_image->nr_segments; i++) - set_memory_valid( - __phys_to_virt(kexec_crash_image->segment[i].mem), - kexec_crash_image->segment[i].memsz >> PAGE_SHIFT, 1); -} - #ifdef CONFIG_HIBERNATION /* * To preserve the crash dump kernel image, the relevant memory segments @@ -335,8 +296,13 @@ bool crash_is_nosave(unsigned long pfn) /* in reserved memory? */ addr = __pfn_to_phys(pfn); - if ((addr < crashk_res.start) || (crashk_res.end < addr)) - return false; + if ((addr < crashk_res.start) || (crashk_res.end < addr)) { + if (!crashk_low_res.end) + return false; + + if ((addr < crashk_low_res.start) || (crashk_low_res.end < addr)) + return false; + } if (!kexec_crash_image) return true; diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 7b08bf9499b6..0e017358f4ba 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -15,20 +15,12 @@ #include <linux/kexec.h> #include <linux/libfdt.h> #include <linux/memblock.h> +#include <linux/of.h> #include <linux/of_fdt.h> -#include <linux/random.h> +#include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> #include <linux/vmalloc.h> -#include <asm/byteorder.h> - -/* relevant device tree properties */ -#define FDT_PROP_INITRD_START "linux,initrd-start" -#define FDT_PROP_INITRD_END "linux,initrd-end" -#define FDT_PROP_BOOTARGS "bootargs" -#define FDT_PROP_KASLR_SEED "kaslr-seed" -#define FDT_PROP_RNG_SEED "rng-seed" -#define RNG_SEED_SIZE 128 const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_image_ops, @@ -37,143 +29,63 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { int arch_kimage_file_post_load_cleanup(struct kimage *image) { - vfree(image->arch.dtb); + kvfree(image->arch.dtb); image->arch.dtb = NULL; + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + return kexec_image_post_load_cleanup_default(image); } -static int setup_dtb(struct kimage *image, - unsigned long initrd_load_addr, unsigned long initrd_len, - char *cmdline, void *dtb) +static int prepare_elf_headers(void **addr, unsigned long *sz) { - int off, ret; - - ret = fdt_path_offset(dtb, "/chosen"); - if (ret < 0) - goto out; - - off = ret; - - /* add bootargs */ - if (cmdline) { - ret = fdt_setprop_string(dtb, off, FDT_PROP_BOOTARGS, cmdline); - if (ret) - goto out; - } else { - ret = fdt_delprop(dtb, off, FDT_PROP_BOOTARGS); - if (ret && (ret != -FDT_ERR_NOTFOUND)) - goto out; - } - - /* add initrd-* */ - if (initrd_load_addr) { - ret = fdt_setprop_u64(dtb, off, FDT_PROP_INITRD_START, - initrd_load_addr); - if (ret) - goto out; - - ret = fdt_setprop_u64(dtb, off, FDT_PROP_INITRD_END, - initrd_load_addr + initrd_len); - if (ret) - goto out; - } else { - ret = fdt_delprop(dtb, off, FDT_PROP_INITRD_START); - if (ret && (ret != -FDT_ERR_NOTFOUND)) - goto out; - - ret = fdt_delprop(dtb, off, FDT_PROP_INITRD_END); - if (ret && (ret != -FDT_ERR_NOTFOUND)) - goto out; + struct crash_mem *cmem; + unsigned int nr_ranges; + int ret; + u64 i; + phys_addr_t start, end; + + nr_ranges = 2; /* for exclusion of crashkernel region */ + for_each_mem_range(i, &start, &end) + nr_ranges++; + + cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); + if (!cmem) + return -ENOMEM; + + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + for_each_mem_range(i, &start, &end) { + cmem->ranges[cmem->nr_ranges].start = start; + cmem->ranges[cmem->nr_ranges].end = end - 1; + cmem->nr_ranges++; } - /* add kaslr-seed */ - ret = fdt_delprop(dtb, off, FDT_PROP_KASLR_SEED); - if (ret == -FDT_ERR_NOTFOUND) - ret = 0; - else if (ret) + /* Exclude crashkernel region */ + ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) goto out; - if (rng_is_initialized()) { - u64 seed = get_random_u64(); - ret = fdt_setprop_u64(dtb, off, FDT_PROP_KASLR_SEED, seed); + if (crashk_low_res.end) { + ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); if (ret) goto out; - } else { - pr_notice("RNG is not initialised: omitting \"%s\" property\n", - FDT_PROP_KASLR_SEED); } - /* add rng-seed */ - if (rng_is_initialized()) { - u8 rng_seed[RNG_SEED_SIZE]; - get_random_bytes(rng_seed, RNG_SEED_SIZE); - ret = fdt_setprop(dtb, off, FDT_PROP_RNG_SEED, rng_seed, - RNG_SEED_SIZE); - if (ret) - goto out; - } else { - pr_notice("RNG is not initialised: omitting \"%s\" property\n", - FDT_PROP_RNG_SEED); - } + ret = crash_prepare_elf64_headers(cmem, true, addr, sz); out: - if (ret) - return (ret == -FDT_ERR_NOSPACE) ? -ENOMEM : -EINVAL; - - return 0; + kfree(cmem); + return ret; } /* - * More space needed so that we can add initrd, bootargs, kaslr-seed, and - * rng-seed. + * Tries to add the initrd and DTB to the image. If it is not possible to find + * valid locations, this function will undo changes to the image and return non + * zero. */ -#define DTB_EXTRA_SPACE 0x1000 - -static int create_dtb(struct kimage *image, - unsigned long initrd_load_addr, unsigned long initrd_len, - char *cmdline, void **dtb) -{ - void *buf; - size_t buf_size; - size_t cmdline_len; - int ret; - - cmdline_len = cmdline ? strlen(cmdline) : 0; - buf_size = fdt_totalsize(initial_boot_params) - + cmdline_len + DTB_EXTRA_SPACE; - - for (;;) { - buf = vmalloc(buf_size); - if (!buf) - return -ENOMEM; - - /* duplicate a device tree blob */ - ret = fdt_open_into(initial_boot_params, buf, buf_size); - if (ret) - return -EINVAL; - - ret = setup_dtb(image, initrd_load_addr, initrd_len, - cmdline, buf); - if (ret) { - vfree(buf); - if (ret == -ENOMEM) { - /* unlikely, but just in case */ - buf_size += DTB_EXTRA_SPACE; - continue; - } else { - return ret; - } - } - - /* trim it */ - fdt_pack(buf); - *dtb = buf; - - return 0; - } -} - int load_other_segments(struct kimage *image, unsigned long kernel_load_addr, unsigned long kernel_size, @@ -181,14 +93,44 @@ int load_other_segments(struct kimage *image, char *cmdline) { struct kexec_buf kbuf; - void *dtb = NULL; - unsigned long initrd_load_addr = 0, dtb_len; + void *headers, *dtb = NULL; + unsigned long headers_sz, initrd_load_addr = 0, dtb_len, + orig_segments = image->nr_segments; int ret = 0; kbuf.image = image; /* not allocate anything below the kernel */ kbuf.buf_min = kernel_load_addr + kernel_size; + /* load elf core header */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = prepare_elf_headers(&headers, &headers_sz); + if (ret) { + pr_err("Preparing elf core header failed\n"); + goto out_err; + } + + kbuf.buffer = headers; + kbuf.bufsz = headers_sz; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = headers_sz; + kbuf.buf_align = SZ_64K; /* largest supported page size */ + kbuf.buf_max = ULONG_MAX; + kbuf.top_down = true; + + ret = kexec_add_buffer(&kbuf); + if (ret) { + vfree(headers); + goto out_err; + } + image->elf_headers = headers; + image->elf_load_addr = kbuf.mem; + image->elf_headers_sz = headers_sz; + + kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); + } + /* load initrd */ if (initrd) { kbuf.buffer = initrd; @@ -206,17 +148,21 @@ int load_other_segments(struct kimage *image, goto out_err; initrd_load_addr = kbuf.mem; - pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - initrd_load_addr, initrd_len, initrd_len); + kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, kbuf.bufsz, kbuf.memsz); } /* load dtb */ - ret = create_dtb(image, initrd_load_addr, initrd_len, cmdline, &dtb); - if (ret) { + dtb = of_kexec_alloc_and_setup_fdt(image, initrd_load_addr, + initrd_len, cmdline, 0); + if (!dtb) { pr_err("Preparing for new dtb failed\n"); + ret = -EINVAL; goto out_err; } + /* trim it */ + fdt_pack(dtb); dtb_len = fdt_totalsize(dtb); kbuf.buffer = dtb; kbuf.bufsz = dtb_len; @@ -233,12 +179,13 @@ int load_other_segments(struct kimage *image, image->arch.dtb = dtb; image->arch.dtb_mem = kbuf.mem; - pr_debug("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kbuf.mem, dtb_len, dtb_len); + kexec_dprintk("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kbuf.mem, kbuf.bufsz, kbuf.memsz); return 0; out_err: - vfree(dtb); + image->nr_segments = orig_segments; + kvfree(dtb); return ret; } diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c index 65b08a74aec6..bde32979c06a 100644 --- a/arch/arm64/kernel/module-plts.c +++ b/arch/arm64/kernel/module-plts.c @@ -7,6 +7,7 @@ #include <linux/ftrace.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/moduleloader.h> #include <linux/sort.h> static struct plt_entry __get_adrp_add_pair(u64 dst, u64 pc, @@ -37,7 +38,8 @@ struct plt_entry get_plt_entry(u64 dst, void *pc) return plt; } -bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b) +static bool plt_entries_equal(const struct plt_entry *a, + const struct plt_entry *b) { u64 p, q; @@ -64,17 +66,12 @@ bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b) (q + aarch64_insn_adrp_get_offset(le32_to_cpu(b->adrp))); } -static bool in_init(const struct module *mod, void *loc) -{ - return (u64)loc - (u64)mod->init_layout.base < mod->init_layout.size; -} - u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs, void *loc, const Elf64_Rela *rela, Elf64_Sym *sym) { - struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core : - &mod->arch.init; + struct mod_plt_sec *pltsec = !within_module_init((unsigned long)loc, mod) ? + &mod->arch.core : &mod->arch.init; struct plt_entry *plt = (struct plt_entry *)sechdrs[pltsec->plt_shndx].sh_addr; int i = pltsec->plt_num_entries; int j = i - 1; @@ -104,8 +101,8 @@ u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs, u64 module_emit_veneer_for_adrp(struct module *mod, Elf64_Shdr *sechdrs, void *loc, u64 val) { - struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core : - &mod->arch.init; + struct mod_plt_sec *pltsec = !within_module_init((unsigned long)loc, mod) ? + &mod->arch.core : &mod->arch.init; struct plt_entry *plt = (struct plt_entry *)sechdrs[pltsec->plt_shndx].sh_addr; int i = pltsec->plt_num_entries++; u32 br; @@ -131,7 +128,7 @@ u64 module_emit_veneer_for_adrp(struct module *mod, Elf64_Shdr *sechdrs, } #endif -#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b)) +#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b)) static int cmp_rela(const void *a, const void *b) { @@ -170,9 +167,6 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, switch (ELF64_R_TYPE(rela[i].r_info)) { case R_AARCH64_JUMP26: case R_AARCH64_CALL26: - if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - break; - /* * We only have to consider branch targets that resolve * to symbols that are defined in a different section. @@ -206,8 +200,7 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, break; case R_AARCH64_ADR_PREL_PG_HI21_NC: case R_AARCH64_ADR_PREL_PG_HI21: - if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) || - !cpus_have_const_cap(ARM64_WORKAROUND_843419)) + if (!cpus_have_final_cap(ARM64_WORKAROUND_843419)) break; /* @@ -220,7 +213,7 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, * increasing the section's alignment so that the * resulting address of this instruction is guaranteed * to equal the offset in that particular bit (as well - * as all less signficant bits). This ensures that the + * as all less significant bits). This ensures that the * address modulo 4 KB != 0xfff8 or 0xfffc (which would * have all ones in bits [11:3]) */ @@ -242,17 +235,48 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, } } - if (IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) && - cpus_have_const_cap(ARM64_WORKAROUND_843419)) + if (cpus_have_final_cap(ARM64_WORKAROUND_843419)) { /* * Add some slack so we can skip PLT slots that may trigger * the erratum due to the placement of the ADRP instruction. */ ret += DIV_ROUND_UP(ret, (SZ_4K / sizeof(struct plt_entry))); + } return ret; } +static bool branch_rela_needs_plt(Elf64_Sym *syms, Elf64_Rela *rela, + Elf64_Word dstidx) +{ + + Elf64_Sym *s = syms + ELF64_R_SYM(rela->r_info); + + if (s->st_shndx == dstidx) + return false; + + return ELF64_R_TYPE(rela->r_info) == R_AARCH64_JUMP26 || + ELF64_R_TYPE(rela->r_info) == R_AARCH64_CALL26; +} + +/* Group branch PLT relas at the front end of the array. */ +static int partition_branch_plt_relas(Elf64_Sym *syms, Elf64_Rela *rela, + int numrels, Elf64_Word dstidx) +{ + int i = 0, j = numrels - 1; + + while (i < j) { + if (branch_rela_needs_plt(syms, &rela[i], dstidx)) + i++; + else if (branch_rela_needs_plt(syms, &rela[j], dstidx)) + swap(rela[i], rela[j]); + else + j--; + } + + return i; +} + int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { @@ -271,8 +295,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, mod->arch.core.plt_shndx = i; else if (!strcmp(secstrings + sechdrs[i].sh_name, ".init.plt")) mod->arch.init.plt_shndx = i; - else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE) && - !strcmp(secstrings + sechdrs[i].sh_name, + else if (!strcmp(secstrings + sechdrs[i].sh_name, ".text.ftrace_trampoline")) tramp = sechdrs + i; else if (sechdrs[i].sh_type == SHT_SYMTAB) @@ -290,7 +313,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, for (i = 0; i < ehdr->e_shnum; i++) { Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset; - int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); + int nents, numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info; if (sechdrs[i].sh_type != SHT_RELA) @@ -300,10 +323,16 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dstsec->sh_flags & SHF_EXECINSTR)) continue; - /* sort by type, symbol index and addend */ - sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL); + /* + * sort branch relocations requiring a PLT by type, symbol index + * and addend + */ + nents = partition_branch_plt_relas(syms, rels, numrels, + sechdrs[i].sh_info); + if (nents) + sort(rels, nents, sizeof(Elf64_Rela), cmp_rela, NULL); - if (!str_has_prefix(secstrings + dstsec->sh_name, ".init")) + if (!module_init_layout_section(secstrings + dstsec->sh_name)) core_plts += count_plts(syms, rels, numrels, sechdrs[i].sh_info, dstsec); else diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 1cd1a4d0ed30..dd851297596e 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -7,6 +7,8 @@ * Author: Will Deacon <will.deacon@arm.com> */ +#define pr_fmt(fmt) "Modules: " fmt + #include <linux/bitops.h> #include <linux/elf.h> #include <linux/ftrace.h> @@ -15,51 +17,137 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/moduleloader.h> +#include <linux/random.h> +#include <linux/scs.h> #include <linux/vmalloc.h> + #include <asm/alternative.h> #include <asm/insn.h> +#include <asm/scs.h> #include <asm/sections.h> -void *module_alloc(unsigned long size) +static u64 module_direct_base __ro_after_init = 0; +static u64 module_plt_base __ro_after_init = 0; + +/* + * Choose a random page-aligned base address for a window of 'size' bytes which + * entirely contains the interval [start, end - 1]. + */ +static u64 __init random_bounding_box(u64 size, u64 start, u64 end) { - u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; - gfp_t gfp_mask = GFP_KERNEL; - void *p; + u64 max_pgoff, pgoff; - /* Silence the initial allocation */ - if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) - gfp_mask |= __GFP_NOWARN; + if ((end - start) >= size) + return 0; - if (IS_ENABLED(CONFIG_KASAN)) - /* don't exceed the static module region - see below */ - module_alloc_end = MODULES_END; + max_pgoff = (size - (end - start)) / PAGE_SIZE; + pgoff = get_random_u32_inclusive(0, max_pgoff); - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_end, gfp_mask, PAGE_KERNEL, 0, - NUMA_NO_NODE, __builtin_return_address(0)); + return start - pgoff * PAGE_SIZE; +} - if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && - !IS_ENABLED(CONFIG_KASAN)) - /* - * KASAN can only deal with module allocations being served - * from the reserved module region, since the remainder of - * the vmalloc region is already backed by zero shadow pages, - * and punching holes into it is non-trivial. Since the module - * region is not randomized when KASAN is enabled, it is even - * less likely that the module region gets exhausted, so we - * can simply omit this fallback in that case. - */ - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_base + SZ_2G, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); +/* + * Modules may directly reference data and text anywhere within the kernel + * image and other modules. References using PREL32 relocations have a +/-2G + * range, and so we need to ensure that the entire kernel image and all modules + * fall within a 2G window such that these are always within range. + * + * Modules may directly branch to functions and code within the kernel text, + * and to functions and code within other modules. These branches will use + * CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure + * that the entire kernel text and all module text falls within a 128M window + * such that these are always within range. With PLTs, we can expand this to a + * 2G window. + * + * We chose the 128M region to surround the entire kernel image (rather than + * just the text) as using the same bounds for the 128M and 2G regions ensures + * by construction that we never select a 128M region that is not a subset of + * the 2G region. For very large and unusual kernel configurations this means + * we may fall back to PLTs where they could have been avoided, but this keeps + * the logic significantly simpler. + */ +static int __init module_init_limits(void) +{ + u64 kernel_end = (u64)_end; + u64 kernel_start = (u64)_text; + u64 kernel_size = kernel_end - kernel_start; + + /* + * The default modules region is placed immediately below the kernel + * image, and is large enough to use the full 2G relocation range. + */ + BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END); + BUILD_BUG_ON(MODULES_VSIZE < SZ_2G); + + if (!kaslr_enabled()) { + if (kernel_size < SZ_128M) + module_direct_base = kernel_end - SZ_128M; + if (kernel_size < SZ_2G) + module_plt_base = kernel_end - SZ_2G; + } else { + u64 min = kernel_start; + u64 max = kernel_end; + + if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { + pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n"); + } else { + module_direct_base = random_bounding_box(SZ_128M, min, max); + if (module_direct_base) { + min = module_direct_base; + max = module_direct_base + SZ_128M; + } + } + + module_plt_base = random_bounding_box(SZ_2G, min, max); + } - if (p && (kasan_module_alloc(p, size) < 0)) { + pr_info("%llu pages in range for non-PLT usage", + module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0); + pr_info("%llu pages in range for PLT usage", + module_plt_base ? (SZ_2G - kernel_size) / PAGE_SIZE : 0); + + return 0; +} +subsys_initcall(module_init_limits); + +void *module_alloc(unsigned long size) +{ + void *p = NULL; + + /* + * Where possible, prefer to allocate within direct branch range of the + * kernel such that no PLTs are necessary. + */ + if (module_direct_base) { + p = __vmalloc_node_range(size, MODULE_ALIGN, + module_direct_base, + module_direct_base + SZ_128M, + GFP_KERNEL | __GFP_NOWARN, + PAGE_KERNEL, 0, NUMA_NO_NODE, + __builtin_return_address(0)); + } + + if (!p && module_plt_base) { + p = __vmalloc_node_range(size, MODULE_ALIGN, + module_plt_base, + module_plt_base + SZ_2G, + GFP_KERNEL | __GFP_NOWARN, + PAGE_KERNEL, 0, NUMA_NO_NODE, + __builtin_return_address(0)); + } + + if (!p) { + pr_warn_ratelimited("%s: unable to allocate memory\n", + __func__); + } + + if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { vfree(p); return NULL; } - return p; + /* Memory is intended to be executable, reset the pointer tag. */ + return kasan_reset_tag(p); } enum aarch64_reloc_op { @@ -315,21 +403,21 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, /* MOVW instruction relocations. */ case R_AARCH64_MOVW_UABS_G0_NC: overflow_check = false; - /* Fall through */ + fallthrough; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; - /* Fall through */ + fallthrough; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; - /* Fall through */ + fallthrough; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, AARCH64_INSN_IMM_MOVKZ); @@ -397,7 +485,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, break; case R_AARCH64_ADR_PREL_PG_HI21_NC: overflow_check = false; - /* Fall through */ + fallthrough; case R_AARCH64_ADR_PREL_PG_HI21: ovf = reloc_insn_adrp(me, sechdrs, loc, val); if (ovf && ovf != -ERANGE) @@ -441,9 +529,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, AARCH64_INSN_IMM_26); - - if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && - ovf == -ERANGE) { + if (ovf == -ERANGE) { val = module_emit_plt_entry(me, sechdrs, loc, &rel[i], sym); if (!val) return -ENOEXEC; @@ -471,21 +557,6 @@ overflow: return -ENOEXEC; } -static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - const char *name) -{ - const Elf_Shdr *s, *se; - const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) { - if (strcmp(name, secstrs + s->sh_name) == 0) - return s; - } - - return NULL; -} - static inline void __init_plt(struct plt_entry *plt, unsigned long addr) { *plt = get_plt_entry(addr, plt); @@ -495,7 +566,7 @@ static int module_init_ftrace_plt(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { -#if defined(CONFIG_ARM64_MODULE_PLTS) && defined(CONFIG_DYNAMIC_FTRACE) +#if defined(CONFIG_DYNAMIC_FTRACE) const Elf_Shdr *s; struct plt_entry *plts; @@ -507,9 +578,6 @@ static int module_init_ftrace_plt(const Elf_Ehdr *hdr, __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR); - if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS)) - __init_plt(&plts[FTRACE_REGS_PLT_IDX], FTRACE_REGS_ADDR); - mod->arch.ftrace_trampolines = plts; #endif return 0; @@ -524,5 +592,11 @@ int module_finalize(const Elf_Ehdr *hdr, if (s) apply_alternatives_module((void *)s->sh_addr, s->sh_size); + if (scs_is_dynamic()) { + s = find_section(hdr, sechdrs, ".init.eh_frame"); + if (s) + scs_patch((void *)s->sh_addr, s->sh_size); + } + return module_init_ftrace_plt(hdr, sechdrs, me); } diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds deleted file mode 100644 index 22e36a21c113..000000000000 --- a/arch/arm64/kernel/module.lds +++ /dev/null @@ -1,5 +0,0 @@ -SECTIONS { - .plt (NOLOAD) : { BYTE(0) } - .init.plt (NOLOAD) : { BYTE(0) } - .text.ftrace_trampoline (NOLOAD) : { BYTE(0) } -} diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c new file mode 100644 index 000000000000..a41ef3213e1e --- /dev/null +++ b/arch/arm64/kernel/mte.c @@ -0,0 +1,606 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 ARM Ltd. + */ + +#include <linux/bitops.h> +#include <linux/cpu.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/prctl.h> +#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/string.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/thread_info.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/uio.h> + +#include <asm/barrier.h> +#include <asm/cpufeature.h> +#include <asm/mte.h> +#include <asm/ptrace.h> +#include <asm/sysreg.h> + +static DEFINE_PER_CPU_READ_MOSTLY(u64, mte_tcf_preferred); + +#ifdef CONFIG_KASAN_HW_TAGS +/* + * The asynchronous and asymmetric MTE modes have the same behavior for + * store operations. This flag is set when either of these modes is enabled. + */ +DEFINE_STATIC_KEY_FALSE(mte_async_or_asymm_mode); +EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); +#endif + +void mte_sync_tags(pte_t pte, unsigned int nr_pages) +{ + struct page *page = pte_page(pte); + unsigned int i; + + /* if PG_mte_tagged is set, tags have already been initialised */ + for (i = 0; i < nr_pages; i++, page++) { + if (try_page_mte_tagging(page)) { + mte_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } + } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); +} + +int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = page_address(page1); + addr2 = page_address(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + + if (!system_supports_mte() || ret) + return ret; + + /* + * If the page content is identical but at least one of the pages is + * tagged, return non-zero to avoid KSM merging. If only one of the + * pages is tagged, set_pte_at() may zero or change the tags of the + * other page via mte_sync_tags(). + */ + if (page_mte_tagged(page1) || page_mte_tagged(page2)) + return addr1 != addr2; + + return ret; +} + +static inline void __mte_enable_kernel(const char *mode, unsigned long tcf) +{ + /* Enable MTE Sync Mode for EL1. */ + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK, + SYS_FIELD_PREP(SCTLR_EL1, TCF, tcf)); + isb(); + + pr_info_once("MTE: enabled in %s mode at EL1\n", mode); +} + +#ifdef CONFIG_KASAN_HW_TAGS +void mte_enable_kernel_sync(void) +{ + /* + * Make sure we enter this function when no PE has set + * async mode previously. + */ + WARN_ONCE(system_uses_mte_async_or_asymm_mode(), + "MTE async mode enabled system wide!"); + + __mte_enable_kernel("synchronous", SCTLR_EL1_TCF_SYNC); +} + +void mte_enable_kernel_async(void) +{ + __mte_enable_kernel("asynchronous", SCTLR_EL1_TCF_ASYNC); + + /* + * MTE async mode is set system wide by the first PE that + * executes this function. + * + * Note: If in future KASAN acquires a runtime switching + * mode in between sync and async, this strategy needs + * to be reviewed. + */ + if (!system_uses_mte_async_or_asymm_mode()) + static_branch_enable(&mte_async_or_asymm_mode); +} + +void mte_enable_kernel_asymm(void) +{ + if (cpus_have_cap(ARM64_MTE_ASYMM)) { + __mte_enable_kernel("asymmetric", SCTLR_EL1_TCF_ASYMM); + + /* + * MTE asymm mode behaves as async mode for store + * operations. The mode is set system wide by the + * first PE that executes this function. + * + * Note: If in future KASAN acquires a runtime switching + * mode in between sync and async, this strategy needs + * to be reviewed. + */ + if (!system_uses_mte_async_or_asymm_mode()) + static_branch_enable(&mte_async_or_asymm_mode); + } else { + /* + * If the CPU does not support MTE asymmetric mode the + * kernel falls back on synchronous mode which is the + * default for kasan=on. + */ + mte_enable_kernel_sync(); + } +} +#endif + +#ifdef CONFIG_KASAN_HW_TAGS +void mte_check_tfsr_el1(void) +{ + u64 tfsr_el1 = read_sysreg_s(SYS_TFSR_EL1); + + if (unlikely(tfsr_el1 & SYS_TFSR_EL1_TF1)) { + /* + * Note: isb() is not required after this direct write + * because there is no indirect read subsequent to it + * (per ARM DDI 0487F.c table D13-1). + */ + write_sysreg_s(0, SYS_TFSR_EL1); + + kasan_report_async(); + } +} +#endif + +/* + * This is where we actually resolve the system and process MTE mode + * configuration into an actual value in SCTLR_EL1 that affects + * userspace. + */ +static void mte_update_sctlr_user(struct task_struct *task) +{ + /* + * This must be called with preemption disabled and can only be called + * on the current or next task since the CPU must match where the thread + * is going to run. The caller is responsible for calling + * update_sctlr_el1() later in the same preemption disabled block. + */ + unsigned long sctlr = task->thread.sctlr_user; + unsigned long mte_ctrl = task->thread.mte_ctrl; + unsigned long pref, resolved_mte_tcf; + + pref = __this_cpu_read(mte_tcf_preferred); + /* + * If there is no overlap between the system preferred and + * program requested values go with what was requested. + */ + resolved_mte_tcf = (mte_ctrl & pref) ? pref : mte_ctrl; + sctlr &= ~SCTLR_EL1_TCF0_MASK; + /* + * Pick an actual setting. The order in which we check for + * set bits and map into register values determines our + * default order. + */ + if (resolved_mte_tcf & MTE_CTRL_TCF_ASYMM) + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYMM); + else if (resolved_mte_tcf & MTE_CTRL_TCF_ASYNC) + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYNC); + else if (resolved_mte_tcf & MTE_CTRL_TCF_SYNC) + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, SYNC); + task->thread.sctlr_user = sctlr; +} + +static void mte_update_gcr_excl(struct task_struct *task) +{ + /* + * SYS_GCR_EL1 will be set to current->thread.mte_ctrl value by + * mte_set_user_gcr() in kernel_exit, but only if KASAN is enabled. + */ + if (kasan_hw_tags_enabled()) + return; + + write_sysreg_s( + ((task->thread.mte_ctrl >> MTE_CTRL_GCR_USER_EXCL_SHIFT) & + SYS_GCR_EL1_EXCL_MASK) | SYS_GCR_EL1_RRND, + SYS_GCR_EL1); +} + +#ifdef CONFIG_KASAN_HW_TAGS +/* Only called from assembly, silence sparse */ +void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst); + +void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); /* Branch -> NOP */ + + if (kasan_hw_tags_enabled()) + *updptr = cpu_to_le32(aarch64_insn_gen_nop()); +} +#endif + +void mte_thread_init_user(void) +{ + if (!system_supports_mte()) + return; + + /* clear any pending asynchronous tag fault */ + dsb(ish); + write_sysreg_s(0, SYS_TFSRE0_EL1); + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + /* disable tag checking and reset tag generation mask */ + set_mte_ctrl(current, 0); +} + +void mte_thread_switch(struct task_struct *next) +{ + if (!system_supports_mte()) + return; + + mte_update_sctlr_user(next); + mte_update_gcr_excl(next); + + /* TCO may not have been disabled on exception entry for the current task. */ + mte_disable_tco_entry(next); + + /* + * Check if an async tag exception occurred at EL1. + * + * Note: On the context switch path we rely on the dsb() present + * in __switch_to() to guarantee that the indirect writes to TFSR_EL1 + * are synchronized before this point. + */ + isb(); + mte_check_tfsr_el1(); +} + +void mte_cpu_setup(void) +{ + u64 rgsr; + + /* + * CnP must be enabled only after the MAIR_EL1 register has been set + * up. Inconsistent MAIR_EL1 between CPUs sharing the same TLB may + * lead to the wrong memory type being used for a brief window during + * CPU power-up. + * + * CnP is not a boot feature so MTE gets enabled before CnP, but let's + * make sure that is the case. + */ + BUG_ON(read_sysreg(ttbr0_el1) & TTBR_CNP_BIT); + BUG_ON(read_sysreg(ttbr1_el1) & TTBR_CNP_BIT); + + /* Normal Tagged memory type at the corresponding MAIR index */ + sysreg_clear_set(mair_el1, + MAIR_ATTRIDX(MAIR_ATTR_MASK, MT_NORMAL_TAGGED), + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_TAGGED, + MT_NORMAL_TAGGED)); + + write_sysreg_s(KERNEL_GCR_EL1, SYS_GCR_EL1); + + /* + * If GCR_EL1.RRND=1 is implemented the same way as RRND=0, then + * RGSR_EL1.SEED must be non-zero for IRG to produce + * pseudorandom numbers. As RGSR_EL1 is UNKNOWN out of reset, we + * must initialize it. + */ + rgsr = (read_sysreg(CNTVCT_EL0) & SYS_RGSR_EL1_SEED_MASK) << + SYS_RGSR_EL1_SEED_SHIFT; + if (rgsr == 0) + rgsr = 1 << SYS_RGSR_EL1_SEED_SHIFT; + write_sysreg_s(rgsr, SYS_RGSR_EL1); + + /* clear any pending tag check faults in TFSR*_EL1 */ + write_sysreg_s(0, SYS_TFSR_EL1); + write_sysreg_s(0, SYS_TFSRE0_EL1); + + local_flush_tlb_all(); +} + +void mte_suspend_enter(void) +{ + if (!system_supports_mte()) + return; + + /* + * The barriers are required to guarantee that the indirect writes + * to TFSR_EL1 are synchronized before we report the state. + */ + dsb(nsh); + isb(); + + /* Report SYS_TFSR_EL1 before suspend entry */ + mte_check_tfsr_el1(); +} + +void mte_suspend_exit(void) +{ + if (!system_supports_mte()) + return; + + mte_cpu_setup(); +} + +long set_mte_ctrl(struct task_struct *task, unsigned long arg) +{ + u64 mte_ctrl = (~((arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT) & + SYS_GCR_EL1_EXCL_MASK) << MTE_CTRL_GCR_USER_EXCL_SHIFT; + + if (!system_supports_mte()) + return 0; + + if (arg & PR_MTE_TCF_ASYNC) + mte_ctrl |= MTE_CTRL_TCF_ASYNC; + if (arg & PR_MTE_TCF_SYNC) + mte_ctrl |= MTE_CTRL_TCF_SYNC; + + /* + * If the system supports it and both sync and async modes are + * specified then implicitly enable asymmetric mode. + * Userspace could see a mix of both sync and async anyway due + * to differing or changing defaults on CPUs. + */ + if (cpus_have_cap(ARM64_MTE_ASYMM) && + (arg & PR_MTE_TCF_ASYNC) && + (arg & PR_MTE_TCF_SYNC)) + mte_ctrl |= MTE_CTRL_TCF_ASYMM; + + task->thread.mte_ctrl = mte_ctrl; + if (task == current) { + preempt_disable(); + mte_update_sctlr_user(task); + mte_update_gcr_excl(task); + update_sctlr_el1(task->thread.sctlr_user); + preempt_enable(); + } + + return 0; +} + +long get_mte_ctrl(struct task_struct *task) +{ + unsigned long ret; + u64 mte_ctrl = task->thread.mte_ctrl; + u64 incl = (~mte_ctrl >> MTE_CTRL_GCR_USER_EXCL_SHIFT) & + SYS_GCR_EL1_EXCL_MASK; + + if (!system_supports_mte()) + return 0; + + ret = incl << PR_MTE_TAG_SHIFT; + if (mte_ctrl & MTE_CTRL_TCF_ASYNC) + ret |= PR_MTE_TCF_ASYNC; + if (mte_ctrl & MTE_CTRL_TCF_SYNC) + ret |= PR_MTE_TCF_SYNC; + + return ret; +} + +/* + * Access MTE tags in another process' address space as given in mm. Update + * the number of tags copied. Return 0 if any tags copied, error otherwise. + * Inspired by __access_remote_vm(). + */ +static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, + struct iovec *kiov, unsigned int gup_flags) +{ + void __user *buf = kiov->iov_base; + size_t len = kiov->iov_len; + int err = 0; + int write = gup_flags & FOLL_WRITE; + + if (!access_ok(buf, len)) + return -EFAULT; + + if (mmap_read_lock_killable(mm)) + return -EIO; + + while (len) { + struct vm_area_struct *vma; + unsigned long tags, offset; + void *maddr; + struct page *page = get_user_page_vma_remote(mm, addr, + gup_flags, &vma); + + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } + + /* + * Only copy tags if the page has been mapped as PROT_MTE + * (PG_mte_tagged set). Otherwise the tags are not valid and + * not accessible to user. Moreover, an mprotect(PROT_MTE) + * would cause the existing tags to be cleared if the page + * was never mapped with PROT_MTE. + */ + if (!(vma->vm_flags & VM_MTE)) { + err = -EOPNOTSUPP; + put_page(page); + break; + } + WARN_ON_ONCE(!page_mte_tagged(page)); + + /* limit access to the end of the page */ + offset = offset_in_page(addr); + tags = min(len, (PAGE_SIZE - offset) / MTE_GRANULE_SIZE); + + maddr = page_address(page); + if (write) { + tags = mte_copy_tags_from_user(maddr + offset, buf, tags); + set_page_dirty_lock(page); + } else { + tags = mte_copy_tags_to_user(buf, maddr + offset, tags); + } + put_page(page); + + /* error accessing the tracer's buffer */ + if (!tags) + break; + + len -= tags; + buf += tags; + addr += tags * MTE_GRANULE_SIZE; + } + mmap_read_unlock(mm); + + /* return an error if no tags copied */ + kiov->iov_len = buf - kiov->iov_base; + if (!kiov->iov_len) { + /* check for error accessing the tracee's address space */ + if (err) + return -EIO; + else + return -EFAULT; + } + + return 0; +} + +/* + * Copy MTE tags in another process' address space at 'addr' to/from tracer's + * iovec buffer. Return 0 on success. Inspired by ptrace_access_vm(). + */ +static int access_remote_tags(struct task_struct *tsk, unsigned long addr, + struct iovec *kiov, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return -EPERM; + + if (!tsk->ptrace || (current != tsk->parent) || + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptracer_capable(tsk, mm->user_ns))) { + mmput(mm); + return -EPERM; + } + + ret = __access_remote_tags(mm, addr, kiov, gup_flags); + mmput(mm); + + return ret; +} + +int mte_ptrace_copy_tags(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret; + struct iovec kiov; + struct iovec __user *uiov = (void __user *)data; + unsigned int gup_flags = FOLL_FORCE; + + if (!system_supports_mte()) + return -EIO; + + if (get_user(kiov.iov_base, &uiov->iov_base) || + get_user(kiov.iov_len, &uiov->iov_len)) + return -EFAULT; + + if (request == PTRACE_POKEMTETAGS) + gup_flags |= FOLL_WRITE; + + /* align addr to the MTE tag granule */ + addr &= MTE_GRANULE_MASK; + + ret = access_remote_tags(child, addr, &kiov, gup_flags); + if (!ret) + ret = put_user(kiov.iov_len, &uiov->iov_len); + + return ret; +} + +static ssize_t mte_tcf_preferred_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + switch (per_cpu(mte_tcf_preferred, dev->id)) { + case MTE_CTRL_TCF_ASYNC: + return sysfs_emit(buf, "async\n"); + case MTE_CTRL_TCF_SYNC: + return sysfs_emit(buf, "sync\n"); + case MTE_CTRL_TCF_ASYMM: + return sysfs_emit(buf, "asymm\n"); + default: + return sysfs_emit(buf, "???\n"); + } +} + +static ssize_t mte_tcf_preferred_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u64 tcf; + + if (sysfs_streq(buf, "async")) + tcf = MTE_CTRL_TCF_ASYNC; + else if (sysfs_streq(buf, "sync")) + tcf = MTE_CTRL_TCF_SYNC; + else if (cpus_have_cap(ARM64_MTE_ASYMM) && sysfs_streq(buf, "asymm")) + tcf = MTE_CTRL_TCF_ASYMM; + else + return -EINVAL; + + device_lock(dev); + per_cpu(mte_tcf_preferred, dev->id) = tcf; + device_unlock(dev); + + return count; +} +static DEVICE_ATTR_RW(mte_tcf_preferred); + +static int register_mte_tcf_preferred_sysctl(void) +{ + unsigned int cpu; + + if (!system_supports_mte()) + return 0; + + for_each_possible_cpu(cpu) { + per_cpu(mte_tcf_preferred, cpu) = MTE_CTRL_TCF_ASYNC; + device_create_file(get_cpu_device(cpu), + &dev_attr_mte_tcf_preferred); + } + + return 0; +} +subsys_initcall(register_mte_tcf_preferred_sysctl); + +/* + * Return 0 on success, the number of bytes not probed otherwise. + */ +size_t mte_probe_user_range(const char __user *uaddr, size_t size) +{ + const char __user *end = uaddr + size; + int err = 0; + char val; + + __raw_get_user(val, uaddr, err); + if (err) + return size; + + uaddr = PTR_ALIGN(uaddr, MTE_GRANULE_SIZE); + while (uaddr < end) { + /* + * A read is sufficient for mte, the caller should have probed + * for the pte write permission if required. + */ + __raw_get_user(val, uaddr, err); + if (err) + return end - uaddr; + uaddr += MTE_GRANULE_SIZE; + } + (void)val; + + return 0; +} diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index 1ef702b0be2d..aa718d6a9274 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -18,6 +18,7 @@ #include <linux/reboot.h> #include <linux/slab.h> #include <linux/types.h> +#include <linux/static_call.h> #include <asm/paravirt.h> #include <asm/pvclock-abi.h> @@ -26,11 +27,15 @@ struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; -struct paravirt_patch_template pv_ops; -EXPORT_SYMBOL_GPL(pv_ops); +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); struct pv_time_stolen_time_region { - struct pvclock_vcpu_stolen_time *kaddr; + struct pvclock_vcpu_stolen_time __rcu *kaddr; }; static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region); @@ -45,36 +50,50 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); /* return stolen time in ns by asking the hypervisor */ -static u64 pv_steal_clock(int cpu) +static u64 para_steal_clock(int cpu) { + struct pvclock_vcpu_stolen_time *kaddr = NULL; struct pv_time_stolen_time_region *reg; + u64 ret = 0; reg = per_cpu_ptr(&stolen_time_region, cpu); - if (!reg->kaddr) { - pr_warn_once("stolen time enabled but not configured for cpu %d\n", - cpu); + + /* + * paravirt_steal_clock() may be called before the CPU + * online notification callback runs. Until the callback + * has run we just return zero. + */ + rcu_read_lock(); + kaddr = rcu_dereference(reg->kaddr); + if (!kaddr) { + rcu_read_unlock(); return 0; } - return le64_to_cpu(READ_ONCE(reg->kaddr->stolen_time)); + ret = le64_to_cpu(READ_ONCE(kaddr->stolen_time)); + rcu_read_unlock(); + return ret; } -static int stolen_time_dying_cpu(unsigned int cpu) +static int stolen_time_cpu_down_prepare(unsigned int cpu) { + struct pvclock_vcpu_stolen_time *kaddr = NULL; struct pv_time_stolen_time_region *reg; reg = this_cpu_ptr(&stolen_time_region); if (!reg->kaddr) return 0; - memunmap(reg->kaddr); - memset(reg, 0, sizeof(*reg)); + kaddr = rcu_replace_pointer(reg->kaddr, NULL, true); + synchronize_rcu(); + memunmap(kaddr); return 0; } -static int init_stolen_time_cpu(unsigned int cpu) +static int stolen_time_cpu_online(unsigned int cpu) { + struct pvclock_vcpu_stolen_time *kaddr = NULL; struct pv_time_stolen_time_region *reg; struct arm_smccc_res res; @@ -85,17 +104,19 @@ static int init_stolen_time_cpu(unsigned int cpu) if (res.a0 == SMCCC_RET_NOT_SUPPORTED) return -EINVAL; - reg->kaddr = memremap(res.a0, + kaddr = memremap(res.a0, sizeof(struct pvclock_vcpu_stolen_time), MEMREMAP_WB); + rcu_assign_pointer(reg->kaddr, kaddr); + if (!reg->kaddr) { pr_warn("Failed to map stolen time data structure\n"); return -ENOMEM; } - if (le32_to_cpu(reg->kaddr->revision) != 0 || - le32_to_cpu(reg->kaddr->attributes) != 0) { + if (le32_to_cpu(kaddr->revision) != 0 || + le32_to_cpu(kaddr->attributes) != 0) { pr_warn_once("Unexpected revision or attributes in stolen time data\n"); return -ENXIO; } @@ -103,26 +124,23 @@ static int init_stolen_time_cpu(unsigned int cpu) return 0; } -static int pv_time_init_stolen_time(void) +static int __init pv_time_init_stolen_time(void) { int ret; - ret = cpuhp_setup_state(CPUHP_AP_ARM_KVMPV_STARTING, - "hypervisor/arm/pvtime:starting", - init_stolen_time_cpu, stolen_time_dying_cpu); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "hypervisor/arm/pvtime:online", + stolen_time_cpu_online, + stolen_time_cpu_down_prepare); if (ret < 0) return ret; return 0; } -static bool has_pv_steal_clock(void) +static bool __init has_pv_steal_clock(void) { struct arm_smccc_res res; - /* To detect the presence of PV time support we require SMCCC 1.1+ */ - if (psci_ops.smccc_version < SMCCC_VERSION_1_1) - return false; - arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_HV_PV_TIME_FEATURES, &res); @@ -146,7 +164,7 @@ int __init pv_time_init(void) if (ret) return ret; - pv_ops.time.steal_clock = pv_steal_clock; + static_call_update(pv_steal_clock, para_steal_clock); static_key_slow_inc(¶virt_steal_enabled); if (steal_acc) diff --git a/arch/arm64/kernel/patch-scs.c b/arch/arm64/kernel/patch-scs.c new file mode 100644 index 000000000000..a1fe4b4ff591 --- /dev/null +++ b/arch/arm64/kernel/patch-scs.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 - Google LLC + * Author: Ard Biesheuvel <ardb@google.com> + */ + +#include <linux/bug.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/printk.h> +#include <linux/types.h> + +#include <asm/cacheflush.h> +#include <asm/scs.h> + +// +// This minimal DWARF CFI parser is partially based on the code in +// arch/arc/kernel/unwind.c, and on the document below: +// https://refspecs.linuxbase.org/LSB_4.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html +// + +#define DW_CFA_nop 0x00 +#define DW_CFA_set_loc 0x01 +#define DW_CFA_advance_loc1 0x02 +#define DW_CFA_advance_loc2 0x03 +#define DW_CFA_advance_loc4 0x04 +#define DW_CFA_offset_extended 0x05 +#define DW_CFA_restore_extended 0x06 +#define DW_CFA_undefined 0x07 +#define DW_CFA_same_value 0x08 +#define DW_CFA_register 0x09 +#define DW_CFA_remember_state 0x0a +#define DW_CFA_restore_state 0x0b +#define DW_CFA_def_cfa 0x0c +#define DW_CFA_def_cfa_register 0x0d +#define DW_CFA_def_cfa_offset 0x0e +#define DW_CFA_def_cfa_expression 0x0f +#define DW_CFA_expression 0x10 +#define DW_CFA_offset_extended_sf 0x11 +#define DW_CFA_def_cfa_sf 0x12 +#define DW_CFA_def_cfa_offset_sf 0x13 +#define DW_CFA_val_offset 0x14 +#define DW_CFA_val_offset_sf 0x15 +#define DW_CFA_val_expression 0x16 +#define DW_CFA_lo_user 0x1c +#define DW_CFA_negate_ra_state 0x2d +#define DW_CFA_GNU_args_size 0x2e +#define DW_CFA_GNU_negative_offset_extended 0x2f +#define DW_CFA_hi_user 0x3f + +extern const u8 __eh_frame_start[], __eh_frame_end[]; + +enum { + PACIASP = 0xd503233f, + AUTIASP = 0xd50323bf, + SCS_PUSH = 0xf800865e, + SCS_POP = 0xf85f8e5e, +}; + +static void __always_inline scs_patch_loc(u64 loc) +{ + u32 insn = le32_to_cpup((void *)loc); + + switch (insn) { + case PACIASP: + *(u32 *)loc = cpu_to_le32(SCS_PUSH); + break; + case AUTIASP: + *(u32 *)loc = cpu_to_le32(SCS_POP); + break; + default: + /* + * While the DW_CFA_negate_ra_state directive is guaranteed to + * appear right after a PACIASP/AUTIASP instruction, it may + * also appear after a DW_CFA_restore_state directive that + * restores a state that is only partially accurate, and is + * followed by DW_CFA_negate_ra_state directive to toggle the + * PAC bit again. So we permit other instructions here, and ignore + * them. + */ + return; + } + dcache_clean_pou(loc, loc + sizeof(u32)); +} + +/* + * Skip one uleb128/sleb128 encoded quantity from the opcode stream. All bytes + * except the last one have bit #7 set. + */ +static int __always_inline skip_xleb128(const u8 **opcode, int size) +{ + u8 c; + + do { + c = *(*opcode)++; + size--; + } while (c & BIT(7)); + + return size; +} + +struct eh_frame { + /* + * The size of this frame if 0 < size < U32_MAX, 0 terminates the list. + */ + u32 size; + + /* + * The first frame is a Common Information Entry (CIE) frame, followed + * by one or more Frame Description Entry (FDE) frames. In the former + * case, this field is 0, otherwise it is the negated offset relative + * to the associated CIE frame. + */ + u32 cie_id_or_pointer; + + union { + struct { // CIE + u8 version; + u8 augmentation_string[]; + }; + + struct { // FDE + s32 initial_loc; + s32 range; + u8 opcodes[]; + }; + }; +}; + +static int noinstr scs_handle_fde_frame(const struct eh_frame *frame, + bool fde_has_augmentation_data, + int code_alignment_factor, + bool dry_run) +{ + int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; + u64 loc = (u64)offset_to_ptr(&frame->initial_loc); + const u8 *opcode = frame->opcodes; + + if (fde_has_augmentation_data) { + int l; + + // assume single byte uleb128_t + if (WARN_ON(*opcode & BIT(7))) + return -ENOEXEC; + + l = *opcode++; + opcode += l; + size -= l + 1; + } + + /* + * Starting from 'loc', apply the CFA opcodes that advance the location + * pointer, and identify the locations of the PAC instructions. + */ + while (size-- > 0) { + switch (*opcode++) { + case DW_CFA_nop: + case DW_CFA_remember_state: + case DW_CFA_restore_state: + break; + + case DW_CFA_advance_loc1: + loc += *opcode++ * code_alignment_factor; + size--; + break; + + case DW_CFA_advance_loc2: + loc += *opcode++ * code_alignment_factor; + loc += (*opcode++ << 8) * code_alignment_factor; + size -= 2; + break; + + case DW_CFA_def_cfa: + case DW_CFA_offset_extended: + size = skip_xleb128(&opcode, size); + fallthrough; + case DW_CFA_def_cfa_offset: + case DW_CFA_def_cfa_offset_sf: + case DW_CFA_def_cfa_register: + case DW_CFA_same_value: + case DW_CFA_restore_extended: + case 0x80 ... 0xbf: + size = skip_xleb128(&opcode, size); + break; + + case DW_CFA_negate_ra_state: + if (!dry_run) + scs_patch_loc(loc - 4); + break; + + case 0x40 ... 0x7f: + // advance loc + loc += (opcode[-1] & 0x3f) * code_alignment_factor; + break; + + case 0xc0 ... 0xff: + break; + + default: + pr_err("unhandled opcode: %02x in FDE frame %lx\n", opcode[-1], (uintptr_t)frame); + return -ENOEXEC; + } + } + return 0; +} + +int noinstr scs_patch(const u8 eh_frame[], int size) +{ + const u8 *p = eh_frame; + + while (size > 4) { + const struct eh_frame *frame = (const void *)p; + bool fde_has_augmentation_data = true; + int code_alignment_factor = 1; + int ret; + + if (frame->size == 0 || + frame->size == U32_MAX || + frame->size > size) + break; + + if (frame->cie_id_or_pointer == 0) { + const u8 *p = frame->augmentation_string; + + /* a 'z' in the augmentation string must come first */ + fde_has_augmentation_data = *p == 'z'; + + /* + * The code alignment factor is a uleb128 encoded field + * but given that the only sensible values are 1 or 4, + * there is no point in decoding the whole thing. + */ + p += strlen(p) + 1; + if (!WARN_ON(*p & BIT(7))) + code_alignment_factor = *p; + } else { + ret = scs_handle_fde_frame(frame, + fde_has_augmentation_data, + code_alignment_factor, + true); + if (ret) + return ret; + scs_handle_fde_frame(frame, fde_has_augmentation_data, + code_alignment_factor, false); + } + + p += sizeof(frame->size) + frame->size; + size -= sizeof(frame->size) + frame->size; + } + return 0; +} + +asmlinkage void __init scs_patch_vmlinux(void) +{ + if (!should_patch_pac_into_scs()) + return; + + WARN_ON(scs_patch(__eh_frame_start, __eh_frame_end - __eh_frame_start)); + icache_inval_all_pou(); + isb(); +} diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c new file mode 100644 index 000000000000..b4835f6d594b --- /dev/null +++ b/arch/arm64/kernel/patching.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/stop_machine.h> +#include <linux/uaccess.h> + +#include <asm/cacheflush.h> +#include <asm/fixmap.h> +#include <asm/insn.h> +#include <asm/kprobes.h> +#include <asm/patching.h> +#include <asm/sections.h> + +static DEFINE_RAW_SPINLOCK(patch_lock); + +static bool is_exit_text(unsigned long addr) +{ + /* discarded with init text/data */ + return system_state < SYSTEM_RUNNING && + addr >= (unsigned long)__exittext_begin && + addr < (unsigned long)__exittext_end; +} + +static bool is_image_text(unsigned long addr) +{ + return core_kernel_text(addr) || is_exit_text(addr); +} + +static void __kprobes *patch_map(void *addr, int fixmap) +{ + unsigned long uintaddr = (uintptr_t) addr; + bool image = is_image_text(uintaddr); + struct page *page; + + if (image) + page = phys_to_page(__pa_symbol(addr)); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + page = vmalloc_to_page(addr); + else + return addr; + + BUG_ON(!page); + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + + (uintaddr & ~PAGE_MASK)); +} + +static void __kprobes patch_unmap(int fixmap) +{ + clear_fixmap(fixmap); +} +/* + * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always + * little-endian. + */ +int __kprobes aarch64_insn_read(void *addr, u32 *insnp) +{ + int ret; + __le32 val; + + ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE); + if (!ret) + *insnp = le32_to_cpu(val); + + return ret; +} + +static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) +{ + void *waddr = addr; + unsigned long flags = 0; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + waddr = patch_map(addr, FIX_TEXT_POKE0); + + ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); + + patch_unmap(FIX_TEXT_POKE0); + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} + +int __kprobes aarch64_insn_write(void *addr, u32 insn) +{ + return __aarch64_insn_write(addr, cpu_to_le32(insn)); +} + +noinstr int aarch64_insn_write_literal_u64(void *addr, u64 val) +{ + u64 *waddr; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + waddr = patch_map(addr, FIX_TEXT_POKE0); + + ret = copy_to_kernel_nofault(waddr, &val, sizeof(val)); + + patch_unmap(FIX_TEXT_POKE0); + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} + +int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) +{ + u32 *tp = addr; + int ret; + + /* A64 instructions must be word aligned */ + if ((uintptr_t)tp & 0x3) + return -EINVAL; + + ret = aarch64_insn_write(tp, insn); + if (ret == 0) + caches_clean_inval_pou((uintptr_t)tp, + (uintptr_t)tp + AARCH64_INSN_SIZE); + + return ret; +} + +struct aarch64_insn_patch { + void **text_addrs; + u32 *new_insns; + int insn_cnt; + atomic_t cpu_count; +}; + +static int __kprobes aarch64_insn_patch_text_cb(void *arg) +{ + int i, ret = 0; + struct aarch64_insn_patch *pp = arg; + + /* The last CPU becomes master */ + if (atomic_inc_return(&pp->cpu_count) == num_online_cpus()) { + for (i = 0; ret == 0 && i < pp->insn_cnt; i++) + ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], + pp->new_insns[i]); + /* Notify other processors with an additional increment. */ + atomic_inc(&pp->cpu_count); + } else { + while (atomic_read(&pp->cpu_count) <= num_online_cpus()) + cpu_relax(); + isb(); + } + + return ret; +} + +int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) +{ + struct aarch64_insn_patch patch = { + .text_addrs = addrs, + .new_insns = insns, + .insn_cnt = cnt, + .cpu_count = ATOMIC_INIT(0), + }; + + if (cnt <= 0) + return -EINVAL; + + return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, + cpu_online_mask); +} diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 570988c7a7ff..f872c57e9909 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -11,8 +11,6 @@ #include <linux/io.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/of_pci.h> -#include <linux/of_platform.h> #include <linux/pci.h> #include <linux/pci-acpi.h> #include <linux/pci-ecam.h> @@ -82,14 +80,29 @@ int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { - if (!acpi_disabled) { - struct pci_config_window *cfg = bridge->bus->sysdata; - struct acpi_device *adev = to_acpi_device(cfg->parent); - struct device *bus_dev = &bridge->bus->dev; + struct pci_config_window *cfg; + struct acpi_device *adev; + struct device *bus_dev; - ACPI_COMPANION_SET(&bridge->dev, adev); - set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); - } + if (acpi_disabled) + return 0; + + cfg = bridge->bus->sysdata; + + /* + * On Hyper-V there is no corresponding ACPI device for a root bridge, + * therefore ->parent is set as NULL by the driver. And set 'adev' as + * NULL in this case because there is no proper ACPI device. + */ + if (!cfg->parent) + adev = NULL; + else + adev = to_acpi_device(cfg->parent); + + bus_dev = &bridge->bus->dev; + + ACPI_COMPANION_SET(&bridge->dev, adev); + set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); return 0; } @@ -117,7 +130,7 @@ pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) struct device *dev = &root->device->dev; struct resource *bus_res = &root->secondary; u16 seg = root->segment; - struct pci_ecam_ops *ecam_ops; + const struct pci_ecam_ops *ecam_ops; struct resource cfgres; struct acpi_device *adev; struct pci_config_window *cfg; @@ -185,7 +198,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) root_ops->release_info = pci_acpi_generic_release_info; root_ops->prepare_resources = pci_acpi_root_prepare_resources; - root_ops->pci_ops = &ri->cfg->ops->pci_ops; + root_ops->pci_ops = (struct pci_ops *)&ri->cfg->ops->pci_ops; bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); if (!bus) return NULL; diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index b0e03e052dd1..6d157f32187b 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -5,10 +5,10 @@ * Copyright (C) 2015 ARM Limited */ #include <linux/perf_event.h> +#include <linux/stacktrace.h> #include <linux/uaccess.h> #include <asm/pointer_auth.h> -#include <asm/stacktrace.h> struct frame_tail { struct frame_tail __user *fp; @@ -38,7 +38,7 @@ user_backtrace(struct frame_tail __user *tail, if (err) return NULL; - lr = ptrauth_strip_insn_pac(buftail.lr); + lr = ptrauth_strip_user_insn_pac(buftail.lr); perf_callchain_store(entry, lr); @@ -102,7 +102,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* We don't support guest os callchain now */ return; } @@ -116,7 +116,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, tail = (struct frame_tail __user *)regs->regs[29]; while (entry->nr < entry->max_stack && - tail && !((unsigned long)tail & 0xf)) + tail && !((unsigned long)tail & 0x7)) tail = user_backtrace(tail, entry); } else { #ifdef CONFIG_COMPAT @@ -132,46 +132,38 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, } } -/* - * Gets called by walk_stackframe() for every stackframe. This will be called - * whist unwinding the stackframe and is like a subroutine return so we use - * the PC. - */ -static int callchain_trace(struct stackframe *frame, void *data) +static bool callchain_trace(void *data, unsigned long pc) { struct perf_callchain_entry_ctx *entry = data; - perf_callchain_store(entry, frame->pc); - return 0; + return perf_callchain_store(entry, pc) == 0; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct stackframe frame; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* We don't support guest os callchain now */ return; } - start_backtrace(&frame, regs->regs[29], regs->pc); - walk_stackframe(current, &frame, callchain_trace, entry); + arch_stack_walk(callchain_trace, entry, current, regs); } unsigned long perf_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + if (perf_guest_state()) + return perf_guest_get_ip(); return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + unsigned int guest_state = perf_guest_state(); int misc = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_state) { + if (guest_state & PERF_GUEST_USER) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c deleted file mode 100644 index e40b65645c86..000000000000 --- a/arch/arm64/kernel/perf_event.c +++ /dev/null @@ -1,1168 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * ARMv8 PMUv3 Performance Events handling code. - * - * Copyright (C) 2012 ARM Limited - * Author: Will Deacon <will.deacon@arm.com> - * - * This code is based heavily on the ARMv7 perf event code. - */ - -#include <asm/irq_regs.h> -#include <asm/perf_event.h> -#include <asm/sysreg.h> -#include <asm/virt.h> - -#include <linux/acpi.h> -#include <linux/clocksource.h> -#include <linux/kvm_host.h> -#include <linux/of.h> -#include <linux/perf/arm_pmu.h> -#include <linux/platform_device.h> -#include <linux/smp.h> - -/* ARMv8 Cortex-A53 specific event types. */ -#define ARMV8_A53_PERFCTR_PREF_LINEFILL 0xC2 - -/* ARMv8 Cavium ThunderX specific event types. */ -#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_MISS_ST 0xE9 -#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS 0xEA -#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS 0xEB -#define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS 0xEC -#define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS 0xED - -/* - * ARMv8 Architectural defined events, not all of these may - * be supported on any given implementation. Unsupported events will - * be disabled at run-time based on the PMCEID registers. - */ -static const unsigned armv8_pmuv3_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = ARMV8_PMUV3_PERFCTR_STALL_FRONTEND, - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, -}; - -static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL, - [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_TLB, - - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - [C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB, - - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, -}; - -static const unsigned armv8_a53_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_A53_PERFCTR_PREF_LINEFILL, - - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, - [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, -}; - -static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, - [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR, - - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, - [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, - [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, -}; - -static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, -}; - -static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, - [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_MISS_ST, - [C(L1D)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS, - [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS, - - [C(L1I)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS, - [C(L1I)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS, - - [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD, - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, - [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR, - [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, -}; - -static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, - [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR, - - [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD, - [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR, - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, - [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, - [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, -}; - -static ssize_t -armv8pmu_events_sysfs_show(struct device *dev, - struct device_attribute *attr, char *page) -{ - struct perf_pmu_events_attr *pmu_attr; - - pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); - - return sprintf(page, "event=0x%03llx\n", pmu_attr->id); -} - -#define ARMV8_EVENT_ATTR(name, config) \ - (&((struct perf_pmu_events_attr) { \ - .attr = __ATTR(name, 0444, armv8pmu_events_sysfs_show, NULL), \ - .id = config, \ - }).attr.attr) - -static struct attribute *armv8_pmuv3_event_attrs[] = { - ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR), - ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL), - ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL), - ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL), - ARMV8_EVENT_ATTR(l1d_cache, ARMV8_PMUV3_PERFCTR_L1D_CACHE), - ARMV8_EVENT_ATTR(l1d_tlb_refill, ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL), - ARMV8_EVENT_ATTR(ld_retired, ARMV8_PMUV3_PERFCTR_LD_RETIRED), - ARMV8_EVENT_ATTR(st_retired, ARMV8_PMUV3_PERFCTR_ST_RETIRED), - ARMV8_EVENT_ATTR(inst_retired, ARMV8_PMUV3_PERFCTR_INST_RETIRED), - ARMV8_EVENT_ATTR(exc_taken, ARMV8_PMUV3_PERFCTR_EXC_TAKEN), - ARMV8_EVENT_ATTR(exc_return, ARMV8_PMUV3_PERFCTR_EXC_RETURN), - ARMV8_EVENT_ATTR(cid_write_retired, ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED), - ARMV8_EVENT_ATTR(pc_write_retired, ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED), - ARMV8_EVENT_ATTR(br_immed_retired, ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED), - ARMV8_EVENT_ATTR(br_return_retired, ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED), - ARMV8_EVENT_ATTR(unaligned_ldst_retired, ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED), - ARMV8_EVENT_ATTR(br_mis_pred, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED), - ARMV8_EVENT_ATTR(cpu_cycles, ARMV8_PMUV3_PERFCTR_CPU_CYCLES), - ARMV8_EVENT_ATTR(br_pred, ARMV8_PMUV3_PERFCTR_BR_PRED), - ARMV8_EVENT_ATTR(mem_access, ARMV8_PMUV3_PERFCTR_MEM_ACCESS), - ARMV8_EVENT_ATTR(l1i_cache, ARMV8_PMUV3_PERFCTR_L1I_CACHE), - ARMV8_EVENT_ATTR(l1d_cache_wb, ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB), - ARMV8_EVENT_ATTR(l2d_cache, ARMV8_PMUV3_PERFCTR_L2D_CACHE), - ARMV8_EVENT_ATTR(l2d_cache_refill, ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL), - ARMV8_EVENT_ATTR(l2d_cache_wb, ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB), - ARMV8_EVENT_ATTR(bus_access, ARMV8_PMUV3_PERFCTR_BUS_ACCESS), - ARMV8_EVENT_ATTR(memory_error, ARMV8_PMUV3_PERFCTR_MEMORY_ERROR), - ARMV8_EVENT_ATTR(inst_spec, ARMV8_PMUV3_PERFCTR_INST_SPEC), - ARMV8_EVENT_ATTR(ttbr_write_retired, ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED), - ARMV8_EVENT_ATTR(bus_cycles, ARMV8_PMUV3_PERFCTR_BUS_CYCLES), - /* Don't expose the chain event in /sys, since it's useless in isolation */ - ARMV8_EVENT_ATTR(l1d_cache_allocate, ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE), - ARMV8_EVENT_ATTR(l2d_cache_allocate, ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE), - ARMV8_EVENT_ATTR(br_retired, ARMV8_PMUV3_PERFCTR_BR_RETIRED), - ARMV8_EVENT_ATTR(br_mis_pred_retired, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED), - ARMV8_EVENT_ATTR(stall_frontend, ARMV8_PMUV3_PERFCTR_STALL_FRONTEND), - ARMV8_EVENT_ATTR(stall_backend, ARMV8_PMUV3_PERFCTR_STALL_BACKEND), - ARMV8_EVENT_ATTR(l1d_tlb, ARMV8_PMUV3_PERFCTR_L1D_TLB), - ARMV8_EVENT_ATTR(l1i_tlb, ARMV8_PMUV3_PERFCTR_L1I_TLB), - ARMV8_EVENT_ATTR(l2i_cache, ARMV8_PMUV3_PERFCTR_L2I_CACHE), - ARMV8_EVENT_ATTR(l2i_cache_refill, ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL), - ARMV8_EVENT_ATTR(l3d_cache_allocate, ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE), - ARMV8_EVENT_ATTR(l3d_cache_refill, ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL), - ARMV8_EVENT_ATTR(l3d_cache, ARMV8_PMUV3_PERFCTR_L3D_CACHE), - ARMV8_EVENT_ATTR(l3d_cache_wb, ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB), - ARMV8_EVENT_ATTR(l2d_tlb_refill, ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL), - ARMV8_EVENT_ATTR(l2i_tlb_refill, ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL), - ARMV8_EVENT_ATTR(l2d_tlb, ARMV8_PMUV3_PERFCTR_L2D_TLB), - ARMV8_EVENT_ATTR(l2i_tlb, ARMV8_PMUV3_PERFCTR_L2I_TLB), - ARMV8_EVENT_ATTR(remote_access, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS), - ARMV8_EVENT_ATTR(ll_cache, ARMV8_PMUV3_PERFCTR_LL_CACHE), - ARMV8_EVENT_ATTR(ll_cache_miss, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS), - ARMV8_EVENT_ATTR(dtlb_walk, ARMV8_PMUV3_PERFCTR_DTLB_WALK), - ARMV8_EVENT_ATTR(itlb_walk, ARMV8_PMUV3_PERFCTR_ITLB_WALK), - ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD), - ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD), - ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD), - ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP), - ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED), - ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE), - ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION), - NULL, -}; - -static umode_t -armv8pmu_event_attr_is_visible(struct kobject *kobj, - struct attribute *attr, int unused) -{ - struct device *dev = kobj_to_dev(kobj); - struct pmu *pmu = dev_get_drvdata(dev); - struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); - struct perf_pmu_events_attr *pmu_attr; - - pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); - - if (pmu_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS && - test_bit(pmu_attr->id, cpu_pmu->pmceid_bitmap)) - return attr->mode; - - pmu_attr->id -= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE; - if (pmu_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS && - test_bit(pmu_attr->id, cpu_pmu->pmceid_ext_bitmap)) - return attr->mode; - - return 0; -} - -static struct attribute_group armv8_pmuv3_events_attr_group = { - .name = "events", - .attrs = armv8_pmuv3_event_attrs, - .is_visible = armv8pmu_event_attr_is_visible, -}; - -PMU_FORMAT_ATTR(event, "config:0-15"); -PMU_FORMAT_ATTR(long, "config1:0"); - -static inline bool armv8pmu_event_is_64bit(struct perf_event *event) -{ - return event->attr.config1 & 0x1; -} - -static struct attribute *armv8_pmuv3_format_attrs[] = { - &format_attr_event.attr, - &format_attr_long.attr, - NULL, -}; - -static struct attribute_group armv8_pmuv3_format_attr_group = { - .name = "format", - .attrs = armv8_pmuv3_format_attrs, -}; - -/* - * Perf Events' indices - */ -#define ARMV8_IDX_CYCLE_COUNTER 0 -#define ARMV8_IDX_COUNTER0 1 -#define ARMV8_IDX_COUNTER_LAST(cpu_pmu) \ - (ARMV8_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1) - -/* - * We must chain two programmable counters for 64 bit events, - * except when we have allocated the 64bit cycle counter (for CPU - * cycles event). This must be called only when the event has - * a counter allocated. - */ -static inline bool armv8pmu_event_is_chained(struct perf_event *event) -{ - int idx = event->hw.idx; - - return !WARN_ON(idx < 0) && - armv8pmu_event_is_64bit(event) && - (idx != ARMV8_IDX_CYCLE_COUNTER); -} - -/* - * ARMv8 low level PMU access - */ - -/* - * Perf Event to low level counters mapping - */ -#define ARMV8_IDX_TO_COUNTER(x) \ - (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) - -static inline u32 armv8pmu_pmcr_read(void) -{ - return read_sysreg(pmcr_el0); -} - -static inline void armv8pmu_pmcr_write(u32 val) -{ - val &= ARMV8_PMU_PMCR_MASK; - isb(); - write_sysreg(val, pmcr_el0); -} - -static inline int armv8pmu_has_overflowed(u32 pmovsr) -{ - return pmovsr & ARMV8_PMU_OVERFLOWED_MASK; -} - -static inline int armv8pmu_counter_valid(struct arm_pmu *cpu_pmu, int idx) -{ - return idx >= ARMV8_IDX_CYCLE_COUNTER && - idx <= ARMV8_IDX_COUNTER_LAST(cpu_pmu); -} - -static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) -{ - return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); -} - -static inline void armv8pmu_select_counter(int idx) -{ - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(counter, pmselr_el0); - isb(); -} - -static inline u32 armv8pmu_read_evcntr(int idx) -{ - armv8pmu_select_counter(idx); - return read_sysreg(pmxevcntr_el0); -} - -static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) -{ - int idx = event->hw.idx; - u64 val = 0; - - val = armv8pmu_read_evcntr(idx); - if (armv8pmu_event_is_chained(event)) - val = (val << 32) | armv8pmu_read_evcntr(idx - 1); - return val; -} - -static u64 armv8pmu_read_counter(struct perf_event *event) -{ - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - u64 value = 0; - - if (!armv8pmu_counter_valid(cpu_pmu, idx)) - pr_err("CPU%u reading wrong counter %d\n", - smp_processor_id(), idx); - else if (idx == ARMV8_IDX_CYCLE_COUNTER) - value = read_sysreg(pmccntr_el0); - else - value = armv8pmu_read_hw_counter(event); - - return value; -} - -static inline void armv8pmu_write_evcntr(int idx, u32 value) -{ - armv8pmu_select_counter(idx); - write_sysreg(value, pmxevcntr_el0); -} - -static inline void armv8pmu_write_hw_counter(struct perf_event *event, - u64 value) -{ - int idx = event->hw.idx; - - if (armv8pmu_event_is_chained(event)) { - armv8pmu_write_evcntr(idx, upper_32_bits(value)); - armv8pmu_write_evcntr(idx - 1, lower_32_bits(value)); - } else { - armv8pmu_write_evcntr(idx, value); - } -} - -static void armv8pmu_write_counter(struct perf_event *event, u64 value) -{ - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - if (!armv8pmu_counter_valid(cpu_pmu, idx)) - pr_err("CPU%u writing wrong counter %d\n", - smp_processor_id(), idx); - else if (idx == ARMV8_IDX_CYCLE_COUNTER) { - /* - * The cycles counter is really a 64-bit counter. - * When treating it as a 32-bit counter, we only count - * the lower 32 bits, and set the upper 32-bits so that - * we get an interrupt upon 32-bit overflow. - */ - if (!armv8pmu_event_is_64bit(event)) - value |= 0xffffffff00000000ULL; - write_sysreg(value, pmccntr_el0); - } else - armv8pmu_write_hw_counter(event, value); -} - -static inline void armv8pmu_write_evtype(int idx, u32 val) -{ - armv8pmu_select_counter(idx); - val &= ARMV8_PMU_EVTYPE_MASK; - write_sysreg(val, pmxevtyper_el0); -} - -static inline void armv8pmu_write_event_type(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - /* - * For chained events, the low counter is programmed to count - * the event of interest and the high counter is programmed - * with CHAIN event code with filters set to count at all ELs. - */ - if (armv8pmu_event_is_chained(event)) { - u32 chain_evt = ARMV8_PMUV3_PERFCTR_CHAIN | - ARMV8_PMU_INCLUDE_EL2; - - armv8pmu_write_evtype(idx - 1, hwc->config_base); - armv8pmu_write_evtype(idx, chain_evt); - } else { - armv8pmu_write_evtype(idx, hwc->config_base); - } -} - -static inline int armv8pmu_enable_counter(int idx) -{ - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmcntenset_el0); - return idx; -} - -static inline void armv8pmu_enable_event_counter(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - int idx = event->hw.idx; - u32 counter_bits = BIT(ARMV8_IDX_TO_COUNTER(idx)); - - if (armv8pmu_event_is_chained(event)) - counter_bits |= BIT(ARMV8_IDX_TO_COUNTER(idx - 1)); - - kvm_set_pmu_events(counter_bits, attr); - - /* We rely on the hypervisor switch code to enable guest counters */ - if (!kvm_pmu_counter_deferred(attr)) { - armv8pmu_enable_counter(idx); - if (armv8pmu_event_is_chained(event)) - armv8pmu_enable_counter(idx - 1); - } -} - -static inline int armv8pmu_disable_counter(int idx) -{ - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmcntenclr_el0); - return idx; -} - -static inline void armv8pmu_disable_event_counter(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_event_attr *attr = &event->attr; - int idx = hwc->idx; - u32 counter_bits = BIT(ARMV8_IDX_TO_COUNTER(idx)); - - if (armv8pmu_event_is_chained(event)) - counter_bits |= BIT(ARMV8_IDX_TO_COUNTER(idx - 1)); - - kvm_clr_pmu_events(counter_bits); - - /* We rely on the hypervisor switch code to disable guest counters */ - if (!kvm_pmu_counter_deferred(attr)) { - if (armv8pmu_event_is_chained(event)) - armv8pmu_disable_counter(idx - 1); - armv8pmu_disable_counter(idx); - } -} - -static inline int armv8pmu_enable_intens(int idx) -{ - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmintenset_el1); - return idx; -} - -static inline int armv8pmu_enable_event_irq(struct perf_event *event) -{ - return armv8pmu_enable_intens(event->hw.idx); -} - -static inline int armv8pmu_disable_intens(int idx) -{ - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmintenclr_el1); - isb(); - /* Clear the overflow flag in case an interrupt is pending. */ - write_sysreg(BIT(counter), pmovsclr_el0); - isb(); - - return idx; -} - -static inline int armv8pmu_disable_event_irq(struct perf_event *event) -{ - return armv8pmu_disable_intens(event->hw.idx); -} - -static inline u32 armv8pmu_getreset_flags(void) -{ - u32 value; - - /* Read */ - value = read_sysreg(pmovsclr_el0); - - /* Write to clear flags */ - value &= ARMV8_PMU_OVSR_MASK; - write_sysreg(value, pmovsclr_el0); - - return value; -} - -static void armv8pmu_enable_event(struct perf_event *event) -{ - unsigned long flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - /* - * Enable counter and interrupt, and set the counter to count - * the event that we're interested in. - */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); - - /* - * Disable counter - */ - armv8pmu_disable_event_counter(event); - - /* - * Set event (if destined for PMNx counters). - */ - armv8pmu_write_event_type(event); - - /* - * Enable interrupt for this counter - */ - armv8pmu_enable_event_irq(event); - - /* - * Enable counter - */ - armv8pmu_enable_event_counter(event); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); -} - -static void armv8pmu_disable_event(struct perf_event *event) -{ - unsigned long flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - /* - * Disable counter and interrupt - */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); - - /* - * Disable counter - */ - armv8pmu_disable_event_counter(event); - - /* - * Disable interrupt for this counter - */ - armv8pmu_disable_event_irq(event); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); -} - -static void armv8pmu_start(struct arm_pmu *cpu_pmu) -{ - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* Enable all counters */ - armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); -} - -static void armv8pmu_stop(struct arm_pmu *cpu_pmu) -{ - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* Disable all counters */ - armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); -} - -static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) -{ - u32 pmovsr; - struct perf_sample_data data; - struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events); - struct pt_regs *regs; - int idx; - - /* - * Get and reset the IRQ flags - */ - pmovsr = armv8pmu_getreset_flags(); - - /* - * Did an overflow occur? - */ - if (!armv8pmu_has_overflowed(pmovsr)) - return IRQ_NONE; - - /* - * Handle the counter(s) overflow(s) - */ - regs = get_irq_regs(); - - /* - * Stop the PMU while processing the counter overflows - * to prevent skews in group events. - */ - armv8pmu_stop(cpu_pmu); - for (idx = 0; idx < cpu_pmu->num_events; ++idx) { - struct perf_event *event = cpuc->events[idx]; - struct hw_perf_event *hwc; - - /* Ignore if we don't have an event. */ - if (!event) - continue; - - /* - * We have a single interrupt for all counters. Check that - * each counter has overflowed before we process it. - */ - if (!armv8pmu_counter_has_overflowed(pmovsr, idx)) - continue; - - hwc = &event->hw; - armpmu_event_update(event); - perf_sample_data_init(&data, 0, hwc->last_period); - if (!armpmu_event_set_period(event)) - continue; - - if (perf_event_overflow(event, &data, regs)) - cpu_pmu->disable(event); - } - armv8pmu_start(cpu_pmu); - - /* - * Handle the pending perf events. - * - * Note: this call *must* be run with interrupts disabled. For - * platforms that can have the PMU interrupts raised as an NMI, this - * will not work. - */ - irq_work_run(); - - return IRQ_HANDLED; -} - -static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc, - struct arm_pmu *cpu_pmu) -{ - int idx; - - for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx ++) { - if (!test_and_set_bit(idx, cpuc->used_mask)) - return idx; - } - return -EAGAIN; -} - -static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, - struct arm_pmu *cpu_pmu) -{ - int idx; - - /* - * Chaining requires two consecutive event counters, where - * the lower idx must be even. - */ - for (idx = ARMV8_IDX_COUNTER0 + 1; idx < cpu_pmu->num_events; idx += 2) { - if (!test_and_set_bit(idx, cpuc->used_mask)) { - /* Check if the preceding even counter is available */ - if (!test_and_set_bit(idx - 1, cpuc->used_mask)) - return idx; - /* Release the Odd counter */ - clear_bit(idx, cpuc->used_mask); - } - } - return -EAGAIN; -} - -static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, - struct perf_event *event) -{ - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct hw_perf_event *hwc = &event->hw; - unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; - - /* Always prefer to place a cycle counter into the cycle counter. */ - if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) { - if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask)) - return ARMV8_IDX_CYCLE_COUNTER; - } - - /* - * Otherwise use events counters - */ - if (armv8pmu_event_is_64bit(event)) - return armv8pmu_get_chain_idx(cpuc, cpu_pmu); - else - return armv8pmu_get_single_idx(cpuc, cpu_pmu); -} - -static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc, - struct perf_event *event) -{ - int idx = event->hw.idx; - - clear_bit(idx, cpuc->used_mask); - if (armv8pmu_event_is_chained(event)) - clear_bit(idx - 1, cpuc->used_mask); -} - -/* - * Add an event filter to a given event. - */ -static int armv8pmu_set_event_filter(struct hw_perf_event *event, - struct perf_event_attr *attr) -{ - unsigned long config_base = 0; - - if (attr->exclude_idle) - return -EPERM; - - /* - * If we're running in hyp mode, then we *are* the hypervisor. - * Therefore we ignore exclude_hv in this configuration, since - * there's no hypervisor to sample anyway. This is consistent - * with other architectures (x86 and Power). - */ - if (is_kernel_in_hyp_mode()) { - if (!attr->exclude_kernel && !attr->exclude_host) - config_base |= ARMV8_PMU_INCLUDE_EL2; - if (attr->exclude_guest) - config_base |= ARMV8_PMU_EXCLUDE_EL1; - if (attr->exclude_host) - config_base |= ARMV8_PMU_EXCLUDE_EL0; - } else { - if (!attr->exclude_hv && !attr->exclude_host) - config_base |= ARMV8_PMU_INCLUDE_EL2; - } - - /* - * Filter out !VHE kernels and guest kernels - */ - if (attr->exclude_kernel) - config_base |= ARMV8_PMU_EXCLUDE_EL1; - - if (attr->exclude_user) - config_base |= ARMV8_PMU_EXCLUDE_EL0; - - /* - * Install the filter into config_base as this is used to - * construct the event type. - */ - event->config_base = config_base; - - return 0; -} - -static int armv8pmu_filter_match(struct perf_event *event) -{ - unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT; - return evtype != ARMV8_PMUV3_PERFCTR_CHAIN; -} - -static void armv8pmu_reset(void *info) -{ - struct arm_pmu *cpu_pmu = (struct arm_pmu *)info; - u32 idx, nb_cnt = cpu_pmu->num_events; - - /* The counter and interrupt enable registers are unknown at reset. */ - for (idx = ARMV8_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) { - armv8pmu_disable_counter(idx); - armv8pmu_disable_intens(idx); - } - - /* Clear the counters we flip at guest entry/exit */ - kvm_clr_pmu_events(U32_MAX); - - /* - * Initialize & Reset PMNC. Request overflow interrupt for - * 64 bit cycle counter but cheat in armv8pmu_write_counter(). - */ - armv8pmu_pmcr_write(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C | - ARMV8_PMU_PMCR_LC); -} - -static int __armv8_pmuv3_map_event(struct perf_event *event, - const unsigned (*extra_event_map) - [PERF_COUNT_HW_MAX], - const unsigned (*extra_cache_map) - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]) -{ - int hw_event_id; - struct arm_pmu *armpmu = to_arm_pmu(event->pmu); - - hw_event_id = armpmu_map_event(event, &armv8_pmuv3_perf_map, - &armv8_pmuv3_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); - - if (armv8pmu_event_is_64bit(event)) - event->hw.flags |= ARMPMU_EVT_64BIT; - - /* Only expose micro/arch events supported by this PMU */ - if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS) - && test_bit(hw_event_id, armpmu->pmceid_bitmap)) { - return hw_event_id; - } - - return armpmu_map_event(event, extra_event_map, extra_cache_map, - ARMV8_PMU_EVTYPE_EVENT); -} - -static int armv8_pmuv3_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, NULL); -} - -static int armv8_a53_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, &armv8_a53_perf_cache_map); -} - -static int armv8_a57_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, &armv8_a57_perf_cache_map); -} - -static int armv8_a73_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, &armv8_a73_perf_cache_map); -} - -static int armv8_thunder_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, - &armv8_thunder_perf_cache_map); -} - -static int armv8_vulcan_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, - &armv8_vulcan_perf_cache_map); -} - -struct armv8pmu_probe_info { - struct arm_pmu *pmu; - bool present; -}; - -static void __armv8pmu_probe_pmu(void *info) -{ - struct armv8pmu_probe_info *probe = info; - struct arm_pmu *cpu_pmu = probe->pmu; - u64 dfr0; - u64 pmceid_raw[2]; - u32 pmceid[2]; - int pmuver; - - dfr0 = read_sysreg(id_aa64dfr0_el1); - pmuver = cpuid_feature_extract_unsigned_field(dfr0, - ID_AA64DFR0_PMUVER_SHIFT); - if (pmuver == 0xf || pmuver == 0) - return; - - probe->present = true; - - /* Read the nb of CNTx counters supported from PMNC */ - cpu_pmu->num_events = (armv8pmu_pmcr_read() >> ARMV8_PMU_PMCR_N_SHIFT) - & ARMV8_PMU_PMCR_N_MASK; - - /* Add the CPU cycles counter */ - cpu_pmu->num_events += 1; - - pmceid[0] = pmceid_raw[0] = read_sysreg(pmceid0_el0); - pmceid[1] = pmceid_raw[1] = read_sysreg(pmceid1_el0); - - bitmap_from_arr32(cpu_pmu->pmceid_bitmap, - pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS); - - pmceid[0] = pmceid_raw[0] >> 32; - pmceid[1] = pmceid_raw[1] >> 32; - - bitmap_from_arr32(cpu_pmu->pmceid_ext_bitmap, - pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS); -} - -static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu) -{ - struct armv8pmu_probe_info probe = { - .pmu = cpu_pmu, - .present = false, - }; - int ret; - - ret = smp_call_function_any(&cpu_pmu->supported_cpus, - __armv8pmu_probe_pmu, - &probe, 1); - if (ret) - return ret; - - return probe.present ? 0 : -ENODEV; -} - -static int armv8_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8pmu_probe_pmu(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->handle_irq = armv8pmu_handle_irq; - cpu_pmu->enable = armv8pmu_enable_event; - cpu_pmu->disable = armv8pmu_disable_event; - cpu_pmu->read_counter = armv8pmu_read_counter; - cpu_pmu->write_counter = armv8pmu_write_counter; - cpu_pmu->get_event_idx = armv8pmu_get_event_idx; - cpu_pmu->clear_event_idx = armv8pmu_clear_event_idx; - cpu_pmu->start = armv8pmu_start; - cpu_pmu->stop = armv8pmu_stop; - cpu_pmu->reset = armv8pmu_reset; - cpu_pmu->set_event_filter = armv8pmu_set_event_filter; - cpu_pmu->filter_match = armv8pmu_filter_match; - - return 0; -} - -static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_pmuv3"; - cpu_pmu->map_event = armv8_pmuv3_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a35"; - cpu_pmu->map_event = armv8_a53_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a53"; - cpu_pmu->map_event = armv8_a53_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a57"; - cpu_pmu->map_event = armv8_a57_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a72"; - cpu_pmu->map_event = armv8_a57_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a73"; - cpu_pmu->map_event = armv8_a73_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cavium_thunder"; - cpu_pmu->map_event = armv8_thunder_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_brcm_vulcan"; - cpu_pmu->map_event = armv8_vulcan_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static const struct of_device_id armv8_pmu_of_device_ids[] = { - {.compatible = "arm,armv8-pmuv3", .data = armv8_pmuv3_init}, - {.compatible = "arm,cortex-a35-pmu", .data = armv8_a35_pmu_init}, - {.compatible = "arm,cortex-a53-pmu", .data = armv8_a53_pmu_init}, - {.compatible = "arm,cortex-a57-pmu", .data = armv8_a57_pmu_init}, - {.compatible = "arm,cortex-a72-pmu", .data = armv8_a72_pmu_init}, - {.compatible = "arm,cortex-a73-pmu", .data = armv8_a73_pmu_init}, - {.compatible = "cavium,thunder-pmu", .data = armv8_thunder_pmu_init}, - {.compatible = "brcm,vulcan-pmu", .data = armv8_vulcan_pmu_init}, - {}, -}; - -static int armv8_pmu_device_probe(struct platform_device *pdev) -{ - return arm_pmu_device_probe(pdev, armv8_pmu_of_device_ids, NULL); -} - -static struct platform_driver armv8_pmu_driver = { - .driver = { - .name = ARMV8_PMU_PDEV_NAME, - .of_match_table = armv8_pmu_of_device_ids, - .suppress_bind_attrs = true, - }, - .probe = armv8_pmu_device_probe, -}; - -static int __init armv8_pmu_driver_init(void) -{ - if (acpi_disabled) - return platform_driver_register(&armv8_pmu_driver); - else - return arm_pmu_acpi_probe(armv8_pmuv3_init); -} -device_initcall(armv8_pmu_driver_init) - -void arch_perf_update_userpage(struct perf_event *event, - struct perf_event_mmap_page *userpg, u64 now) -{ - u32 freq; - u32 shift; - - /* - * Internal timekeeping for enabled/running/stopped times - * is always computed with the sched_clock. - */ - freq = arch_timer_get_rate(); - userpg->cap_user_time = 1; - - clocks_calc_mult_shift(&userpg->time_mult, &shift, freq, - NSEC_PER_SEC, 0); - /* - * time_shift is not expected to be greater than 31 due to - * the original published conversion algorithm shifting a - * 32-bit value (now specifies a 64-bit value) - refer - * perf_event_mmap_page documentation in perf_event.h. - */ - if (shift == 32) { - shift = 31; - userpg->time_mult >>= 1; - } - userpg->time_shift = (u16)shift; - userpg->time_offset = -now; -} diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index 0bbac612146e..b4eece3eb17d 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -9,21 +9,58 @@ #include <asm/perf_regs.h> #include <asm/ptrace.h> +static u64 perf_ext_regs_value(int idx) +{ + switch (idx) { + case PERF_REG_ARM64_VG: + if (WARN_ON_ONCE(!system_supports_sve())) + return 0; + + /* + * Vector granule is current length in bits of SVE registers + * divided by 64. + */ + return (task_get_sve_vl(current) * 8) / 64; + default: + WARN_ON_ONCE(true); + return 0; + } +} + u64 perf_reg_value(struct pt_regs *regs, int idx) { - if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_MAX)) + if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_EXTENDED_MAX)) return 0; /* - * Compat (i.e. 32 bit) mode: - * - PC has been set in the pt_regs struct in kernel_entry, - * - Handle SP and LR here. + * Our handling of compat tasks (PERF_SAMPLE_REGS_ABI_32) is weird, but + * we're stuck with it for ABI compatibility reasons. + * + * For a 32-bit consumer inspecting a 32-bit task, then it will look at + * the first 16 registers (see arch/arm/include/uapi/asm/perf_regs.h). + * These correspond directly to a prefix of the registers saved in our + * 'struct pt_regs', with the exception of the PC, so we copy that down + * (x15 corresponds to SP_hyp in the architecture). + * + * So far, so good. + * + * The oddity arises when a 64-bit consumer looks at a 32-bit task and + * asks for registers beyond PERF_REG_ARM_MAX. In this case, we return + * SP_usr, LR_usr and PC in the positions where the AArch64 SP, LR and + * PC registers would normally live. The initial idea was to allow a + * 64-bit unwinder to unwind a 32-bit task and, although it's not clear + * how well that works in practice, somebody might be relying on it. + * + * At the time we make a sample, we don't know whether the consumer is + * 32-bit or 64-bit, so we have to cater for both possibilities. */ if (compat_user_mode(regs)) { if ((u32)idx == PERF_REG_ARM64_SP) return regs->compat_sp; if ((u32)idx == PERF_REG_ARM64_LR) return regs->compat_lr; + if (idx == 15) + return regs->pc; } if ((u32)idx == PERF_REG_ARM64_SP) @@ -32,6 +69,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) if ((u32)idx == PERF_REG_ARM64_PC) return regs->pc; + if ((u32)idx >= PERF_REG_ARM64_MAX) + return perf_ext_regs_value(idx); + return regs->regs[idx]; } @@ -39,7 +79,12 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) + u64 reserved_mask = REG_RESERVED; + + if (system_supports_sve()) + reserved_mask &= ~(1ULL << PERF_REG_ARM64_VG); + + if (!mask || mask & reserved_mask) return -EINVAL; return 0; @@ -54,8 +99,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile new file mode 100644 index 000000000000..c844a0546d7f --- /dev/null +++ b/arch/arm64/kernel/pi/Makefile @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright 2022 Google LLC + +KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ + -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \ + $(DISABLE_LATENT_ENTROPY_PLUGIN) \ + $(call cc-option,-mbranch-protection=none) \ + -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \ + -include $(srctree)/include/linux/hidden.h \ + -D__DISABLE_EXPORTS -ffreestanding -D__NO_FORTIFY \ + -fno-asynchronous-unwind-tables -fno-unwind-tables \ + $(call cc-option,-fno-addrsig) + +# remove SCS flags from all objects in this directory +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) +# disable LTO +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS)) + +GCOV_PROFILE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \ + --remove-section=.note.gnu.property \ + --prefix-alloc-sections=.init +$(obj)/%.pi.o: $(obj)/%.o FORCE + $(call if_changed,objcopy) + +$(obj)/lib-%.o: $(srctree)/lib/%.c FORCE + $(call if_changed_rule,cc_o_c) + +obj-y := kaslr_early.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o +extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c new file mode 100644 index 000000000000..17bff6e399e4 --- /dev/null +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2022 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +// NOTE: code in this file runs *very* early, and is not permitted to use +// global variables or anything that relies on absolute addressing. + +#include <linux/libfdt.h> +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/sizes.h> +#include <linux/string.h> + +#include <asm/archrandom.h> +#include <asm/memory.h> + +/* taken from lib/string.c */ +static char *__strstr(const char *s1, const char *s2) +{ + size_t l1, l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + l1 = strlen(s1); + while (l1 >= l2) { + l1--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} +static bool cmdline_contains_nokaslr(const u8 *cmdline) +{ + const u8 *str; + + str = __strstr(cmdline, "nokaslr"); + return str == cmdline || (str > cmdline && *(str - 1) == ' '); +} + +static bool is_kaslr_disabled_cmdline(void *fdt) +{ + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { + int node; + const u8 *prop; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + goto out; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + goto out; + + if (cmdline_contains_nokaslr(prop)) + return true; + + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) + goto out; + + return false; + } +out: + return cmdline_contains_nokaslr(CONFIG_CMDLINE); +} + +static u64 get_kaslr_seed(void *fdt) +{ + int node, len; + fdt64_t *prop; + u64 ret; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +asmlinkage u64 kaslr_early_init(void *fdt) +{ + u64 seed; + + if (is_kaslr_disabled_cmdline(fdt)) + return 0; + + seed = get_kaslr_seed(fdt); + if (!seed) { + if (!__early_cpu_has_rndr() || + !__arm64_rndr((unsigned long *)&seed)) + return 0; + } + + /* + * OK, so we are proceeding with KASLR enabled. Calculate a suitable + * kernel image offset from the seed. Let's place the kernel in the + * middle half of the VMALLOC area (VA_BITS_MIN - 2), and stay clear of + * the lower and upper quarters to avoid colliding with other + * allocations. + */ + return BIT(VA_BITS_MIN - 3) + (seed & GENMASK(VA_BITS_MIN - 3, 0)); +} diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c index c507b584259d..2708b620b4ae 100644 --- a/arch/arm64/kernel/pointer_auth.c +++ b/arch/arm64/kernel/pointer_auth.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/compat.h> #include <linux/errno.h> #include <linux/prctl.h> #include <linux/random.h> @@ -9,7 +10,7 @@ int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg) { - struct ptrauth_keys *keys = &tsk->thread.keys_user; + struct ptrauth_keys_user *keys = &tsk->thread.keys_user; unsigned long addr_key_mask = PR_PAC_APIAKEY | PR_PAC_APIBKEY | PR_PAC_APDAKEY | PR_PAC_APDBKEY; unsigned long key_mask = addr_key_mask | PR_PAC_APGAKEY; @@ -17,9 +18,11 @@ int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg) if (!system_supports_address_auth() && !system_supports_generic_auth()) return -EINVAL; + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + if (!arg) { - ptrauth_keys_init(keys); - ptrauth_keys_switch(keys); + ptrauth_keys_init_user(keys); return 0; } @@ -40,8 +43,71 @@ int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg) get_random_bytes(&keys->apdb, sizeof(keys->apdb)); if (arg & PR_PAC_APGAKEY) get_random_bytes(&keys->apga, sizeof(keys->apga)); + ptrauth_keys_install_user(keys); + + return 0; +} + +static u64 arg_to_enxx_mask(unsigned long arg) +{ + u64 sctlr_enxx_mask = 0; + + WARN_ON(arg & ~PR_PAC_ENABLED_KEYS_MASK); + if (arg & PR_PAC_APIAKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENIA; + if (arg & PR_PAC_APIBKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENIB; + if (arg & PR_PAC_APDAKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENDA; + if (arg & PR_PAC_APDBKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENDB; + return sctlr_enxx_mask; +} + +int ptrauth_set_enabled_keys(struct task_struct *tsk, unsigned long keys, + unsigned long enabled) +{ + u64 sctlr; - ptrauth_keys_switch(keys); + if (!system_supports_address_auth()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + + if ((keys & ~PR_PAC_ENABLED_KEYS_MASK) || (enabled & ~keys)) + return -EINVAL; + + preempt_disable(); + sctlr = tsk->thread.sctlr_user; + sctlr &= ~arg_to_enxx_mask(keys); + sctlr |= arg_to_enxx_mask(enabled); + tsk->thread.sctlr_user = sctlr; + if (tsk == current) + update_sctlr_el1(sctlr); + preempt_enable(); return 0; } + +int ptrauth_get_enabled_keys(struct task_struct *tsk) +{ + int retval = 0; + + if (!system_supports_address_auth()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + + if (tsk->thread.sctlr_user & SCTLR_ELx_ENIA) + retval |= PR_PAC_APIAKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENIB) + retval |= PR_PAC_APIBKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENDA) + retval |= PR_PAC_APDAKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENDB) + retval |= PR_PAC_APDBKEY; + + return retval; +} diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index b78fac9e546c..968d5fffe233 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -24,12 +24,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * currently safe. Lastly, MSR instructions can do any number of nasty * things we can't handle during single-stepping. */ - if (aarch64_get_insn_class(insn) == AARCH64_INSN_CLS_BR_SYS) { + if (aarch64_insn_is_class_branch_sys(insn)) { if (aarch64_insn_is_branch(insn) || aarch64_insn_is_msr_imm(insn) || aarch64_insn_is_msr_reg(insn) || aarch64_insn_is_exception(insn) || - aarch64_insn_is_eret(insn)) + aarch64_insn_is_eret(insn) || + aarch64_insn_is_eret_auth(insn)) return false; /* @@ -42,11 +43,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) != AARCH64_INSN_SPCLREG_DAIF; /* - * The HINT instruction is is problematic when single-stepping, - * except for the NOP case. + * The HINT instruction is steppable only if it is in whitelist + * and the rest of other such instructions are blocked for + * single stepping as they may cause exception or other + * unintended behaviour. */ if (aarch64_insn_is_hint(insn)) - return aarch64_insn_is_nop(insn); + return aarch64_insn_is_steppable_hint(insn); return true; } diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index d1c95dcf1d78..70b91a8c6bb3 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -7,26 +7,31 @@ * Copyright (C) 2013 Linaro Limited. * Author: Sandeepa Prabhu <sandeepa.prabhu@linaro.org> */ + +#define pr_fmt(fmt) "kprobes: " fmt + +#include <linux/extable.h> #include <linux/kasan.h> #include <linux/kernel.h> #include <linux/kprobes.h> -#include <linux/extable.h> -#include <linux/slab.h> -#include <linux/stop_machine.h> #include <linux/sched/debug.h> #include <linux/set_memory.h> +#include <linux/slab.h> +#include <linux/stop_machine.h> #include <linux/stringify.h> +#include <linux/uaccess.h> #include <linux/vmalloc.h> -#include <asm/traps.h> -#include <asm/ptrace.h> + #include <asm/cacheflush.h> -#include <asm/debug-monitors.h> #include <asm/daifflags.h> -#include <asm/system_misc.h> +#include <asm/debug-monitors.h> #include <asm/insn.h> -#include <linux/uaccess.h> #include <asm/irq.h> +#include <asm/patching.h> +#include <asm/ptrace.h> #include <asm/sections.h> +#include <asm/system_misc.h> +#include <asm/traps.h> #include "decode-insn.h" @@ -34,27 +39,33 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); static void __kprobes -post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); - -static int __kprobes patch_text(kprobe_opcode_t *addr, u32 opcode) -{ - void *addrs[1]; - u32 insns[1]; - - addrs[0] = addr; - insns[0] = opcode; - - return aarch64_insn_patch_text(addrs, insns, 1); -} +post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { - /* prepare insn slot */ - patch_text(p->ainsn.api.insn, p->opcode); + kprobe_opcode_t *addr = p->ainsn.api.insn; - flush_icache_range((uintptr_t) (p->ainsn.api.insn), - (uintptr_t) (p->ainsn.api.insn) + - MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + /* + * Prepare insn slot, Mark Rutland points out it depends on a coupe of + * subtleties: + * + * - That the I-cache maintenance for these instructions is complete + * *before* the kprobe BRK is written (and aarch64_insn_patch_text_nosync() + * ensures this, but just omits causing a Context-Synchronization-Event + * on all CPUS). + * + * - That the kprobe BRK results in an exception (and consequently a + * Context-Synchronoization-Event), which ensures that the CPU will + * fetch thesingle-step slot instructions *after* this, ensuring that + * the new instructions are used + * + * It supposes to place ISB after patching to guarantee I-cache maintenance + * is observed on all CPUS, however, single-step slot is installed in + * the BRK exception handler, so it is unnecessary to generate + * Contex-Synchronization-Event via ISB again. + */ + aarch64_insn_patch_text_nosync(addr, p->opcode); + aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS); /* * Needs restoring of return address after stepping xol. @@ -77,7 +88,7 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); /* single step simulated, now go for post processing */ - post_kprobe_handler(kcb, regs); + post_kprobe_handler(p, kcb, regs); } int __kprobes arch_prepare_kprobe(struct kprobe *p) @@ -120,27 +131,26 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) void *alloc_insn_page(void) { - void *page; - - page = vmalloc_exec(PAGE_SIZE); - if (page) { - set_memory_ro((unsigned long)page, 1); - set_vm_flush_reset_perms(page); - } - - return page; + return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); } /* arm kprobe: install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { - patch_text(p->addr, BRK64_OPCODE_KPROBES); + void *addr = p->addr; + u32 insn = BRK64_OPCODE_KPROBES; + + aarch64_insn_patch_text(&addr, &insn, 1); } /* disarm kprobe: remove breakpoint from text */ void __kprobes arch_disarm_kprobe(struct kprobe *p) { - patch_text(p->addr, p->opcode); + void *addr = p->addr; + + aarch64_insn_patch_text(&addr, &p->opcode, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) @@ -169,20 +179,15 @@ static void __kprobes set_current_kprobe(struct kprobe *p) } /* - * Interrupts need to be disabled before single-step mode is set, and not - * reenabled until after single-step mode ends. - * Without disabling interrupt on local CPU, there is a chance of - * interrupt occurrence in the period of exception return and start of - * out-of-line single-step, that result in wrongly single stepping - * into the interrupt handler. + * Mask all of DAIF while executing the instruction out-of-line, to keep things + * simple and avoid nesting exceptions. Interrupts do have to be disabled since + * the kprobe state is per-CPU and doesn't get migrated. */ static void __kprobes kprobes_save_local_irqflag(struct kprobe_ctlblk *kcb, struct pt_regs *regs) { kcb->saved_irqflag = regs->pstate & DAIF_MASK; - regs->pstate |= PSR_I_BIT; - /* Unmask PSTATE.D for enabling software step exceptions. */ - regs->pstate &= ~PSR_D_BIT; + regs->pstate |= DAIF_MASK; } static void __kprobes kprobes_restore_local_irqflag(struct kprobe_ctlblk *kcb, @@ -192,19 +197,6 @@ static void __kprobes kprobes_restore_local_irqflag(struct kprobe_ctlblk *kcb, regs->pstate |= kcb->saved_irqflag; } -static void __kprobes -set_ss_context(struct kprobe_ctlblk *kcb, unsigned long addr) -{ - kcb->ss_ctx.ss_pending = true; - kcb->ss_ctx.match_addr = addr + sizeof(kprobe_opcode_t); -} - -static void __kprobes clear_ss_context(struct kprobe_ctlblk *kcb) -{ - kcb->ss_ctx.ss_pending = false; - kcb->ss_ctx.match_addr = 0; -} - static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) @@ -224,11 +216,7 @@ static void __kprobes setup_singlestep(struct kprobe *p, /* prepare for single stepping */ slot = (unsigned long)p->ainsn.api.insn; - set_ss_context(kcb, slot); /* mark pending ss */ - - /* IRQs and single stepping do not mix well. */ kprobes_save_local_irqflag(kcb, regs); - kernel_enable_single_step(regs); instruction_pointer_set(regs, slot); } else { /* insn simulation */ @@ -248,7 +236,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, break; case KPROBE_HIT_SS: case KPROBE_REENTER: - pr_warn("Unrecoverable kprobe detected.\n"); + pr_warn("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); BUG(); break; @@ -261,13 +249,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, } static void __kprobes -post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) +post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs) { - struct kprobe *cur = kprobe_running(); - - if (!cur) - return; - /* return addr restore if non-branching insn */ if (cur->ainsn.api.restore != 0) instruction_pointer_set(regs, cur->ainsn.api.restore); @@ -279,12 +262,8 @@ post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) } /* call post handler */ kcb->kprobe_status = KPROBE_HIT_SSDONE; - if (cur->post_handler) { - /* post_handler can hit breakpoint and single step - * again, so we enable D-flag for recursive exception. - */ + if (cur->post_handler) cur->post_handler(cur, regs, 0); - } reset_current_kprobe(); } @@ -305,47 +284,22 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) * normal page fault. */ instruction_pointer_set(regs, (unsigned long) cur->addr); - if (!instruction_pointer(regs)) - BUG(); + BUG_ON(!instruction_pointer(regs)); - kernel_disable_single_step(); - - if (kcb->kprobe_status == KPROBE_REENTER) + if (kcb->kprobe_status == KPROBE_REENTER) { restore_previous_kprobe(kcb); - else + } else { + kprobes_restore_local_irqflag(kcb, regs); reset_current_kprobe(); + } break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accounting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, fsr)) - return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - if (fixup_exception(regs)) - return 1; } return 0; } -static void __kprobes kprobe_handler(struct pt_regs *regs) +static int __kprobes +kprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) { struct kprobe *p, *cur_kprobe; struct kprobe_ctlblk *kcb; @@ -355,88 +309,66 @@ static void __kprobes kprobe_handler(struct pt_regs *regs) cur_kprobe = kprobe_running(); p = get_kprobe((kprobe_opcode_t *) addr); - - if (p) { - if (cur_kprobe) { - if (reenter_kprobe(p, regs, kcb)) - return; - } else { - /* Probe hit */ - set_current_kprobe(p); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - - /* - * If we have no pre-handler or it returned 0, we - * continue with normal processing. If we have a - * pre-handler and it returned non-zero, it will - * modify the execution path and no need to single - * stepping. Let's just reset current kprobe and exit. - * - * pre_handler can hit a breakpoint and can step thru - * before return, keep PSTATE D-flag enabled until - * pre_handler return back. - */ - if (!p->pre_handler || !p->pre_handler(p, regs)) { - setup_singlestep(p, regs, kcb, 0); - } else - reset_current_kprobe(); - } + if (WARN_ON_ONCE(!p)) { + /* + * Something went wrong. This BRK used an immediate reserved + * for kprobes, but we couldn't find any corresponding probe. + */ + return DBG_HOOK_ERROR; } - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Return back to original instruction, and continue. - */ -} -static int __kprobes -kprobe_ss_hit(struct kprobe_ctlblk *kcb, unsigned long addr) -{ - if ((kcb->ss_ctx.ss_pending) - && (kcb->ss_ctx.match_addr == addr)) { - clear_ss_context(kcb); /* clear pending ss */ - return DBG_HOOK_HANDLED; + if (cur_kprobe) { + /* Hit a kprobe inside another kprobe */ + if (!reenter_kprobe(p, regs, kcb)) + return DBG_HOOK_ERROR; + } else { + /* Probe hit */ + set_current_kprobe(p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, it will + * modify the execution path and not need to single-step + * Let's just reset current kprobe and exit. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) + setup_singlestep(p, regs, kcb, 0); + else + reset_current_kprobe(); } - /* not ours, kprobes should ignore it */ - return DBG_HOOK_ERROR; + + return DBG_HOOK_HANDLED; } +static struct break_hook kprobes_break_hook = { + .imm = KPROBES_BRK_IMM, + .fn = kprobe_breakpoint_handler, +}; + static int __kprobes -kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr) +kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - int retval; - - /* return error if this is not our step */ - retval = kprobe_ss_hit(kcb, instruction_pointer(regs)); + unsigned long addr = instruction_pointer(regs); + struct kprobe *cur = kprobe_running(); - if (retval == DBG_HOOK_HANDLED) { + if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) && + ((unsigned long)&cur->ainsn.api.insn[1] == addr)) { kprobes_restore_local_irqflag(kcb, regs); - kernel_disable_single_step(); + post_kprobe_handler(cur, kcb, regs); - post_kprobe_handler(kcb, regs); + return DBG_HOOK_HANDLED; } - return retval; -} - -static struct step_hook kprobes_step_hook = { - .fn = kprobe_single_step_handler, -}; - -static int __kprobes -kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr) -{ - kprobe_handler(regs); - return DBG_HOOK_HANDLED; + /* not ours, kprobes should ignore it */ + return DBG_HOOK_ERROR; } -static struct break_hook kprobes_break_hook = { - .imm = KPROBES_BRK_IMM, - .fn = kprobe_breakpoint_handler, +static struct break_hook kprobes_break_ss_hook = { + .imm = KPROBES_BRK_SS_IMM, + .fn = kprobe_breakpoint_ss_handler, }; /* @@ -455,10 +387,6 @@ int __init arch_populate_kprobe_blacklist(void) (unsigned long)__irqentry_text_end); if (ret) return ret; - ret = kprobe_add_area_blacklist((unsigned long)__idmap_text_start, - (unsigned long)__idmap_text_end); - if (ret) - return ret; ret = kprobe_add_area_blacklist((unsigned long)__hyp_text_start, (unsigned long)__hyp_text_end); if (ret || is_kernel_in_hyp_mode()) @@ -470,90 +398,17 @@ int __init arch_populate_kprobe_blacklist(void) void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) { - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address = - (unsigned long)&kretprobe_trampoline; - kprobe_opcode_t *correct_ret_addr = NULL; - - INIT_HLIST_HEAD(&empty_rp); - kretprobe_hash_lock(current, &head, &flags); - - /* - * It is possible to have multiple instances associated with a given - * task either because multiple functions in the call path have - * return probes installed on them, and/or more than one - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always pushed into the head of the list - * - when multiple return probes are registered for the same - * function, the (chronologically) first instance's ret_addr - * will be the real return address, and all the rest will - * point to kretprobe_trampoline. - */ - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - orig_ret_address = (unsigned long)ri->ret_addr; - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - - correct_ret_addr = ri->ret_addr; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - orig_ret_address = (unsigned long)ri->ret_addr; - if (ri->rp && ri->rp->handler) { - __this_cpu_write(current_kprobe, &ri->rp->kp); - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; - ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, NULL); - } - - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_hash_unlock(current, &flags); - - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return (void *)orig_ret_address; + return (void *)kretprobe_trampoline_handler(regs, (void *)regs->regs[29]); } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { ri->ret_addr = (kprobe_opcode_t *)regs->regs[30]; + ri->fp = (void *)regs->regs[29]; /* replace return addr (x30) with trampoline */ - regs->regs[30] = (long)&kretprobe_trampoline; + regs->regs[30] = (long)&__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) @@ -564,7 +419,7 @@ int __kprobes arch_trampoline_kprobe(struct kprobe *p) int __init arch_init_kprobes(void) { register_kernel_break_hook(&kprobes_break_hook); - register_kernel_step_hook(&kprobes_step_hook); + register_kernel_break_hook(&kprobes_break_ss_hook); return 0; } diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 45dce03aaeaf..9a6499bed58b 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -25,7 +25,7 @@ stp x24, x25, [sp, #S_X24] stp x26, x27, [sp, #S_X26] stp x28, x29, [sp, #S_X28] - add x0, sp, #S_FRAME_SIZE + add x0, sp, #PT_REGS_SIZE stp lr, x0, [sp, #S_LR] /* * Construct a useful saved PSTATE @@ -61,11 +61,14 @@ ldp x28, x29, [sp, #S_X28] .endm -ENTRY(kretprobe_trampoline) - sub sp, sp, #S_FRAME_SIZE +SYM_CODE_START(__kretprobe_trampoline) + sub sp, sp, #PT_REGS_SIZE save_all_base_regs + /* Setup a frame pointer. */ + add x29, sp, #S_FP + mov x0, sp bl trampoline_probe_handler /* @@ -74,9 +77,10 @@ ENTRY(kretprobe_trampoline) */ mov lr, x0 + /* The frame pointer (x29) is restored with other registers. */ restore_all_base_regs - add sp, sp, #S_FRAME_SIZE + add sp, sp, #PT_REGS_SIZE ret -ENDPROC(kretprobe_trampoline) +SYM_CODE_END(__kretprobe_trampoline) diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 25f67ec59635..22d0b3252476 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -10,6 +10,7 @@ #include <linux/kprobes.h> #include <asm/ptrace.h> +#include <asm/traps.h> #include "simulate-insn.h" diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index a412d8edbcd2..d49aef2657cd 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -21,7 +21,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, memcpy(dst, src, len); /* flush caches (dcache/icache) */ - sync_icache_aliases(dst, len); + sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); kunmap_atomic(xol_page_kaddr); } @@ -38,7 +38,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, /* TODO: Currently we do not support AARCH32 instruction probing */ if (mm->context.flags & MMCF_AARCH32) - return -ENOTSUPP; + return -EOPNOTSUPP; else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; @@ -166,7 +166,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, } static int uprobe_breakpoint_handler(struct pt_regs *regs, - unsigned int esr) + unsigned long esr) { if (uprobe_pre_sstep_notifier(regs)) return DBG_HOOK_HANDLED; @@ -175,7 +175,7 @@ static int uprobe_breakpoint_handler(struct pt_regs *regs, } static int uprobe_single_step_handler(struct pt_regs *regs, - unsigned int esr) + unsigned long esr) { struct uprobe_task *utask = current->utask; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 71f788cd2b18..7387b68c745b 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -6,19 +6,18 @@ * Copyright (C) 1996-2000 Russell King - Converted to ARM. * Copyright (C) 2012 ARM Ltd. */ - -#include <stdarg.h> - #include <linux/compat.h> #include <linux/efi.h> +#include <linux/elf.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/kernel.h> -#include <linux/lockdep.h> +#include <linux/mman.h> #include <linux/mm.h> +#include <linux/nospec.h> #include <linux/stddef.h> #include <linux/sysctl.h> #include <linux/unistd.h> @@ -41,22 +40,25 @@ #include <linux/percpu.h> #include <linux/thread_info.h> #include <linux/prctl.h> +#include <linux/stacktrace.h> #include <asm/alternative.h> -#include <asm/arch_gicv3.h> #include <asm/compat.h> #include <asm/cpufeature.h> #include <asm/cacheflush.h> #include <asm/exec.h> #include <asm/fpsimd.h> #include <asm/mmu_context.h> +#include <asm/mte.h> #include <asm/processor.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> +#include <asm/switch_to.h> +#include <asm/system_misc.h> #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include <linux/stackprotector.h> -unsigned long __stack_chk_guard __read_mostly; +unsigned long __stack_chk_guard __ro_after_init; EXPORT_SYMBOL(__stack_chk_guard); #endif @@ -66,69 +68,8 @@ EXPORT_SYMBOL(__stack_chk_guard); void (*pm_power_off)(void); EXPORT_SYMBOL_GPL(pm_power_off); -void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); - -static void __cpu_do_idle(void) -{ - dsb(sy); - wfi(); -} - -static void __cpu_do_idle_irqprio(void) -{ - unsigned long pmr; - unsigned long daif_bits; - - daif_bits = read_sysreg(daif); - write_sysreg(daif_bits | PSR_I_BIT, daif); - - /* - * Unmask PMR before going idle to make sure interrupts can - * be raised. - */ - pmr = gic_read_pmr(); - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - - __cpu_do_idle(); - - gic_write_pmr(pmr); - write_sysreg(daif_bits, daif); -} - -/* - * cpu_do_idle() - * - * Idle the processor (wait for interrupt). - * - * If the CPU supports priority masking we must do additional work to - * ensure that interrupts are not masked at the PMR (because the core will - * not wake up if we block the wake up signal in the interrupt controller). - */ -void cpu_do_idle(void) -{ - if (system_uses_irq_prio_masking()) - __cpu_do_idle_irqprio(); - else - __cpu_do_idle(); -} - -/* - * This is our default idle handler. - */ -void arch_cpu_idle(void) -{ - /* - * This should do all the clock switching and wait for interrupt - * tricks - */ - trace_cpu_idle_rcuidle(1, smp_processor_id()); - cpu_do_idle(); - local_irq_enable(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); -} - #ifdef CONFIG_HOTPLUG_CPU -void arch_cpu_idle_dead(void) +void __noreturn arch_cpu_idle_dead(void) { cpu_die(); } @@ -141,11 +82,11 @@ void arch_cpu_idle_dead(void) * to execute e.g. a RAM-based pin loop is not sufficient. This allows the * kexec'd kernel to use any and all RAM as it sees fit, without having to * avoid any code or data used by any SW CPU pin loop. The CPU hotplug - * functionality embodied in disable_nonboot_cpus() to achieve this. + * functionality embodied in smpt_shutdown_nonboot_cpus() to achieve this. */ void machine_shutdown(void) { - disable_nonboot_cpus(); + smp_shutdown_nonboot_cpus(reboot_cpu); } /* @@ -170,8 +111,7 @@ void machine_power_off(void) { local_irq_disable(); smp_send_stop(); - if (pm_power_off) - pm_power_off(); + do_kernel_power_off(); } /* @@ -197,10 +137,7 @@ void machine_restart(char *cmd) efi_reboot(reboot_mode, NULL); /* Now call the architecture specific reboot code. */ - if (arm_pm_restart) - arm_pm_restart(reboot_mode, cmd); - else - do_kernel_restart(cmd); + do_kernel_restart(cmd); /* * Whoops - the architecture was unable to reboot. @@ -209,12 +146,21 @@ void machine_restart(char *cmd) while (1); } +#define bstr(suffix, str) [PSR_BTYPE_ ## suffix >> PSR_BTYPE_SHIFT] = str +static const char *const btypes[] = { + bstr(NONE, "--"), + bstr( JC, "jc"), + bstr( C, "-c"), + bstr( J , "j-") +}; +#undef bstr + static void print_pstate(struct pt_regs *regs) { u64 pstate = regs->pstate; if (compat_user_mode(regs)) { - printk("pstate: %08llx (%c%c%c%c %c %s %s %c%c%c)\n", + printk("pstate: %08llx (%c%c%c%c %c %s %s %c%c%c %cDIT %cSSBS)\n", pstate, pstate & PSR_AA32_N_BIT ? 'N' : 'n', pstate & PSR_AA32_Z_BIT ? 'Z' : 'z', @@ -225,9 +171,14 @@ static void print_pstate(struct pt_regs *regs) pstate & PSR_AA32_E_BIT ? "BE" : "LE", pstate & PSR_AA32_A_BIT ? 'A' : 'a', pstate & PSR_AA32_I_BIT ? 'I' : 'i', - pstate & PSR_AA32_F_BIT ? 'F' : 'f'); + pstate & PSR_AA32_F_BIT ? 'F' : 'f', + pstate & PSR_AA32_DIT_BIT ? '+' : '-', + pstate & PSR_AA32_SSBS_BIT ? '+' : '-'); } else { - printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO)\n", + const char *btype_str = btypes[(pstate & PSR_BTYPE_MASK) >> + PSR_BTYPE_SHIFT]; + + printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO %cTCO %cDIT %cSSBS BTYPE=%s)\n", pstate, pstate & PSR_N_BIT ? 'N' : 'n', pstate & PSR_Z_BIT ? 'Z' : 'z', @@ -238,7 +189,11 @@ static void print_pstate(struct pt_regs *regs) pstate & PSR_I_BIT ? 'I' : 'i', pstate & PSR_F_BIT ? 'F' : 'f', pstate & PSR_PAN_BIT ? '+' : '-', - pstate & PSR_UAO_BIT ? '+' : '-'); + pstate & PSR_UAO_BIT ? '+' : '-', + pstate & PSR_TCO_BIT ? '+' : '-', + pstate & PSR_DIT_BIT ? '+' : '-', + pstate & PSR_SSBS_BIT ? '+' : '-', + btype_str); } } @@ -262,7 +217,7 @@ void __show_regs(struct pt_regs *regs) if (!user_mode(regs)) { printk("pc : %pS\n", (void *)regs->pc); - printk("lr : %pS\n", (void *)lr); + printk("lr : %pS\n", (void *)ptrauth_strip_kernel_insn_pac(lr)); } else { printk("pc : %016llx\n", regs->pc); printk("lr : %016llx\n", lr); @@ -276,27 +231,26 @@ void __show_regs(struct pt_regs *regs) i = top_reg; while (i >= 0) { - printk("x%-2d: %016llx ", i, regs->regs[i]); - i--; + printk("x%-2d: %016llx", i, regs->regs[i]); - if (i % 2 == 0) { - pr_cont("x%-2d: %016llx ", i, regs->regs[i]); - i--; - } + while (i-- % 3) + pr_cont(" x%-2d: %016llx", i, regs->regs[i]); pr_cont("\n"); } } -void show_regs(struct pt_regs * regs) +void show_regs(struct pt_regs *regs) { __show_regs(regs); - dump_backtrace(regs, NULL); + dump_backtrace(regs, NULL, KERN_DEFAULT); } static void tls_thread_flush(void) { write_sysreg(0, tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(0, SYS_TPIDR2_EL0); if (is_compat_task()) { current->thread.uw.tp_value = 0; @@ -325,10 +279,6 @@ void flush_thread(void) flush_tagged_addr_state(); } -void release_thread(struct task_struct *dead_task) -{ -} - void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); @@ -345,24 +295,58 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Detach src's sve_state (if any) from dst so that it does not - * get erroneously used or freed prematurely. dst's sve_state + * get erroneously used or freed prematurely. dst's copies * will be allocated on demand later on if dst uses SVE. * For consistency, also clear TIF_SVE here: this could be done * later in copy_process(), but to avoid tripping up future - * maintainers it is best not to leave TIF_SVE and sve_state in + * maintainers it is best not to leave TIF flags and buffers in * an inconsistent state, even temporarily. */ dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + /* + * In the unlikely event that we create a new thread with ZA + * enabled we should retain the ZA and ZT state so duplicate + * it here. This may be shortly freed if we exec() or if + * CLONE_SETTLS but it's simpler to do it here. To avoid + * confusing the rest of the code ensure that we have a + * sve_state allocated whenever sme_state is allocated. + */ + if (thread_za_enabled(&src->thread)) { + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + + dst->thread.sme_state = kmemdup(src->thread.sme_state, + sme_state_size(src), + GFP_KERNEL); + if (!dst->thread.sme_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + } else { + dst->thread.sme_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + } + + dst->thread.fp_type = FP_STATE_FPSIMD; + + /* clear any pending asynchronous tag fault raised by the parent */ + clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); + return 0; } asmlinkage void ret_from_fork(void) asm("ret_from_fork"); -int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long stack_start = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); @@ -376,7 +360,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, */ fpsimd_flush_task_state(p); - if (likely(!(p->flags & PF_KTHREAD))) { + ptrauth_thread_init_kernel(p); + + if (likely(!args->fn)) { *childregs = *current_pt_regs(); childregs->regs[0] = 0; @@ -385,6 +371,8 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2()) + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); if (stack_start) { if (is_compat_thread(task_thread_info(p))) @@ -394,29 +382,34 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } /* - * If a TLS pointer was passed to clone (4th argument), use it - * for the new thread. + * If a TLS pointer was passed to clone, use it for the new + * thread. We also reset TPIDR2 if it's in use. */ - if (clone_flags & CLONE_SETTLS) - p->thread.uw.tp_value = childregs->regs[3]; + if (clone_flags & CLONE_SETTLS) { + p->thread.uw.tp_value = tls; + p->thread.tpidr2_el0 = 0; + } } else { + /* + * A kthread has no context to ERET to, so ensure any buggy + * ERET is treated as an illegal exception return. + * + * When a user task is created from a kthread, childregs will + * be initialized by start_thread() or start_compat_thread(). + */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->pstate = PSR_MODE_EL1h; - if (IS_ENABLED(CONFIG_ARM64_UAO) && - cpus_have_const_cap(ARM64_HAS_UAO)) - childregs->pstate |= PSR_UAO_BIT; - - if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) - set_ssbs_bit(childregs); + childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; - if (system_uses_irq_prio_masking()) - childregs->pmr_save = GIC_PRIO_IRQON; - - p->thread.cpu_context.x19 = stack_start; - p->thread.cpu_context.x20 = stk_sz; + p->thread.cpu_context.x19 = (unsigned long)args->fn; + p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; + /* + * For the benefit of the unwinder, set up childregs->stackframe + * as the final frame for the new task. + */ + p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; ptrace_hw_copy_thread(p); @@ -426,6 +419,8 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, void tls_preserve_current_state(void) { *task_user_tls(current) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2() && !is_compat_task()) + current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); } static void tls_thread_switch(struct task_struct *next) @@ -438,17 +433,8 @@ static void tls_thread_switch(struct task_struct *next) write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0); -} - -/* Restore the UAO state depending on next's addr_limit */ -void uao_thread_switch(struct task_struct *next) -{ - if (IS_ENABLED(CONFIG_ARM64_UAO)) { - if (task_thread_info(next)->addr_limit == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO)); - } + if (system_supports_tpidr2()) + write_sysreg_s(next->thread.tpidr2_el0, SYS_TPIDR2_EL0); } /* @@ -457,8 +443,6 @@ void uao_thread_switch(struct task_struct *next) */ static void ssbs_thread_switch(struct task_struct *next) { - struct pt_regs *regs = task_pt_regs(next); - /* * Nothing to do for kernel threads, but 'regs' may be junk * (e.g. idle task) so check the flags and bail early. @@ -466,15 +450,14 @@ static void ssbs_thread_switch(struct task_struct *next) if (unlikely(next->flags & PF_KTHREAD)) return; - /* If the mitigation is enabled, then we leave SSBS clear. */ - if ((arm64_get_ssbd_state() == ARM64_SSBD_FORCE_ENABLE) || - test_tsk_thread_flag(next, TIF_SSBD)) + /* + * If all CPUs implement the SSBS extension, then we just need to + * context-switch the PSTATE field. + */ + if (alternative_has_cap_unlikely(ARM64_SSBS)) return; - if (compat_user_mode(regs)) - set_compat_ssbs_bit(regs); - else if (user_mode(regs)) - set_ssbs_bit(regs); + spectre_v4_enable_task_mitigation(next); } /* @@ -492,9 +475,52 @@ static void entry_task_switch(struct task_struct *next) } /* + * ARM erratum 1418040 handling, affecting the 32bit view of CNTVCT. + * Ensure access is disabled when switching to a 32bit task, ensure + * access is enabled when switching to a 64bit task. + */ +static void erratum_1418040_thread_switch(struct task_struct *next) +{ + if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) || + !this_cpu_has_cap(ARM64_WORKAROUND_1418040)) + return; + + if (is_compat_thread(task_thread_info(next))) + sysreg_clear_set(cntkctl_el1, ARCH_TIMER_USR_VCT_ACCESS_EN, 0); + else + sysreg_clear_set(cntkctl_el1, 0, ARCH_TIMER_USR_VCT_ACCESS_EN); +} + +static void erratum_1418040_new_exec(void) +{ + preempt_disable(); + erratum_1418040_thread_switch(current); + preempt_enable(); +} + +/* + * __switch_to() checks current->thread.sctlr_user as an optimisation. Therefore + * this function must be called with preemption disabled and the update to + * sctlr_user must be made in the same preemption disabled block so that + * __switch_to() does not see the variable update before the SCTLR_EL1 one. + */ +void update_sctlr_el1(u64 sctlr) +{ + /* + * EnIA must not be cleared while in the kernel as this is necessary for + * in-kernel PAC. It will be cleared on kernel exit if needed. + */ + sysreg_clear_set(sctlr_el1, SCTLR_USER_MASK & ~SCTLR_ELx_ENIA, sctlr); + + /* ISB required for the kernel uaccess routines when setting TCF0. */ + isb(); +} + +/* * Thread switching. */ -__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, +__notrace_funcgraph __sched +struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next) { struct task_struct *last; @@ -504,9 +530,9 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); entry_task_switch(next); - uao_thread_switch(next); - ptrauth_thread_switch(next); ssbs_thread_switch(next); + erratum_1418040_thread_switch(next); + ptrauth_thread_switch_user(next); /* * Complete any pending TLB or cache maintenance on this CPU in case @@ -516,55 +542,118 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, */ dsb(ish); + /* + * MTE thread switching must happen after the DSB above to ensure that + * any asynchronous tag check faults have been logged in the TFSR*_EL1 + * registers. + */ + mte_thread_switch(next); + /* avoid expensive SCTLR_EL1 accesses if no change */ + if (prev->thread.sctlr_user != next->thread.sctlr_user) + update_sctlr_el1(next->thread.sctlr_user); + /* the actual thread switch */ last = cpu_switch_to(prev, next); return last; } -unsigned long get_wchan(struct task_struct *p) +struct wchan_info { + unsigned long pc; + int count; +}; + +static bool get_wchan_cb(void *arg, unsigned long pc) { - struct stackframe frame; - unsigned long stack_page, ret = 0; - int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; + struct wchan_info *wchan_info = arg; - stack_page = (unsigned long)try_get_task_stack(p); - if (!stack_page) - return 0; + if (!in_sched_functions(pc)) { + wchan_info->pc = pc; + return false; + } + return wchan_info->count++ < 16; +} + +unsigned long __get_wchan(struct task_struct *p) +{ + struct wchan_info wchan_info = { + .pc = 0, + .count = 0, + }; - start_backtrace(&frame, thread_saved_fp(p), thread_saved_pc(p)); + if (!try_get_task_stack(p)) + return 0; - do { - if (unwind_frame(p, &frame)) - goto out; - if (!in_sched_functions(frame.pc)) { - ret = frame.pc; - goto out; - } - } while (count ++ < 16); + arch_stack_walk(get_wchan_cb, &wchan_info, p, NULL); -out: put_task_stack(p); - return ret; + + return wchan_info.pc; } unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= get_random_u32_below(PAGE_SIZE); return sp & ~0xf; } +#ifdef CONFIG_COMPAT +int compat_elf_check_arch(const struct elf32_hdr *hdr) +{ + if (!system_supports_32bit_el0()) + return false; + + if ((hdr)->e_machine != EM_ARM) + return false; + + if (!((hdr)->e_flags & EF_ARM_EABI_MASK)) + return false; + + /* + * Prevent execve() of a 32-bit program from a deadline task + * if the restricted affinity mask would be inadmissible on an + * asymmetric system. + */ + return !static_branch_unlikely(&arm64_mismatched_32bit_el0) || + !dl_task_check_affinity(current, system_32bit_el0_cpumask()); +} +#endif + /* * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. */ void arch_setup_new_exec(void) { - current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0; + unsigned long mmflags = 0; + + if (is_compat_task()) { + mmflags = MMCF_AARCH32; + + /* + * Restrict the CPU affinity mask for a 32-bit task so that + * it contains only 32-bit-capable CPUs. + * + * From the perspective of the task, this looks similar to + * what would happen if the 64-bit-only CPUs were hot-unplugged + * at the point of execve(), although we try a bit harder to + * honour the cpuset hierarchy. + */ + if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) + force_compatible_cpus_allowed_ptr(current); + } else if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) { + relax_compatible_cpus_allowed_ptr(current); + } - ptrauth_thread_init_user(current); + current->mm->context.flags = mmflags; + ptrauth_thread_init_user(); + mte_thread_init_user(); + erratum_1418040_new_exec(); + + if (task_spec_ssb_noexec(current)) { + arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS, + PR_SPEC_ENABLE); + } } #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI @@ -573,11 +662,19 @@ void arch_setup_new_exec(void) */ static unsigned int tagged_addr_disabled; -long set_tagged_addr_ctrl(unsigned long arg) +long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) { - if (is_compat_task()) + unsigned long valid_mask = PR_TAGGED_ADDR_ENABLE; + struct thread_info *ti = task_thread_info(task); + + if (is_compat_thread(ti)) return -EINVAL; - if (arg & ~PR_TAGGED_ADDR_ENABLE) + + if (system_supports_mte()) + valid_mask |= PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC \ + | PR_MTE_TAG_MASK; + + if (arg & ~valid_mask) return -EINVAL; /* @@ -587,20 +684,28 @@ long set_tagged_addr_ctrl(unsigned long arg) if (arg & PR_TAGGED_ADDR_ENABLE && tagged_addr_disabled) return -EINVAL; - update_thread_flag(TIF_TAGGED_ADDR, arg & PR_TAGGED_ADDR_ENABLE); + if (set_mte_ctrl(task, arg) != 0) + return -EINVAL; + + update_ti_thread_flag(ti, TIF_TAGGED_ADDR, arg & PR_TAGGED_ADDR_ENABLE); return 0; } -long get_tagged_addr_ctrl(void) +long get_tagged_addr_ctrl(struct task_struct *task) { - if (is_compat_task()) + long ret = 0; + struct thread_info *ti = task_thread_info(task); + + if (is_compat_thread(ti)) return -EINVAL; - if (test_thread_flag(TIF_TAGGED_ADDR)) - return PR_TAGGED_ADDR_ENABLE; + if (test_ti_thread_flag(ti, TIF_TAGGED_ADDR)) + ret = PR_TAGGED_ADDR_ENABLE; - return 0; + ret |= get_mte_ctrl(task); + + return ret; } /* @@ -608,8 +713,6 @@ long get_tagged_addr_ctrl(void) * only prevents the tagged address ABI enabling via prctl() and does not * disable it for tasks that already opted in to the relaxed ABI. */ -static int zero; -static int one = 1; static struct ctl_table tagged_addr_sysctl_table[] = { { @@ -618,10 +721,9 @@ static struct ctl_table tagged_addr_sysctl_table[] = { .data = &tagged_addr_disabled, .maxlen = sizeof(int), .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, - { } }; static int __init tagged_addr_init(void) @@ -634,18 +736,24 @@ static int __init tagged_addr_init(void) core_initcall(tagged_addr_init); #endif /* CONFIG_ARM64_TAGGED_ADDR_ABI */ -asmlinkage void __sched arm64_preempt_schedule_irq(void) +#ifdef CONFIG_BINFMT_ELF +int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, + bool has_interp, bool is_interp) { - lockdep_assert_irqs_disabled(); - /* - * Preempting a task from an IRQ means we leave copies of PSTATE - * on the stack. cpufeature's enable calls may modify PSTATE, but - * resuming one of these preempted tasks would undo those changes. - * - * Only allow a task to be preempted once cpufeatures have been - * enabled. + * For dynamically linked executables the interpreter is + * responsible for setting PROT_BTI on everything except + * itself. */ - if (static_branch_likely(&arm64_const_caps_ready)) - preempt_schedule_irq(); + if (is_interp != has_interp) + return prot; + + if (!(state->flags & ARM64_ELF_BTI)) + return prot; + + if (prot & PROT_EXEC) + prot |= PROT_BTI; + + return prot; } +#endif diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c new file mode 100644 index 000000000000..6268a13a1d58 --- /dev/null +++ b/arch/arm64/kernel/proton-pack.c @@ -0,0 +1,1156 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Handle detection, reporting and mitigation of Spectre v1, v2, v3a and v4, as + * detailed at: + * + * https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability + * + * This code was originally written hastily under an awful lot of stress and so + * aspects of it are somewhat hacky. Unfortunately, changing anything in here + * instantly makes me feel ill. Thanks, Jann. Thann. + * + * Copyright (C) 2018 ARM Ltd, All Rights Reserved. + * Copyright (C) 2020 Google LLC + * + * "If there's something strange in your neighbourhood, who you gonna call?" + * + * Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org> + */ + +#include <linux/arm-smccc.h> +#include <linux/bpf.h> +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/nospec.h> +#include <linux/prctl.h> +#include <linux/sched/task_stack.h> + +#include <asm/debug-monitors.h> +#include <asm/insn.h> +#include <asm/spectre.h> +#include <asm/traps.h> +#include <asm/vectors.h> +#include <asm/virt.h> + +/* + * We try to ensure that the mitigation state can never change as the result of + * onlining a late CPU. + */ +static void update_mitigation_state(enum mitigation_state *oldp, + enum mitigation_state new) +{ + enum mitigation_state state; + + do { + state = READ_ONCE(*oldp); + if (new <= state) + break; + + /* Userspace almost certainly can't deal with this. */ + if (WARN_ON(system_capabilities_finalized())) + break; + } while (cmpxchg_relaxed(oldp, state, new) != state); +} + +/* + * Spectre v1. + * + * The kernel can't protect userspace for this one: it's each person for + * themselves. Advertise what we're doing and be done with it. + */ +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +/* + * Spectre v2. + * + * This one sucks. A CPU is either: + * + * - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2. + * - Mitigated in hardware and listed in our "safe list". + * - Mitigated in software by firmware. + * - Mitigated in software by a CPU-specific dance in the kernel and a + * firmware call at EL2. + * - Vulnerable. + * + * It's not unlikely for different CPUs in a big.LITTLE system to fall into + * different camps. + */ +static enum mitigation_state spectre_v2_state; + +static bool __read_mostly __nospectre_v2; +static int __init parse_spectre_v2_param(char *str) +{ + __nospectre_v2 = true; + return 0; +} +early_param("nospectre_v2", parse_spectre_v2_param); + +static bool spectre_v2_mitigations_off(void) +{ + bool ret = __nospectre_v2 || cpu_mitigations_off(); + + if (ret) + pr_info_once("spectre-v2 mitigation disabled by command line option\n"); + + return ret; +} + +static const char *get_bhb_affected_string(enum mitigation_state bhb_state) +{ + switch (bhb_state) { + case SPECTRE_UNAFFECTED: + return ""; + default: + case SPECTRE_VULNERABLE: + return ", but not BHB"; + case SPECTRE_MITIGATED: + return ", BHB"; + } +} + +static bool _unprivileged_ebpf_enabled(void) +{ +#ifdef CONFIG_BPF_SYSCALL + return !sysctl_unprivileged_bpf_disabled; +#else + return false; +#endif +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, + char *buf) +{ + enum mitigation_state bhb_state = arm64_get_spectre_bhb_state(); + const char *bhb_str = get_bhb_affected_string(bhb_state); + const char *v2_str = "Branch predictor hardening"; + + switch (spectre_v2_state) { + case SPECTRE_UNAFFECTED: + if (bhb_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "Not affected\n"); + + /* + * Platforms affected by Spectre-BHB can't report + * "Not affected" for Spectre-v2. + */ + v2_str = "CSV2"; + fallthrough; + case SPECTRE_MITIGATED: + if (bhb_state == SPECTRE_MITIGATED && _unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str); + case SPECTRE_VULNERABLE: + fallthrough; + default: + return sprintf(buf, "Vulnerable\n"); + } +} + +static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void) +{ + u64 pfr0; + static const struct midr_range spectre_v2_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + { /* sentinel */ } + }; + + /* If the CPU has CSV2 set, we're safe */ + pfr0 = read_cpuid(ID_AA64PFR0_EL1); + if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_CSV2_SHIFT)) + return SPECTRE_UNAFFECTED; + + /* Alternatively, we have a list of unaffected CPUs */ + if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) + return SPECTRE_UNAFFECTED; + + return SPECTRE_VULNERABLE; +} + +static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if (spectre_v2_get_cpu_hw_mitigation_state() == SPECTRE_UNAFFECTED) + return false; + + if (spectre_v2_get_cpu_fw_mitigation_state() == SPECTRE_UNAFFECTED) + return false; + + return true; +} + +enum mitigation_state arm64_get_spectre_v2_state(void) +{ + return spectre_v2_state; +} + +DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +static void install_bp_hardening_cb(bp_hardening_cb_t fn) +{ + __this_cpu_write(bp_hardening_data.fn, fn); + + /* + * Vinz Clortho takes the hyp_vecs start/end "keys" at + * the door when we're a guest. Skip the hyp-vectors work. + */ + if (!is_hyp_mode_available()) + return; + + __this_cpu_write(bp_hardening_data.slot, HYP_VECTOR_SPECTRE_DIRECT); +} + +/* Called during entry so must be noinstr */ +static noinstr void call_smc_arch_workaround_1(void) +{ + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +/* Called during entry so must be noinstr */ +static noinstr void call_hvc_arch_workaround_1(void) +{ + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +/* Called during entry so must be noinstr */ +static noinstr void qcom_link_stack_sanitisation(void) +{ + u64 tmp; + + asm volatile("mov %0, x30 \n" + ".rept 16 \n" + "bl . + 4 \n" + ".endr \n" + "mov x30, %0 \n" + : "=&r" (tmp)); +} + +static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void) +{ + u32 midr = read_cpuid_id(); + if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) && + ((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1)) + return NULL; + + return qcom_link_stack_sanitisation; +} + +static enum mitigation_state spectre_v2_enable_fw_mitigation(void) +{ + bp_hardening_cb_t cb; + enum mitigation_state state; + + state = spectre_v2_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + return state; + + if (spectre_v2_mitigations_off()) + return SPECTRE_VULNERABLE; + + switch (arm_smccc_1_1_get_conduit()) { + case SMCCC_CONDUIT_HVC: + cb = call_hvc_arch_workaround_1; + break; + + case SMCCC_CONDUIT_SMC: + cb = call_smc_arch_workaround_1; + break; + + default: + return SPECTRE_VULNERABLE; + } + + /* + * Prefer a CPU-specific workaround if it exists. Note that we + * still rely on firmware for the mitigation at EL2. + */ + cb = spectre_v2_get_sw_mitigation_cb() ?: cb; + install_bp_hardening_cb(cb); + return SPECTRE_MITIGATED; +} + +void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + enum mitigation_state state; + + WARN_ON(preemptible()); + + state = spectre_v2_get_cpu_hw_mitigation_state(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v2_enable_fw_mitigation(); + + update_mitigation_state(&spectre_v2_state, state); +} + +/* + * Spectre-v3a. + * + * Phew, there's not an awful lot to do here! We just instruct EL2 to use + * an indirect trampoline for the hyp vectors so that guests can't read + * VBAR_EL2 to defeat randomisation of the hypervisor VA layout. + */ +bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope) +{ + static const struct midr_range spectre_v3a_unsafe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + {}, + }; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return is_midr_in_range_list(read_cpuid_id(), spectre_v3a_unsafe_list); +} + +void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); + + if (this_cpu_has_cap(ARM64_SPECTRE_V3A)) + data->slot += HYP_VECTOR_INDIRECT; +} + +/* + * Spectre v4. + * + * If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is + * either: + * + * - Mitigated in hardware and listed in our "safe list". + * - Mitigated in hardware via PSTATE.SSBS. + * - Mitigated in software by firmware (sometimes referred to as SSBD). + * + * Wait, that doesn't sound so bad, does it? Keep reading... + * + * A major source of headaches is that the software mitigation is enabled both + * on a per-task basis, but can also be forced on for the kernel, necessitating + * both context-switch *and* entry/exit hooks. To make it even worse, some CPUs + * allow EL0 to toggle SSBS directly, which can end up with the prctl() state + * being stale when re-entering the kernel. The usual big.LITTLE caveats apply, + * so you can have systems that have both firmware and SSBS mitigations. This + * means we actually have to reject late onlining of CPUs with mitigations if + * all of the currently onlined CPUs are safelisted, as the mitigation tends to + * be opt-in for userspace. Yes, really, the cure is worse than the disease. + * + * The only good part is that if the firmware mitigation is present, then it is + * present for all CPUs, meaning we don't have to worry about late onlining of a + * vulnerable CPU if one of the boot CPUs is using the firmware mitigation. + * + * Give me a VAX-11/780 any day of the week... + */ +static enum mitigation_state spectre_v4_state; + +/* This is the per-cpu state tracking whether we need to talk to firmware */ +DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); + +enum spectre_v4_policy { + SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, + SPECTRE_V4_POLICY_MITIGATION_ENABLED, + SPECTRE_V4_POLICY_MITIGATION_DISABLED, +}; + +static enum spectre_v4_policy __read_mostly __spectre_v4_policy; + +static const struct spectre_v4_param { + const char *str; + enum spectre_v4_policy policy; +} spectre_v4_params[] = { + { "force-on", SPECTRE_V4_POLICY_MITIGATION_ENABLED, }, + { "force-off", SPECTRE_V4_POLICY_MITIGATION_DISABLED, }, + { "kernel", SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, }, +}; +static int __init parse_spectre_v4_param(char *str) +{ + int i; + + if (!str || !str[0]) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(spectre_v4_params); i++) { + const struct spectre_v4_param *param = &spectre_v4_params[i]; + + if (strncmp(str, param->str, strlen(param->str))) + continue; + + __spectre_v4_policy = param->policy; + return 0; + } + + return -EINVAL; +} +early_param("ssbd", parse_spectre_v4_param); + +/* + * Because this was all written in a rush by people working in different silos, + * we've ended up with multiple command line options to control the same thing. + * Wrap these up in some helpers, which prefer disabling the mitigation if faced + * with contradictory parameters. The mitigation is always either "off", + * "dynamic" or "on". + */ +static bool spectre_v4_mitigations_off(void) +{ + bool ret = cpu_mitigations_off() || + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; + + if (ret) + pr_info_once("spectre-v4 mitigation disabled by command-line option\n"); + + return ret; +} + +/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */ +static bool spectre_v4_mitigations_dynamic(void) +{ + return !spectre_v4_mitigations_off() && + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC; +} + +static bool spectre_v4_mitigations_on(void) +{ + return !spectre_v4_mitigations_off() && + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED; +} + +ssize_t cpu_show_spec_store_bypass(struct device *dev, + struct device_attribute *attr, char *buf) +{ + switch (spectre_v4_state) { + case SPECTRE_UNAFFECTED: + return sprintf(buf, "Not affected\n"); + case SPECTRE_MITIGATED: + return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n"); + case SPECTRE_VULNERABLE: + fallthrough; + default: + return sprintf(buf, "Vulnerable\n"); + } +} + +enum mitigation_state arm64_get_spectre_v4_state(void) +{ + return spectre_v4_state; +} + +static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void) +{ + static const struct midr_range spectre_v4_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + { /* sentinel */ }, + }; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list)) + return SPECTRE_UNAFFECTED; + + /* CPU features are detected first */ + if (this_cpu_has_cap(ARM64_SSBS)) + return SPECTRE_MITIGATED; + + return SPECTRE_VULNERABLE; +} + +static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_2, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + fallthrough; + case SMCCC_RET_NOT_REQUIRED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope) +{ + enum mitigation_state state; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + state = spectre_v4_get_cpu_hw_mitigation_state(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v4_get_cpu_fw_mitigation_state(); + + return state != SPECTRE_UNAFFECTED; +} + +bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr) +{ + const u32 instr_mask = ~(1U << PSTATE_Imm_shift); + const u32 instr_val = 0xd500401f | PSTATE_SSBS; + + if ((instr & instr_mask) != instr_val) + return false; + + if (instr & BIT(PSTATE_Imm_shift)) + regs->pstate |= PSR_SSBS_BIT; + else + regs->pstate &= ~PSR_SSBS_BIT; + + arm64_skip_faulting_instruction(regs, 4); + return true; +} + +static enum mitigation_state spectre_v4_enable_hw_mitigation(void) +{ + enum mitigation_state state; + + /* + * If the system is mitigated but this CPU doesn't have SSBS, then + * we must be on the safelist and there's nothing more to do. + */ + state = spectre_v4_get_cpu_hw_mitigation_state(); + if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS)) + return state; + + if (spectre_v4_mitigations_off()) { + sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); + set_pstate_ssbs(1); + return SPECTRE_VULNERABLE; + } + + /* SCTLR_EL1.DSSBS was initialised to 0 during boot */ + set_pstate_ssbs(0); + return SPECTRE_MITIGATED; +} + +/* + * Patch a branch over the Spectre-v4 mitigation code with a NOP so that + * we fallthrough and check whether firmware needs to be called on this CPU. + */ +void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); /* Branch -> NOP */ + + if (spectre_v4_mitigations_off()) + return; + + if (cpus_have_cap(ARM64_SSBS)) + return; + + if (spectre_v4_mitigations_dynamic()) + *updptr = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* + * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction + * to call into firmware to adjust the mitigation state. + */ +void __init smccc_patch_fw_mitigation_conduit(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + u32 insn; + + BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */ + + switch (arm_smccc_1_1_get_conduit()) { + case SMCCC_CONDUIT_HVC: + insn = aarch64_insn_get_hvc_value(); + break; + case SMCCC_CONDUIT_SMC: + insn = aarch64_insn_get_smc_value(); + break; + default: + return; + } + + *updptr = cpu_to_le32(insn); +} + +static enum mitigation_state spectre_v4_enable_fw_mitigation(void) +{ + enum mitigation_state state; + + state = spectre_v4_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + return state; + + if (spectre_v4_mitigations_off()) { + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL); + return SPECTRE_VULNERABLE; + } + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL); + + if (spectre_v4_mitigations_dynamic()) + __this_cpu_write(arm64_ssbd_callback_required, 1); + + return SPECTRE_MITIGATED; +} + +void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + enum mitigation_state state; + + WARN_ON(preemptible()); + + state = spectre_v4_enable_hw_mitigation(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v4_enable_fw_mitigation(); + + update_mitigation_state(&spectre_v4_state, state); +} + +static void __update_pstate_ssbs(struct pt_regs *regs, bool state) +{ + u64 bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; + + if (state) + regs->pstate |= bit; + else + regs->pstate &= ~bit; +} + +void spectre_v4_enable_task_mitigation(struct task_struct *tsk) +{ + struct pt_regs *regs = task_pt_regs(tsk); + bool ssbs = false, kthread = tsk->flags & PF_KTHREAD; + + if (spectre_v4_mitigations_off()) + ssbs = true; + else if (spectre_v4_mitigations_dynamic() && !kthread) + ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD); + + __update_pstate_ssbs(regs, ssbs); +} + +/* + * The Spectre-v4 mitigation can be controlled via a prctl() from userspace. + * This is interesting because the "speculation disabled" behaviour can be + * configured so that it is preserved across exec(), which means that the + * prctl() may be necessary even when PSTATE.SSBS can be toggled directly + * from userspace. + */ +static void ssbd_prctl_enable_mitigation(struct task_struct *task) +{ + task_clear_spec_ssb_noexec(task); + task_set_spec_ssb_disable(task); + set_tsk_thread_flag(task, TIF_SSBD); +} + +static void ssbd_prctl_disable_mitigation(struct task_struct *task) +{ + task_clear_spec_ssb_noexec(task); + task_clear_spec_ssb_disable(task); + clear_tsk_thread_flag(task, TIF_SSBD); +} + +static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + switch (ctrl) { + case PR_SPEC_ENABLE: + /* Enable speculation: disable mitigation */ + /* + * Force disabled speculation prevents it from being + * re-enabled. + */ + if (task_spec_ssb_force_disable(task)) + return -EPERM; + + /* + * If the mitigation is forced on, then speculation is forced + * off and we again prevent it from being re-enabled. + */ + if (spectre_v4_mitigations_on()) + return -EPERM; + + ssbd_prctl_disable_mitigation(task); + break; + case PR_SPEC_FORCE_DISABLE: + /* Force disable speculation: force enable mitigation */ + /* + * If the mitigation is forced off, then speculation is forced + * on and we prevent it from being disabled. + */ + if (spectre_v4_mitigations_off()) + return -EPERM; + + task_set_spec_ssb_force_disable(task); + fallthrough; + case PR_SPEC_DISABLE: + /* Disable speculation: enable mitigation */ + /* Same as PR_SPEC_FORCE_DISABLE */ + if (spectre_v4_mitigations_off()) + return -EPERM; + + ssbd_prctl_enable_mitigation(task); + break; + case PR_SPEC_DISABLE_NOEXEC: + /* Disable speculation until execve(): enable mitigation */ + /* + * If the mitigation state is forced one way or the other, then + * we must fail now before we try to toggle it on execve(). + */ + if (task_spec_ssb_force_disable(task) || + spectre_v4_mitigations_off() || + spectre_v4_mitigations_on()) { + return -EPERM; + } + + ssbd_prctl_enable_mitigation(task); + task_set_spec_ssb_noexec(task); + break; + default: + return -ERANGE; + } + + spectre_v4_enable_task_mitigation(task); + return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssbd_prctl_set(task, ctrl); + default: + return -ENODEV; + } +} + +static int ssbd_prctl_get(struct task_struct *task) +{ + switch (spectre_v4_state) { + case SPECTRE_UNAFFECTED: + return PR_SPEC_NOT_AFFECTED; + case SPECTRE_MITIGATED: + if (spectre_v4_mitigations_on()) + return PR_SPEC_NOT_AFFECTED; + + if (spectre_v4_mitigations_dynamic()) + break; + + /* Mitigations are disabled, so we're vulnerable. */ + fallthrough; + case SPECTRE_VULNERABLE: + fallthrough; + default: + return PR_SPEC_ENABLE; + } + + /* Check the mitigation state for this task */ + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + + if (task_spec_ssb_noexec(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; + + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssbd_prctl_get(task); + default: + return -ENODEV; + } +} + +/* + * Spectre BHB. + * + * A CPU is either: + * - Mitigated by a branchy loop a CPU specific number of times, and listed + * in our "loop mitigated list". + * - Mitigated in software by the firmware Spectre v2 call. + * - Has the ClearBHB instruction to perform the mitigation. + * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no + * software mitigation in the vectors is needed. + * - Has CSV2.3, so is unaffected. + */ +static enum mitigation_state spectre_bhb_state; + +enum mitigation_state arm64_get_spectre_bhb_state(void) +{ + return spectre_bhb_state; +} + +enum bhb_mitigation_bits { + BHB_LOOP, + BHB_FW, + BHB_HW, + BHB_INSN, +}; +static unsigned long system_bhb_mitigations; + +/* + * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any + * SCOPE_SYSTEM call will give the right answer. + */ +u8 spectre_bhb_loop_affected(int scope) +{ + u8 k = 0; + static u8 max_bhb_k; + + if (scope == SCOPE_LOCAL_CPU) { + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + {}, + }; + static const struct midr_range spectre_bhb_k11_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k11_list)) + k = 11; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) + k = 8; + + max_bhb_k = max(max_bhb_k, k); + } else { + k = max_bhb_k; + } + + return k; +} + +static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_3, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +static bool is_spectre_bhb_fw_affected(int scope) +{ + static bool system_affected; + enum mitigation_state fw_state; + bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE; + static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), + {}, + }; + bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), + spectre_bhb_firmware_mitigated_list); + + if (scope != SCOPE_LOCAL_CPU) + return system_affected; + + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { + system_affected = true; + return true; + } + + return false; +} + +static bool supports_ecbhb(int scope) +{ + u64 mmfr1; + + if (scope == SCOPE_LOCAL_CPU) + mmfr1 = read_sysreg_s(SYS_ID_AA64MMFR1_EL1); + else + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + + return cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_EL1_ECBHB_SHIFT); +} + +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, + int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if (supports_csv2p3(scope)) + return false; + + if (supports_clearbhb(scope)) + return true; + + if (spectre_bhb_loop_affected(scope)) + return true; + + if (is_spectre_bhb_fw_affected(scope)) + return true; + + return false; +} + +static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) +{ + const char *v = arm64_get_bp_hardening_vector(slot); + + __this_cpu_write(this_cpu_vector, v); + + /* + * When KPTI is in use, the vectors are switched when exiting to + * user-space. + */ + if (cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0)) + return; + + write_sysreg(v, vbar_el1); + isb(); +} + +static bool __read_mostly __nospectre_bhb; +static int __init parse_spectre_bhb_param(char *str) +{ + __nospectre_bhb = true; + return 0; +} +early_param("nospectre_bhb", parse_spectre_bhb_param); + +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) +{ + bp_hardening_cb_t cpu_cb; + enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); + + if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) + return; + + if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) { + /* No point mitigating Spectre-BHB alone. */ + } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { + pr_info_once("spectre-bhb mitigation disabled by compile time option\n"); + } else if (cpu_mitigations_off() || __nospectre_bhb) { + pr_info_once("spectre-bhb mitigation disabled by command line option\n"); + } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { + state = SPECTRE_MITIGATED; + set_bit(BHB_HW, &system_bhb_mitigations); + } else if (supports_clearbhb(SCOPE_LOCAL_CPU)) { + /* + * Ensure KVM uses the indirect vector which will have ClearBHB + * added. + */ + if (!data->slot) + data->slot = HYP_VECTOR_INDIRECT; + + this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); + state = SPECTRE_MITIGATED; + set_bit(BHB_INSN, &system_bhb_mitigations); + } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + /* + * Ensure KVM uses the indirect vector which will have the + * branchy-loop added. A57/A72-r0 will already have selected + * the spectre-indirect vector, which is sufficient for BHB + * too. + */ + if (!data->slot) + data->slot = HYP_VECTOR_INDIRECT; + + this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); + state = SPECTRE_MITIGATED; + set_bit(BHB_LOOP, &system_bhb_mitigations); + } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (fw_state == SPECTRE_MITIGATED) { + /* + * Ensure KVM uses one of the spectre bp_hardening + * vectors. The indirect vector doesn't include the EL3 + * call, so needs upgrading to + * HYP_VECTOR_SPECTRE_INDIRECT. + */ + if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) + data->slot += 1; + + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * The WA3 call in the vectors supersedes the WA1 call + * made during context-switch. Uninstall any firmware + * bp_hardening callback. + */ + cpu_cb = spectre_v2_get_sw_mitigation_cb(); + if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + set_bit(BHB_FW, &system_bhb_mitigations); + } + } + + update_mitigation_state(&spectre_bhb_state, state); +} + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); + + if (test_bit(BHB_LOOP, &system_bhb_mitigations)) + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); + + if (test_bit(BHB_FW, &system_bhb_mitigations)) + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* Patched to correct the immediate */ +void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr++ = cpu_to_le32(insn); +} + +/* Patched to mov WA3 when supported */ +void noinstr spectre_bhb_patch_wa3(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY) || + !test_bit(BHB_FW, &system_bhb_mitigations)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + + insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_ORR, + AARCH64_INSN_VARIANT_32BIT, + AARCH64_INSN_REG_ZR, rd, + ARM_SMCCC_ARCH_WORKAROUND_3); + if (WARN_ON_ONCE(insn == AARCH64_BREAK_FAULT)) + return; + + *updptr++ = cpu_to_le32(insn); +} + +/* Patched to NOP when not supported */ +void __init spectre_bhb_patch_clearbhb(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 2); + + if (test_bit(BHB_INSN, &system_bhb_mitigations)) + return; + + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +#ifdef CONFIG_BPF_SYSCALL +#define EBPF_WARN "Unprivileged eBPF is enabled, data leaks possible via Spectre v2 BHB attacks!\n" +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_state == SPECTRE_VULNERABLE || + spectre_bhb_state != SPECTRE_MITIGATED) + return; + + if (!new_state) + pr_err("WARNING: %s", EBPF_WARN); +} +#endif diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c index 43ae4e0c968f..29a8e444db83 100644 --- a/arch/arm64/kernel/psci.c +++ b/arch/arm64/kernel/psci.c @@ -38,7 +38,8 @@ static int __init cpu_psci_cpu_prepare(unsigned int cpu) static int cpu_psci_cpu_boot(unsigned int cpu) { - int err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa_symbol(secondary_entry)); + phys_addr_t pa_secondary_entry = __pa_symbol(secondary_entry); + int err = psci_ops.cpu_on(cpu_logical_map(cpu), pa_secondary_entry); if (err) pr_err("failed to boot CPU%d (%d)\n", cpu, err); @@ -66,7 +67,6 @@ static int cpu_psci_cpu_disable(unsigned int cpu) static void cpu_psci_cpu_die(unsigned int cpu) { - int ret; /* * There are no known implementations of PSCI actually using the * power state field, pass a sensible default for now. @@ -74,9 +74,7 @@ static void cpu_psci_cpu_die(unsigned int cpu) u32 state = PSCI_POWER_STATE_TYPE_POWER_DOWN << PSCI_0_2_POWER_STATE_TYPE_SHIFT; - ret = psci_ops.cpu_off(state); - - pr_crit("unable to power off CPU%u (%d)\n", cpu, ret); + psci_ops.cpu_off(state); } static int cpu_psci_cpu_kill(unsigned int cpu) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 6771c399d40c..dc6cf0e37194 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -27,14 +27,14 @@ #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/regset.h> -#include <linux/tracehook.h> #include <linux/elf.h> +#include <linux/rseq.h> #include <asm/compat.h> #include <asm/cpufeature.h> #include <asm/debug-monitors.h> #include <asm/fpsimd.h> -#include <asm/pgtable.h> +#include <asm/mte.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> #include <asm/syscall.h> @@ -122,7 +122,7 @@ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) || - on_irq_stack(addr, NULL); + on_irq_stack(addr, sizeof(unsigned long)); } /** @@ -192,14 +192,12 @@ static void ptrace_hbptriggered(struct perf_event *bp, break; } } - arm64_force_sig_ptrace_errno_trap(si_errno, - (void __user *)bkpt->trigger, + arm64_force_sig_ptrace_errno_trap(si_errno, bkpt->trigger, desc); + return; } #endif - arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, - (void __user *)(bkpt->trigger), - desc); + arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); } /* @@ -475,11 +473,10 @@ static int ptrace_hbp_set_addr(unsigned int note_type, static int hw_break_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { unsigned int note_type = regset->core_note_type; - int ret, idx = 0, offset, limit; + int ret, idx = 0; u32 info, ctrl; u64 addr; @@ -488,49 +485,21 @@ static int hw_break_get(struct task_struct *target, if (ret) return ret; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &info, 0, - sizeof(info)); - if (ret) - return ret; - - /* Pad */ - offset = offsetof(struct user_hwdebug_state, pad); - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, offset, - offset + PTRACE_HBP_PAD_SZ); - if (ret) - return ret; - + membuf_write(&to, &info, sizeof(info)); + membuf_zero(&to, sizeof(u32)); /* (address, ctrl) registers */ - offset = offsetof(struct user_hwdebug_state, dbg_regs); - limit = regset->n * regset->size; - while (count && offset < limit) { + while (to.left) { ret = ptrace_hbp_get_addr(note_type, target, idx, &addr); if (ret) return ret; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &addr, - offset, offset + PTRACE_HBP_ADDR_SZ); - if (ret) - return ret; - offset += PTRACE_HBP_ADDR_SZ; - ret = ptrace_hbp_get_ctrl(note_type, target, idx, &ctrl); if (ret) return ret; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &ctrl, - offset, offset + PTRACE_HBP_CTRL_SZ); - if (ret) - return ret; - offset += PTRACE_HBP_CTRL_SZ; - - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - offset, - offset + PTRACE_HBP_PAD_SZ); - if (ret) - return ret; - offset += PTRACE_HBP_PAD_SZ; + membuf_store(&to, addr); + membuf_store(&to, ctrl); + membuf_zero(&to, sizeof(u32)); idx++; } - return 0; } @@ -546,9 +515,7 @@ static int hw_break_set(struct task_struct *target, /* Resource info and pad */ offset = offsetof(struct user_hwdebug_state, dbg_regs); - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset); - if (ret) - return ret; + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset); /* (address, ctrl) registers */ limit = regset->n * regset->size; @@ -575,11 +542,8 @@ static int hw_break_set(struct task_struct *target, return ret; offset += PTRACE_HBP_CTRL_SZ; - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - offset, - offset + PTRACE_HBP_PAD_SZ); - if (ret) - return ret; + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + offset, offset + PTRACE_HBP_PAD_SZ); offset += PTRACE_HBP_PAD_SZ; idx++; } @@ -590,11 +554,10 @@ static int hw_break_set(struct task_struct *target, static int gpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct user_pt_regs *uregs = &task_pt_regs(target)->user_regs; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, 0, -1); + return membuf_write(&to, uregs, sizeof(*uregs)); } static int gpr_set(struct task_struct *target, const struct user_regset *regset, @@ -615,13 +578,19 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset, return 0; } +static int fpr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!system_supports_fpsimd()) + return -ENODEV; + return regset->n; +} + /* * TODO: update fp accessors for lazy context switching (sync/flush hwstate) */ static int __fpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, unsigned int start_pos) + struct membuf to) { struct user_fpsimd_state *uregs; @@ -629,18 +598,19 @@ static int __fpr_get(struct task_struct *target, uregs = &target->thread.uw.fpsimd_state; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, - start_pos, start_pos + sizeof(*uregs)); + return membuf_write(&to, uregs, sizeof(*uregs)); } static int fpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { + if (!system_supports_fpsimd()) + return -EINVAL; + if (target == current) fpsimd_preserve_current_state(); - return __fpr_get(target, regset, pos, count, kbuf, ubuf, 0); + return __fpr_get(target, regset, to); } static int __fpr_set(struct task_struct *target, @@ -676,6 +646,9 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, { int ret; + if (!system_supports_fpsimd()) + return -EINVAL; + ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, 0); if (ret) return ret; @@ -687,15 +660,20 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, } static int tls_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - unsigned long *tls = &target->thread.uw.tp_value; + int ret; if (target == current) tls_preserve_current_state(); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, tls, 0, -1); + ret = membuf_store(&to, target->thread.uw.tp_value); + if (system_supports_tpidr2()) + ret = membuf_store(&to, target->thread.tpidr2_el0); + else + ret = membuf_zero(&to, sizeof(u64)); + + return ret; } static int tls_set(struct task_struct *target, const struct user_regset *regset, @@ -703,25 +681,28 @@ static int tls_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { int ret; - unsigned long tls = target->thread.uw.tp_value; + unsigned long tls[2]; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tls, 0, -1); + tls[0] = target->thread.uw.tp_value; + if (system_supports_tpidr2()) + tls[1] = target->thread.tpidr2_el0; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, tls, 0, count); if (ret) return ret; - target->thread.uw.tp_value = tls; + target->thread.uw.tp_value = tls[0]; + if (system_supports_tpidr2()) + target->thread.tpidr2_el0 = tls[1]; + return ret; } static int system_call_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - int syscallno = task_pt_regs(target)->syscallno; - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &syscallno, 0, -1); + return membuf_store(&to, task_pt_regs(target)->syscallno); } static int system_call_set(struct task_struct *target, @@ -743,21 +724,51 @@ static int system_call_set(struct task_struct *target, #ifdef CONFIG_ARM64_SVE static void sve_init_header_from_task(struct user_sve_header *header, - struct task_struct *target) + struct task_struct *target, + enum vec_type type) { unsigned int vq; + bool active; + bool fpsimd_only; + enum vec_type task_type; memset(header, 0, sizeof(*header)); - header->flags = test_tsk_thread_flag(target, TIF_SVE) ? - SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD; - if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) - header->flags |= SVE_PT_VL_INHERIT; + /* Check if the requested registers are active for the task */ + if (thread_sm_enabled(&target->thread)) + task_type = ARM64_VEC_SME; + else + task_type = ARM64_VEC_SVE; + active = (task_type == type); + + switch (type) { + case ARM64_VEC_SVE: + if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + fpsimd_only = !test_tsk_thread_flag(target, TIF_SVE); + break; + case ARM64_VEC_SME: + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + fpsimd_only = false; + break; + default: + WARN_ON_ONCE(1); + return; + } + + if (active) { + if (fpsimd_only) { + header->flags |= SVE_PT_REGS_FPSIMD; + } else { + header->flags |= SVE_PT_REGS_SVE; + } + } - header->vl = target->thread.sve_vl; + header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); - header->max_vl = sve_max_vl; + header->max_vl = vec_max_vl(type); header->size = SVE_PT_SIZE(vq, header->flags); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); @@ -768,99 +779,79 @@ static unsigned int sve_size_from_header(struct user_sve_header const *header) return ALIGN(header->size, SVE_VQ_BYTES); } -static unsigned int sve_get_size(struct task_struct *target, - const struct user_regset *regset) -{ - struct user_sve_header header; - - if (!system_supports_sve()) - return 0; - - sve_init_header_from_task(&header, target); - return sve_size_from_header(&header); -} - -static int sve_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) +static int sve_get_common(struct task_struct *target, + const struct user_regset *regset, + struct membuf to, + enum vec_type type) { - int ret; struct user_sve_header header; unsigned int vq; unsigned long start, end; - if (!system_supports_sve()) - return -EINVAL; - /* Header */ - sve_init_header_from_task(&header, target); + sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &header, - 0, sizeof(header)); - if (ret) - return ret; + membuf_write(&to, &header, sizeof(header)); if (target == current) fpsimd_preserve_current_state(); - /* Registers: FPSIMD-only case */ - BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); - if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) - return __fpr_get(target, regset, pos, count, kbuf, ubuf, - SVE_PT_FPSIMD_OFFSET); + BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); - /* Otherwise: full SVE case */ + switch ((header.flags & SVE_PT_REGS_MASK)) { + case SVE_PT_REGS_FPSIMD: + return __fpr_get(target, regset, to); - BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); - start = SVE_PT_SVE_OFFSET; - end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - target->thread.sve_state, - start, end); - if (ret) - return ret; + case SVE_PT_REGS_SVE: + start = SVE_PT_SVE_OFFSET; + end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); + membuf_write(&to, target->thread.sve_state, end - start); - start = end; - end = SVE_PT_SVE_FPSR_OFFSET(vq); - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - start, end); - if (ret) - return ret; + start = end; + end = SVE_PT_SVE_FPSR_OFFSET(vq); + membuf_zero(&to, end - start); - /* - * Copy fpsr, and fpcr which must follow contiguously in - * struct fpsimd_state: - */ - start = end; - end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.uw.fpsimd_state.fpsr, - start, end); - if (ret) - return ret; + /* + * Copy fpsr, and fpcr which must follow contiguously in + * struct fpsimd_state: + */ + start = end; + end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; + membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, + end - start); - start = end; - end = sve_size_from_header(&header); - return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - start, end); + start = end; + end = sve_size_from_header(&header); + return membuf_zero(&to, end - start); + + default: + return 0; + } } -static int sve_set(struct task_struct *target, +static int sve_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + struct membuf to) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SVE); +} + +static int sve_set_common(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf, + enum vec_type type) { int ret; struct user_sve_header header; unsigned int vq; unsigned long start, end; - if (!system_supports_sve()) - return -EINVAL; - /* Header */ if (count < sizeof(header)) return -EINVAL; @@ -871,15 +862,47 @@ static int sve_set(struct task_struct *target, /* * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by - * sve_set_vector_length(), which will also validate them for us: + * vec_set_vector_length(), which will also validate them for us: */ - ret = sve_set_vector_length(target, header.vl, + ret = vec_set_vector_length(target, type, header.vl, ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); if (ret) goto out; /* Actual VL set may be less than the user asked for: */ - vq = sve_vq_from_vl(target->thread.sve_vl); + vq = sve_vq_from_vl(task_get_vl(target, type)); + + /* Enter/exit streaming mode */ + if (system_supports_sme()) { + u64 old_svcr = target->thread.svcr; + + switch (type) { + case ARM64_VEC_SVE: + target->thread.svcr &= ~SVCR_SM_MASK; + break; + case ARM64_VEC_SME: + target->thread.svcr |= SVCR_SM_MASK; + + /* + * Disable traps and ensure there is SME storage but + * preserve any currently set values in ZA/ZT. + */ + sme_alloc(target, false); + set_tsk_thread_flag(target, TIF_SME); + break; + default: + WARN_ON_ONCE(1); + ret = -EINVAL; + goto out; + } + + /* + * If we switched then invalidate any existing SVE + * state and ensure there's storage. + */ + if (target->thread.svcr != old_svcr) + sve_alloc(target, true); + } /* Registers: FPSIMD-only case */ @@ -888,10 +911,14 @@ static int sve_set(struct task_struct *target, ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, SVE_PT_FPSIMD_OFFSET); clear_tsk_thread_flag(target, TIF_SVE); + target->thread.fp_type = FP_STATE_FPSIMD; goto out; } - /* Otherwise: full SVE case */ + /* + * Otherwise: no registers or full SVE case. For backwards + * compatibility reasons we treat empty flags as SVE registers. + */ /* * If setting a different VL from the requested VL and there is @@ -903,15 +930,25 @@ static int sve_set(struct task_struct *target, goto out; } - sve_alloc(target); + sve_alloc(target, true); + if (!target->thread.sve_state) { + ret = -ENOMEM; + clear_tsk_thread_flag(target, TIF_SVE); + target->thread.fp_type = FP_STATE_FPSIMD; + goto out; + } /* * Ensure target->thread.sve_state is up to date with target's - * FPSIMD regs, so that a short copyin leaves trailing registers - * unmodified. + * FPSIMD regs, so that a short copyin leaves trailing + * registers unmodified. Only enable SVE if we are + * configuring normal SVE, a system with streaming SVE may not + * have normal SVE. */ fpsimd_sync_to_sve(target); - set_tsk_thread_flag(target, TIF_SVE); + if (type == ARM64_VEC_SVE) + set_tsk_thread_flag(target, TIF_SVE); + target->thread.fp_type = FP_STATE_SVE; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; @@ -924,10 +961,7 @@ static int sve_set(struct task_struct *target, start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - start, end); - if (ret) - goto out; + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, start, end); /* * Copy fpsr, and fpcr which must follow contiguously in @@ -944,13 +978,238 @@ out: return ret; } +static int sve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SVE); +} + #endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int ssve_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SME); +} + +static int ssve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SME); +} + +static int za_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + memset(&header, 0, sizeof(header)); + + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header.flags |= ZA_PT_VL_INHERIT; + + header.vl = task_get_sme_vl(target); + vq = sve_vq_from_vl(header.vl); + header.max_vl = sme_max_vl(); + header.max_size = ZA_PT_SIZE(vq); + + /* If ZA is not active there is only the header */ + if (thread_za_enabled(&target->thread)) + header.size = ZA_PT_SIZE(vq); + else + header.size = ZA_PT_ZA_OFFSET; + + membuf_write(&to, &header, sizeof(header)); + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + end = ZA_PT_ZA_OFFSET; + + if (target == current) + fpsimd_preserve_current_state(); + + /* Any register data to include? */ + if (thread_za_enabled(&target->thread)) { + start = end; + end = ZA_PT_SIZE(vq); + membuf_write(&to, target->thread.sme_state, end - start); + } + + /* Zero any trailing padding */ + start = end; + end = ALIGN(header.size, SVE_VQ_BYTES); + return membuf_zero(&to, end - start); +} + +static int za_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + if (count < sizeof(header)) + return -EINVAL; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, + 0, sizeof(header)); + if (ret) + goto out; + + /* + * All current ZA_PT_* flags are consumed by + * vec_set_vector_length(), which will also validate them for + * us: + */ + ret = vec_set_vector_length(target, ARM64_VEC_SME, header.vl, + ((unsigned long)header.flags) << 16); + if (ret) + goto out; + + /* Actual VL set may be less than the user asked for: */ + vq = sve_vq_from_vl(task_get_sme_vl(target)); + + /* Ensure there is some SVE storage for streaming mode */ + if (!target->thread.sve_state) { + sve_alloc(target, false); + if (!target->thread.sve_state) { + ret = -ENOMEM; + goto out; + } + } + + /* + * Only flush the storage if PSTATE.ZA was not already set, + * otherwise preserve any existing data. + */ + sme_alloc(target, !thread_za_enabled(&target->thread)); + if (!target->thread.sme_state) + return -ENOMEM; + + /* If there is no data then disable ZA */ + if (!count) { + target->thread.svcr &= ~SVCR_ZA_MASK; + goto out; + } + + /* + * If setting a different VL from the requested VL and there is + * register data, the data layout will be wrong: don't even + * try to set the registers in this case. + */ + if (vq != sve_vq_from_vl(header.vl)) { + ret = -EIO; + goto out; + } + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + start = ZA_PT_ZA_OFFSET; + end = ZA_PT_SIZE(vq); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.sme_state, + start, end); + if (ret) + goto out; + + /* Mark ZA as active and let userspace use it */ + set_tsk_thread_flag(target, TIF_SME); + target->thread.svcr |= SVCR_ZA_MASK; + +out: + fpsimd_flush_task_state(target); + return ret; +} + +static int zt_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_sme2()) + return -EINVAL; + + /* + * If PSTATE.ZA is not set then ZT will be zeroed when it is + * enabled so report the current register value as zero. + */ + if (thread_za_enabled(&target->thread)) + membuf_write(&to, thread_zt_state(&target->thread), + ZT_SIG_REG_BYTES); + else + membuf_zero(&to, ZT_SIG_REG_BYTES); + + return 0; +} + +static int zt_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!system_supports_sme2()) + return -EINVAL; + + /* Ensure SVE storage in case this is first use of SME */ + sve_alloc(target, false); + if (!target->thread.sve_state) + return -ENOMEM; + + if (!thread_za_enabled(&target->thread)) { + sme_alloc(target, true); + if (!target->thread.sme_state) + return -ENOMEM; + } + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + thread_zt_state(&target->thread), + 0, ZT_SIG_REG_BYTES); + if (ret == 0) { + target->thread.svcr |= SVCR_ZA_MASK; + set_tsk_thread_flag(target, TIF_SME); + } + + fpsimd_flush_task_state(target); + + return ret; +} + +#endif /* CONFIG_ARM64_SME */ + #ifdef CONFIG_ARM64_PTR_AUTH static int pac_mask_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { /* * The PAC bits can differ across data and instruction pointers @@ -966,7 +1225,39 @@ static int pac_mask_get(struct task_struct *target, if (!system_supports_address_auth()) return -EINVAL; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &uregs, 0, -1); + return membuf_write(&to, &uregs, sizeof(uregs)); +} + +static int pac_enabled_keys_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long enabled_keys = ptrauth_get_enabled_keys(target); + + if (IS_ERR_VALUE(enabled_keys)) + return enabled_keys; + + return membuf_write(&to, &enabled_keys, sizeof(enabled_keys)); +} + +static int pac_enabled_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + long enabled_keys = ptrauth_get_enabled_keys(target); + + if (IS_ERR_VALUE(enabled_keys)) + return enabled_keys; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &enabled_keys, 0, + sizeof(long)); + if (ret) + return ret; + + return ptrauth_set_enabled_keys(target, PR_PAC_ENABLED_KEYS_MASK, + enabled_keys); } #ifdef CONFIG_CHECKPOINT_RESTORE @@ -986,7 +1277,7 @@ static struct ptrauth_key pac_key_from_user(__uint128_t ukey) } static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys, - const struct ptrauth_keys *keys) + const struct ptrauth_keys_user *keys) { ukeys->apiakey = pac_key_to_user(&keys->apia); ukeys->apibkey = pac_key_to_user(&keys->apib); @@ -994,7 +1285,7 @@ static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys, ukeys->apdbkey = pac_key_to_user(&keys->apdb); } -static void pac_address_keys_from_user(struct ptrauth_keys *keys, +static void pac_address_keys_from_user(struct ptrauth_keys_user *keys, const struct user_pac_address_keys *ukeys) { keys->apia = pac_key_from_user(ukeys->apiakey); @@ -1005,10 +1296,9 @@ static void pac_address_keys_from_user(struct ptrauth_keys *keys, static int pac_address_keys_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - struct ptrauth_keys *keys = &target->thread.keys_user; + struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_address_keys user_keys; if (!system_supports_address_auth()) @@ -1016,8 +1306,7 @@ static int pac_address_keys_get(struct task_struct *target, pac_address_keys_to_user(&user_keys, keys); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &user_keys, 0, -1); + return membuf_write(&to, &user_keys, sizeof(user_keys)); } static int pac_address_keys_set(struct task_struct *target, @@ -1025,7 +1314,7 @@ static int pac_address_keys_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct ptrauth_keys *keys = &target->thread.keys_user; + struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_address_keys user_keys; int ret; @@ -1043,12 +1332,12 @@ static int pac_address_keys_set(struct task_struct *target, } static void pac_generic_keys_to_user(struct user_pac_generic_keys *ukeys, - const struct ptrauth_keys *keys) + const struct ptrauth_keys_user *keys) { ukeys->apgakey = pac_key_to_user(&keys->apga); } -static void pac_generic_keys_from_user(struct ptrauth_keys *keys, +static void pac_generic_keys_from_user(struct ptrauth_keys_user *keys, const struct user_pac_generic_keys *ukeys) { keys->apga = pac_key_from_user(ukeys->apgakey); @@ -1056,10 +1345,9 @@ static void pac_generic_keys_from_user(struct ptrauth_keys *keys, static int pac_generic_keys_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - struct ptrauth_keys *keys = &target->thread.keys_user; + struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_generic_keys user_keys; if (!system_supports_generic_auth()) @@ -1067,8 +1355,7 @@ static int pac_generic_keys_get(struct task_struct *target, pac_generic_keys_to_user(&user_keys, keys); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &user_keys, 0, -1); + return membuf_write(&to, &user_keys, sizeof(user_keys)); } static int pac_generic_keys_set(struct task_struct *target, @@ -1076,7 +1363,7 @@ static int pac_generic_keys_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct ptrauth_keys *keys = &target->thread.keys_user; + struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_generic_keys user_keys; int ret; @@ -1095,6 +1382,35 @@ static int pac_generic_keys_set(struct task_struct *target, #endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_ARM64_PTR_AUTH */ +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI +static int tagged_addr_ctrl_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long ctrl = get_tagged_addr_ctrl(target); + + if (IS_ERR_VALUE(ctrl)) + return ctrl; + + return membuf_write(&to, &ctrl, sizeof(ctrl)); +} + +static int tagged_addr_ctrl_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + return set_tagged_addr_ctrl(target, ctrl); +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1107,13 +1423,22 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_SVE REGSET_SVE, #endif +#ifdef CONFIG_ARM64_SME + REGSET_SSVE, + REGSET_ZA, + REGSET_ZT, +#endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, + REGSET_PAC_ENABLED_KEYS, #ifdef CONFIG_CHECKPOINT_RESTORE REGSET_PACA_KEYS, REGSET_PACG_KEYS, #endif #endif +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI + REGSET_TAGGED_ADDR_CTRL, +#endif }; static const struct user_regset aarch64_regsets[] = { @@ -1122,7 +1447,7 @@ static const struct user_regset aarch64_regsets[] = { .n = sizeof(struct user_pt_regs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), - .get = gpr_get, + .regset_get = gpr_get, .set = gpr_set }, [REGSET_FPR] = { @@ -1134,15 +1459,16 @@ static const struct user_regset aarch64_regsets[] = { */ .size = sizeof(u32), .align = sizeof(u32), - .get = fpr_get, + .active = fpr_active, + .regset_get = fpr_get, .set = fpr_set }, [REGSET_TLS] = { .core_note_type = NT_ARM_TLS, - .n = 1, + .n = 2, .size = sizeof(void *), .align = sizeof(void *), - .get = tls_get, + .regset_get = tls_get, .set = tls_set, }, #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -1151,7 +1477,7 @@ static const struct user_regset aarch64_regsets[] = { .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, [REGSET_HW_WATCH] = { @@ -1159,7 +1485,7 @@ static const struct user_regset aarch64_regsets[] = { .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, #endif @@ -1168,7 +1494,7 @@ static const struct user_regset aarch64_regsets[] = { .n = 1, .size = sizeof(int), .align = sizeof(int), - .get = system_call_get, + .regset_get = system_call_get, .set = system_call_set, }, #ifdef CONFIG_ARM64_SVE @@ -1178,9 +1504,43 @@ static const struct user_regset aarch64_regsets[] = { SVE_VQ_BYTES), .size = SVE_VQ_BYTES, .align = SVE_VQ_BYTES, - .get = sve_get, + .regset_get = sve_get, .set = sve_set, - .get_size = sve_get_size, + }, +#endif +#ifdef CONFIG_ARM64_SME + [REGSET_SSVE] = { /* Streaming mode SVE */ + .core_note_type = NT_ARM_SSVE, + .n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE), + SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = ssve_get, + .set = ssve_set, + }, + [REGSET_ZA] = { /* SME ZA */ + .core_note_type = NT_ARM_ZA, + /* + * ZA is a single register but it's variably sized and + * the ptrace core requires that the size of any data + * be an exact multiple of the configured register + * size so report as though we had SVE_VQ_BYTES + * registers. These values aren't exposed to + * userspace. + */ + .n = DIV_ROUND_UP(ZA_PT_SIZE(SME_VQ_MAX), SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = za_get, + .set = za_set, + }, + [REGSET_ZT] = { /* SME ZT */ + .core_note_type = NT_ARM_ZT, + .n = 1, + .size = ZT_SIG_REG_BYTES, + .align = sizeof(u64), + .regset_get = zt_get, + .set = zt_set, }, #endif #ifdef CONFIG_ARM64_PTR_AUTH @@ -1189,16 +1549,24 @@ static const struct user_regset aarch64_regsets[] = { .n = sizeof(struct user_pac_mask) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), - .get = pac_mask_get, + .regset_get = pac_mask_get, /* this cannot be set dynamically */ }, + [REGSET_PAC_ENABLED_KEYS] = { + .core_note_type = NT_ARM_PAC_ENABLED_KEYS, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = pac_enabled_keys_get, + .set = pac_enabled_keys_set, + }, #ifdef CONFIG_CHECKPOINT_RESTORE [REGSET_PACA_KEYS] = { .core_note_type = NT_ARM_PACA_KEYS, .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), - .get = pac_address_keys_get, + .regset_get = pac_address_keys_get, .set = pac_address_keys_set, }, [REGSET_PACG_KEYS] = { @@ -1206,11 +1574,21 @@ static const struct user_regset aarch64_regsets[] = { .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), - .get = pac_generic_keys_get, + .regset_get = pac_generic_keys_get, .set = pac_generic_keys_set, }, #endif #endif +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI + [REGSET_TAGGED_ADDR_CTRL] = { + .core_note_type = NT_ARM_TAGGED_ADDR_CTRL, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = tagged_addr_ctrl_get, + .set = tagged_addr_ctrl_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { @@ -1224,57 +1602,31 @@ enum compat_regset { REGSET_COMPAT_VFP, }; -static int compat_gpr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) +static inline compat_ulong_t compat_get_user_reg(struct task_struct *task, int idx) { - int ret = 0; - unsigned int i, start, num_regs; - - /* Calculate the number of AArch32 registers contained in count */ - num_regs = count / regset->size; - - /* Convert pos into an register number */ - start = pos / regset->size; - - if (start + num_regs > regset->n) - return -EIO; + struct pt_regs *regs = task_pt_regs(task); - for (i = 0; i < num_regs; ++i) { - unsigned int idx = start + i; - compat_ulong_t reg; - - switch (idx) { - case 15: - reg = task_pt_regs(target)->pc; - break; - case 16: - reg = task_pt_regs(target)->pstate; - reg = pstate_to_compat_psr(reg); - break; - case 17: - reg = task_pt_regs(target)->orig_x0; - break; - default: - reg = task_pt_regs(target)->regs[idx]; - } - - if (kbuf) { - memcpy(kbuf, ®, sizeof(reg)); - kbuf += sizeof(reg); - } else { - ret = copy_to_user(ubuf, ®, sizeof(reg)); - if (ret) { - ret = -EFAULT; - break; - } - - ubuf += sizeof(reg); - } + switch (idx) { + case 15: + return regs->pc; + case 16: + return pstate_to_compat_psr(regs->pstate); + case 17: + return regs->orig_x0; + default: + return regs->regs[idx]; } +} - return ret; +static int compat_gpr_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + int i = 0; + + while (to.left) + membuf_store(&to, compat_get_user_reg(target, i++)); + return 0; } static int compat_gpr_set(struct task_struct *target, @@ -1341,12 +1693,13 @@ static int compat_gpr_set(struct task_struct *target, static int compat_vfp_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct user_fpsimd_state *uregs; compat_ulong_t fpscr; - int ret, vregs_end_pos; + + if (!system_supports_fpsimd()) + return -EINVAL; uregs = &target->thread.uw.fpsimd_state; @@ -1357,19 +1710,10 @@ static int compat_vfp_get(struct task_struct *target, * The VFP registers are packed into the fpsimd_state, so they all sit * nicely together for us. We just need to create the fpscr separately. */ - vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t); - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, - 0, vregs_end_pos); - - if (count && !ret) { - fpscr = (uregs->fpsr & VFP_FPSCR_STAT_MASK) | - (uregs->fpcr & VFP_FPSCR_CTRL_MASK); - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpscr, - vregs_end_pos, VFP_STATE_SIZE); - } - - return ret; + membuf_write(&to, uregs, VFP_STATE_SIZE - sizeof(compat_ulong_t)); + fpscr = (uregs->fpsr & VFP_FPSCR_STAT_MASK) | + (uregs->fpcr & VFP_FPSCR_CTRL_MASK); + return membuf_store(&to, fpscr); } static int compat_vfp_set(struct task_struct *target, @@ -1381,6 +1725,9 @@ static int compat_vfp_set(struct task_struct *target, compat_ulong_t fpscr; int ret, vregs_end_pos; + if (!system_supports_fpsimd()) + return -EINVAL; + uregs = &target->thread.uw.fpsimd_state; vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t); @@ -1401,11 +1748,10 @@ static int compat_vfp_set(struct task_struct *target, } static int compat_tls_get(struct task_struct *target, - const struct user_regset *regset, unsigned int pos, - unsigned int count, void *kbuf, void __user *ubuf) + const struct user_regset *regset, + struct membuf to) { - compat_ulong_t tls = (compat_ulong_t)target->thread.uw.tp_value; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &tls, 0, -1); + return membuf_store(&to, (compat_ulong_t)target->thread.uw.tp_value); } static int compat_tls_set(struct task_struct *target, @@ -1430,7 +1776,7 @@ static const struct user_regset aarch32_regsets[] = { .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), - .get = compat_gpr_get, + .regset_get = compat_gpr_get, .set = compat_gpr_set }, [REGSET_COMPAT_VFP] = { @@ -1438,7 +1784,8 @@ static const struct user_regset aarch32_regsets[] = { .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), - .get = compat_vfp_get, + .active = fpr_active, + .regset_get = compat_vfp_get, .set = compat_vfp_set }, }; @@ -1454,7 +1801,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), - .get = compat_gpr_get, + .regset_get = compat_gpr_get, .set = compat_gpr_set }, [REGSET_FPR] = { @@ -1462,7 +1809,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), - .get = compat_vfp_get, + .regset_get = compat_vfp_get, .set = compat_vfp_set }, [REGSET_TLS] = { @@ -1470,7 +1817,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = 1, .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), - .get = compat_tls_get, + .regset_get = compat_tls_get, .set = compat_tls_set, }, #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -1479,7 +1826,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, [REGSET_HW_WATCH] = { @@ -1487,7 +1834,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, #endif @@ -1496,7 +1843,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .n = 1, .size = sizeof(int), .align = sizeof(int), - .get = system_call_get, + .regset_get = system_call_get, .set = system_call_set, }, }; @@ -1521,9 +1868,7 @@ static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, else if (off == COMPAT_PT_TEXT_END_ADDR) tmp = tsk->mm->end_code; else if (off < sizeof(compat_elf_gregset_t)) - return copy_regset_to_user(tsk, &user_aarch32_view, - REGSET_COMPAT_GPR, off, - sizeof(compat_ulong_t), ret); + tmp = compat_get_user_reg(tsk, off >> 2); else if (off >= COMPAT_USER_SZ) return -EIO; else @@ -1535,8 +1880,8 @@ static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t val) { - int ret; - mm_segment_t old_fs = get_fs(); + struct pt_regs newregs = *task_pt_regs(tsk); + unsigned int idx = off / 4; if (off & 3 || off >= COMPAT_USER_SZ) return -EIO; @@ -1544,14 +1889,25 @@ static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off, if (off >= sizeof(compat_elf_gregset_t)) return 0; - set_fs(KERNEL_DS); - ret = copy_regset_from_user(tsk, &user_aarch32_view, - REGSET_COMPAT_GPR, off, - sizeof(compat_ulong_t), - &val); - set_fs(old_fs); + switch (idx) { + case 15: + newregs.pc = val; + break; + case 16: + newregs.pstate = compat_psr_to_pstate(val); + break; + case 17: + newregs.orig_x0 = val; + break; + default: + newregs.regs[idx] = val; + } - return ret; + if (!valid_user_regs(&newregs.user_regs, tsk)) + return -EINVAL; + + *task_pt_regs(tsk) = newregs; + return 0; } #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -1776,6 +2132,12 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { + switch (request) { + case PTRACE_PEEKMTETAGS: + case PTRACE_POKEMTETAGS: + return mte_ptrace_copy_tags(child, request, addr, data); + } + return ptrace_request(child, request, addr, data); } @@ -1784,40 +2146,63 @@ enum ptrace_syscall_dir { PTRACE_SYSCALL_EXIT, }; -static void tracehook_report_syscall(struct pt_regs *regs, - enum ptrace_syscall_dir dir) +static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) { int regno; unsigned long saved_reg; /* - * A scratch register (ip(r12) on AArch32, x7 on AArch64) is - * used to denote syscall entry/exit: + * We have some ABI weirdness here in the way that we handle syscall + * exit stops because we indicate whether or not the stop has been + * signalled from syscall entry or syscall exit by clobbering a general + * purpose register (ip/r12 for AArch32, x7 for AArch64) in the tracee + * and restoring its old value after the stop. This means that: + * + * - Any writes by the tracer to this register during the stop are + * ignored/discarded. + * + * - The actual value of the register is not available during the stop, + * so the tracer cannot save it and restore it later. + * + * - Syscall stops behave differently to seccomp and pseudo-step traps + * (the latter do not nobble any registers). */ regno = (is_compat_task() ? 12 : 7); saved_reg = regs->regs[regno]; regs->regs[regno] = dir; - if (dir == PTRACE_SYSCALL_EXIT) - tracehook_report_syscall_exit(regs, 0); - else if (tracehook_report_syscall_entry(regs)) - forget_syscall(regs); + if (dir == PTRACE_SYSCALL_ENTER) { + if (ptrace_report_syscall_entry(regs)) + forget_syscall(regs); + regs->regs[regno] = saved_reg; + } else if (!test_thread_flag(TIF_SINGLESTEP)) { + ptrace_report_syscall_exit(regs, 0); + regs->regs[regno] = saved_reg; + } else { + regs->regs[regno] = saved_reg; - regs->regs[regno] = saved_reg; + /* + * Signal a pseudo-step exception since we are stepping but + * tracer modifications to the registers may have rewound the + * state machine. + */ + ptrace_report_syscall_exit(regs, 1); + } } int syscall_trace_enter(struct pt_regs *regs) { - if (test_thread_flag(TIF_SYSCALL_TRACE) || - test_thread_flag(TIF_SYSCALL_EMU)) { - tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); - if (!in_syscall(regs) || test_thread_flag(TIF_SYSCALL_EMU)) - return -1; + unsigned long flags = read_thread_flags(); + + if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) { + report_syscall(regs, PTRACE_SYSCALL_ENTER); + if (flags & _TIF_SYSCALL_EMU) + return NO_SYSCALL; } /* Do the secure computing after ptrace; failures should be fast. */ if (secure_computing() == -1) - return -1; + return NO_SYSCALL; if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, regs->syscallno); @@ -1830,13 +2215,15 @@ int syscall_trace_enter(struct pt_regs *regs) void syscall_trace_exit(struct pt_regs *regs) { + unsigned long flags = read_thread_flags(); + audit_syscall_exit(regs); - if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) - trace_sys_exit(regs, regs_return_value(regs)); + if (flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); + if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP)) + report_syscall(regs, PTRACE_SYSCALL_EXIT); rseq_syscall(regs); } @@ -1852,8 +2239,8 @@ void syscall_trace_exit(struct pt_regs *regs) * We also reserve IL for the kernel; SS is handled dynamically. */ #define SPSR_EL1_AARCH64_RES0_BITS \ - (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \ - GENMASK_ULL(20, 13) | GENMASK_ULL(11, 10) | GENMASK_ULL(5, 5)) + (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 26) | GENMASK_ULL(23, 22) | \ + GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5)) #define SPSR_EL1_AARCH32_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20)) @@ -1913,8 +2300,8 @@ static int valid_native_regs(struct user_pt_regs *regs) */ int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task) { - if (!test_tsk_thread_flag(task, TIF_SINGLESTEP)) - regs->pstate &= ~DBG_SPSR_SS; + /* https://lore.kernel.org/lkml/20191118131525.GA4180@willie-the-truck */ + user_regs_reset_single_step(regs, task); if (is_compat_thread(task_thread_info(task))) return valid_compat_regs(regs); diff --git a/arch/arm64/kernel/reloc_test_core.c b/arch/arm64/kernel/reloc_test_core.c index e87a2b7f20f6..99f2ffe9fc05 100644 --- a/arch/arm64/kernel/reloc_test_core.c +++ b/arch/arm64/kernel/reloc_test_core.c @@ -48,7 +48,7 @@ static struct { { "R_AARCH64_PREL16", relative_data16, (u64)&sym64_rel }, }; -static int reloc_test_init(void) +static int __init reloc_test_init(void) { int i; @@ -67,7 +67,7 @@ static int reloc_test_init(void) return 0; } -static void reloc_test_exit(void) +static void __exit reloc_test_exit(void) { } diff --git a/arch/arm64/kernel/reloc_test_syms.S b/arch/arm64/kernel/reloc_test_syms.S index 16a34f188f26..c50f45fa29fa 100644 --- a/arch/arm64/kernel/reloc_test_syms.S +++ b/arch/arm64/kernel/reloc_test_syms.S @@ -5,81 +5,81 @@ #include <linux/linkage.h> -ENTRY(absolute_data64) +SYM_FUNC_START(absolute_data64) ldr x0, 0f ret 0: .quad sym64_abs -ENDPROC(absolute_data64) +SYM_FUNC_END(absolute_data64) -ENTRY(absolute_data32) +SYM_FUNC_START(absolute_data32) ldr w0, 0f ret 0: .long sym32_abs -ENDPROC(absolute_data32) +SYM_FUNC_END(absolute_data32) -ENTRY(absolute_data16) +SYM_FUNC_START(absolute_data16) adr x0, 0f ldrh w0, [x0] ret 0: .short sym16_abs, 0 -ENDPROC(absolute_data16) +SYM_FUNC_END(absolute_data16) -ENTRY(signed_movw) +SYM_FUNC_START(signed_movw) movz x0, #:abs_g2_s:sym64_abs movk x0, #:abs_g1_nc:sym64_abs movk x0, #:abs_g0_nc:sym64_abs ret -ENDPROC(signed_movw) +SYM_FUNC_END(signed_movw) -ENTRY(unsigned_movw) +SYM_FUNC_START(unsigned_movw) movz x0, #:abs_g3:sym64_abs movk x0, #:abs_g2_nc:sym64_abs movk x0, #:abs_g1_nc:sym64_abs movk x0, #:abs_g0_nc:sym64_abs ret -ENDPROC(unsigned_movw) +SYM_FUNC_END(unsigned_movw) .align 12 .space 0xff8 -ENTRY(relative_adrp) +SYM_FUNC_START(relative_adrp) adrp x0, sym64_rel add x0, x0, #:lo12:sym64_rel ret -ENDPROC(relative_adrp) +SYM_FUNC_END(relative_adrp) .align 12 .space 0xffc -ENTRY(relative_adrp_far) +SYM_FUNC_START(relative_adrp_far) adrp x0, memstart_addr add x0, x0, #:lo12:memstart_addr ret -ENDPROC(relative_adrp_far) +SYM_FUNC_END(relative_adrp_far) -ENTRY(relative_adr) +SYM_FUNC_START(relative_adr) adr x0, sym64_rel ret -ENDPROC(relative_adr) +SYM_FUNC_END(relative_adr) -ENTRY(relative_data64) +SYM_FUNC_START(relative_data64) adr x1, 0f ldr x0, [x1] add x0, x0, x1 ret 0: .quad sym64_rel - . -ENDPROC(relative_data64) +SYM_FUNC_END(relative_data64) -ENTRY(relative_data32) +SYM_FUNC_START(relative_data32) adr x1, 0f ldr w0, [x1] add x0, x0, x1 ret 0: .long sym64_rel - . -ENDPROC(relative_data32) +SYM_FUNC_END(relative_data32) -ENTRY(relative_data16) +SYM_FUNC_START(relative_data16) adr x1, 0f ldrsh w0, [x1] add x0, x0, x1 ret 0: .short sym64_rel - ., 0 -ENDPROC(relative_data16) +SYM_FUNC_END(relative_data16) diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index c1d7db71a726..413f899e4ac6 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -4,6 +4,8 @@ * * Copyright (C) Linaro. * Copyright (C) Huawei Futurewei Technologies. + * Copyright (C) 2021, Microsoft Corporation. + * Pasha Tatashin <pasha.tatashin@soleen.com> */ #include <linux/kexec.h> @@ -13,117 +15,86 @@ #include <asm/kexec.h> #include <asm/page.h> #include <asm/sysreg.h> +#include <asm/virt.h> +.macro turn_off_mmu tmp1, tmp2 + mov_q \tmp1, INIT_SCTLR_EL1_MMU_OFF + pre_disable_mmu_workaround + msr sctlr_el1, \tmp1 + isb +.endm + +.section ".kexec_relocate.text", "ax" /* * arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it. * - * The memory that the old kernel occupies may be overwritten when coping the + * The memory that the old kernel occupies may be overwritten when copying the * new image to its final location. To assure that the * arm64_relocate_new_kernel routine which does that copy is not overwritten, * all code and data needed by arm64_relocate_new_kernel must be between the * symbols arm64_relocate_new_kernel and arm64_relocate_new_kernel_end. The * machine_kexec() routine will copy arm64_relocate_new_kernel to the kexec - * control_code_page, a special page which has been set up to be preserved - * during the copy operation. + * safe memory that has been set up to be preserved during the copy operation. */ -ENTRY(arm64_relocate_new_kernel) +SYM_CODE_START(arm64_relocate_new_kernel) + /* + * The kimage structure isn't allocated specially and may be clobbered + * during relocation. We must load any values we need from it prior to + * any relocation occurring. + */ + ldr x28, [x0, #KIMAGE_START] + ldr x27, [x0, #KIMAGE_ARCH_EL2_VECTORS] + ldr x26, [x0, #KIMAGE_ARCH_DTB_MEM] /* Setup the list loop variables. */ - mov x18, x2 /* x18 = dtb address */ - mov x17, x1 /* x17 = kimage_start */ - mov x16, x0 /* x16 = kimage_head */ - raw_dcache_line_size x15, x0 /* x15 = dcache line size */ - mov x14, xzr /* x14 = entry ptr */ - mov x13, xzr /* x13 = copy dest */ - - /* Clear the sctlr_el2 flags. */ - mrs x0, CurrentEL - cmp x0, #CurrentEL_EL2 - b.ne 1f - mrs x0, sctlr_el2 - ldr x1, =SCTLR_ELx_FLAGS - bic x0, x0, x1 - pre_disable_mmu_workaround - msr sctlr_el2, x0 - isb -1: - - /* Check if the new image needs relocation. */ - tbnz x16, IND_DONE_BIT, .Ldone - + ldr x18, [x0, #KIMAGE_ARCH_ZERO_PAGE] /* x18 = zero page for BBM */ + ldr x17, [x0, #KIMAGE_ARCH_TTBR1] /* x17 = linear map copy */ + ldr x16, [x0, #KIMAGE_HEAD] /* x16 = kimage_head */ + ldr x22, [x0, #KIMAGE_ARCH_PHYS_OFFSET] /* x22 phys_offset */ + raw_dcache_line_size x15, x1 /* x15 = dcache line size */ + break_before_make_ttbr_switch x18, x17, x1, x2 /* set linear map */ .Lloop: and x12, x16, PAGE_MASK /* x12 = addr */ - + sub x12, x12, x22 /* Convert x12 to virt */ /* Test the entry flags. */ .Ltest_source: tbz x16, IND_SOURCE_BIT, .Ltest_indirection /* Invalidate dest page to PoC. */ - mov x0, x13 - add x20, x0, #PAGE_SIZE - sub x1, x15, #1 - bic x0, x0, x1 -2: dc ivac, x0 - add x0, x0, x15 - cmp x0, x20 - b.lo 2b - dsb sy - - mov x20, x13 - mov x21, x12 - copy_page x20, x21, x0, x1, x2, x3, x4, x5, x6, x7 - - /* dest += PAGE_SIZE */ - add x13, x13, PAGE_SIZE + mov x19, x13 + copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8 + add x1, x19, #PAGE_SIZE + dcache_by_myline_op civac, sy, x19, x1, x15, x20 b .Lnext - .Ltest_indirection: tbz x16, IND_INDIRECTION_BIT, .Ltest_destination - - /* ptr = addr */ - mov x14, x12 + mov x14, x12 /* ptr = addr */ b .Lnext - .Ltest_destination: tbz x16, IND_DESTINATION_BIT, .Lnext - - /* dest = addr */ - mov x13, x12 - + mov x13, x12 /* dest = addr */ .Lnext: - /* entry = *ptr++ */ - ldr x16, [x14], #8 - - /* while (!(entry & DONE)) */ - tbz x16, IND_DONE_BIT, .Lloop - -.Ldone: + ldr x16, [x14], #8 /* entry = *ptr++ */ + tbz x16, IND_DONE_BIT, .Lloop /* while (!(entry & DONE)) */ /* wait for writes from copy_page to finish */ dsb nsh ic iallu dsb nsh isb + turn_off_mmu x12, x13 /* Start new image. */ - mov x0, x18 + cbz x27, .Lel1 + mov x1, x28 /* kernel entry point */ + mov x2, x26 /* dtb address */ + mov x3, xzr + mov x4, xzr + mov x0, #HVC_SOFT_RESTART + hvc #0 /* Jumps from el2 */ +.Lel1: + mov x0, x26 /* dtb address */ mov x1, xzr mov x2, xzr mov x3, xzr - br x17 - -ENDPROC(arm64_relocate_new_kernel) - -.ltorg - -.align 3 /* To keep the 64-bit values below naturally aligned. */ - -.Lcopy_end: -.org KEXEC_CONTROL_PAGE_SIZE - -/* - * arm64_relocate_new_kernel_size - Number of bytes to copy to the - * control_code_page. - */ -.globl arm64_relocate_new_kernel_size -arm64_relocate_new_kernel_size: - .quad .Lcopy_end - arm64_relocate_new_kernel + br x28 /* Jumps from el1 */ +SYM_CODE_END(arm64_relocate_new_kernel) diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index a5e8b3b9d798..68330017d04f 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -9,25 +9,25 @@ #include <linux/export.h> #include <linux/ftrace.h> #include <linux/kprobes.h> +#include <linux/stacktrace.h> #include <asm/stack_pointer.h> -#include <asm/stacktrace.h> struct return_address_data { unsigned int level; void *addr; }; -static int save_return_addr(struct stackframe *frame, void *d) +static bool save_return_addr(void *d, unsigned long pc) { struct return_address_data *data = d; if (!data->level) { - data->addr = (void *)frame->pc; - return 1; + data->addr = (void *)pc; + return false; } else { --data->level; - return 0; + return true; } } NOKPROBE_SYMBOL(save_return_addr); @@ -35,15 +35,11 @@ NOKPROBE_SYMBOL(save_return_addr); void *return_address(unsigned int level) { struct return_address_data data; - struct stackframe frame; data.level = level + 2; data.addr = NULL; - start_backtrace(&frame, - (unsigned long)__builtin_frame_address(0), - (unsigned long)return_address); - walk_stackframe(current, &frame, save_return_addr, &data); + arch_stack_walk(save_return_addr, &data, current, NULL); if (!data.level) return data.addr; diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index d6259dac62b6..255d12f881c2 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -7,9 +7,11 @@ #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/sched/task_stack.h> +#include <linux/scs.h> #include <linux/uaccess.h> #include <asm/alternative.h> +#include <asm/exception.h> #include <asm/kprobes.h> #include <asm/mmu.h> #include <asm/ptrace.h> @@ -37,6 +39,17 @@ DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); #endif +DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); +DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); + +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); +DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); +#endif + +DEFINE_PER_CPU(struct sdei_registered_event *, sdei_active_normal_event); +DEFINE_PER_CPU(struct sdei_registered_event *, sdei_active_critical_event); + static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu) { unsigned long *p; @@ -52,6 +65,9 @@ static void free_sdei_stacks(void) { int cpu; + if (!IS_ENABLED(CONFIG_VMAP_STACK)) + return; + for_each_possible_cpu(cpu) { _free_sdei_stack(&sdei_stack_normal_ptr, cpu); _free_sdei_stack(&sdei_stack_critical_ptr, cpu); @@ -75,6 +91,9 @@ static int init_sdei_stacks(void) int cpu; int err = 0; + if (!IS_ENABLED(CONFIG_VMAP_STACK)) + return 0; + for_each_possible_cpu(cpu) { err = _init_sdei_stack(&sdei_stack_normal_ptr, cpu); if (err) @@ -90,58 +109,60 @@ static int init_sdei_stacks(void) return err; } -static bool on_sdei_normal_stack(unsigned long sp, struct stack_info *info) +static void _free_sdei_scs(unsigned long * __percpu *ptr, int cpu) { - unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); - unsigned long high = low + SDEI_STACK_SIZE; - - if (!low) - return false; + void *s; - if (sp < low || sp >= high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_SDEI_NORMAL; + s = per_cpu(*ptr, cpu); + if (s) { + per_cpu(*ptr, cpu) = NULL; + scs_free(s); } - - return true; } -static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info) +static void free_sdei_scs(void) { - unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr); - unsigned long high = low + SDEI_STACK_SIZE; + int cpu; - if (!low) - return false; + for_each_possible_cpu(cpu) { + _free_sdei_scs(&sdei_shadow_call_stack_normal_ptr, cpu); + _free_sdei_scs(&sdei_shadow_call_stack_critical_ptr, cpu); + } +} - if (sp < low || sp >= high) - return false; +static int _init_sdei_scs(unsigned long * __percpu *ptr, int cpu) +{ + void *s; - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_SDEI_CRITICAL; - } + s = scs_alloc(cpu_to_node(cpu)); + if (!s) + return -ENOMEM; + per_cpu(*ptr, cpu) = s; - return true; + return 0; } -bool _on_sdei_stack(unsigned long sp, struct stack_info *info) +static int init_sdei_scs(void) { - if (!IS_ENABLED(CONFIG_VMAP_STACK)) - return false; + int cpu; + int err = 0; + + if (!scs_is_enabled()) + return 0; - if (on_sdei_critical_stack(sp, info)) - return true; + for_each_possible_cpu(cpu) { + err = _init_sdei_scs(&sdei_shadow_call_stack_normal_ptr, cpu); + if (err) + break; + err = _init_sdei_scs(&sdei_shadow_call_stack_critical_ptr, cpu); + if (err) + break; + } - if (on_sdei_normal_stack(sp, info)) - return true; + if (err) + free_sdei_scs(); - return false; + return err; } unsigned long sdei_arch_get_entry_point(int conduit) @@ -152,15 +173,16 @@ unsigned long sdei_arch_get_entry_point(int conduit) * dropped to EL1 because we don't support VHE, then we can't support * SDEI. */ - if (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) { + if (is_hyp_nvhe()) { pr_err("Not supported on this hardware/boot configuration\n"); - return 0; + goto out_err; } - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - if (init_sdei_stacks()) - return 0; - } + if (init_sdei_stacks()) + goto out_err; + + if (init_sdei_scs()) + goto out_err_free_stacks; sdei_exit_mode = (conduit == SMCCC_CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC; @@ -175,16 +197,20 @@ unsigned long sdei_arch_get_entry_point(int conduit) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ return (unsigned long)__sdei_asm_handler; +out_err_free_stacks: + free_sdei_stacks(); +out_err: + return 0; } /* - * __sdei_handler() returns one of: + * do_sdei_event() returns one of: * SDEI_EV_HANDLED - success, return to the interrupted context. * SDEI_EV_FAILED - failure, return this error code to firmare. * virtual-address - success, return to this address. */ -static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, - struct sdei_registered_event *arg) +unsigned long __kprobes do_sdei_event(struct pt_regs *regs, + struct sdei_registered_event *arg) { u32 mode; int i, err = 0; @@ -202,12 +228,6 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, sdei_api_event_context(i, ®s->regs[i]); } - /* - * We didn't take an exception to get here, set PAN. UAO will be cleared - * by sdei_event_handler()s set_fs(USER_DS) call. - */ - __uaccess_enable_hw_pan(); - err = sdei_event_handler(regs, arg); if (err) return SDEI_EV_FAILED; @@ -245,28 +265,3 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, return vbar + 0x480; } - - -asmlinkage __kprobes notrace unsigned long -__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) -{ - unsigned long ret; - bool do_nmi_exit = false; - - /* - * nmi_enter() deals with printk() re-entrance and use of RCU when - * RCU believed this CPU was idle. Because critical events can - * interrupt normal events, we may already be in_nmi(). - */ - if (!in_nmi()) { - nmi_enter(); - do_nmi_exit = true; - } - - ret = _sdei_handler(regs, arg); - - if (do_nmi_exit) - nmi_exit(); - - return ret; -} diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 56f664561754..42c690bb2d60 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -23,12 +23,14 @@ #include <linux/interrupt.h> #include <linux/smp.h> #include <linux/fs.h> +#include <linux/panic_notifier.h> #include <linux/proc_fs.h> #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/efi.h> #include <linux/psci.h> #include <linux/sched/task.h> +#include <linux/scs.h> #include <linux/mm.h> #include <asm/acpi.h> @@ -41,6 +43,7 @@ #include <asm/cpu_ops.h> #include <asm/kasan.h> #include <asm/numa.h> +#include <asm/scs.h> #include <asm/sections.h> #include <asm/setup.h> #include <asm/smp_plat.h> @@ -55,6 +58,7 @@ static int num_standard_resources; static struct resource *standard_resources; phys_addr_t __fdt_pointer __initdata; +u64 mmu_enabled_at_boot __initdata; /* * Standard memory resources @@ -85,14 +89,8 @@ u64 __cacheline_aligned boot_args[4]; void __init smp_setup_processor_id(void) { u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; - cpu_logical_map(0) = mpidr; + set_cpu_logical_map(0, mpidr); - /* - * clear __my_cpu_offset on boot CPU to avoid hang caused by - * using percpu variable early, for example, lockdep will - * access percpu variable inside lock_release - */ - set_my_cpu_offset(0); pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n", (unsigned long)mpidr, read_cpuid_id()); } @@ -168,6 +166,21 @@ static void __init smp_build_mpidr_hash(void) pr_warn("Large number of MPIDR hash buckets detected\n"); } +static void *early_fdt_ptr __initdata; + +void __init *get_early_fdt_ptr(void) +{ + return early_fdt_ptr; +} + +asmlinkage void __init early_fdt_map(u64 dt_phys) +{ + int fdt_size; + + early_fixmap_init(); + early_fdt_ptr = fixmap_remap_fdt(dt_phys, &fdt_size, PAGE_KERNEL); +} + static void __init setup_machine_fdt(phys_addr_t dt_phys) { int size; @@ -179,11 +192,16 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys) if (!dt_virt || !early_init_dt_scan(dt_virt)) { pr_crit("\n" - "Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n" + "Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n" "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n" "\nPlease check your bootloader.", &dt_phys, dt_virt); + /* + * Note that in this _really_ early stage we cannot even BUG() + * or oops, so the least terrible thing to do is cpu_relax(), + * or else we could end-up printing non-initialized data, etc. + */ while (true) cpu_relax(); } @@ -206,10 +224,12 @@ static void __init request_standard_resources(void) unsigned long i = 0; size_t res_size; - kernel_code.start = __pa_symbol(_text); + kernel_code.start = __pa_symbol(_stext); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); + insert_resource(&iomem_resource, &kernel_code); + insert_resource(&iomem_resource, &kernel_data); num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); @@ -217,32 +237,21 @@ static void __init request_standard_resources(void) if (!standard_resources) panic("%s: Failed to allocate %zu bytes\n", __func__, res_size); - for_each_memblock(memory, region) { + for_each_mem_region(region) { res = &standard_resources[i++]; if (memblock_is_nomap(region)) { res->name = "reserved"; res->flags = IORESOURCE_MEM; + res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region)); + res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1; } else { res->name = "System RAM"; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); + res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; } - res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); - res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; - - request_resource(&iomem_resource, res); - - if (kernel_code.start >= res->start && - kernel_code.end <= res->end) - request_resource(res, &kernel_code); - if (kernel_data.start >= res->start && - kernel_data.end <= res->end) - request_resource(res, &kernel_data); -#ifdef CONFIG_KEXEC_CORE - /* Userspace will find "Crash kernel" region in /proc/iomem. */ - if (crashk_res.end && crashk_res.start >= res->start && - crashk_res.end <= res->end) - request_resource(res, &crashk_res); -#endif + + insert_resource(&iomem_resource, res); } } @@ -257,7 +266,7 @@ static int __init reserve_memblock_reserved_regions(void) if (!memblock_is_region_reserved(mem->start, mem_size)) continue; - for_each_reserved_mem_region(j, &r_start, &r_end) { + for_each_reserved_mem_range(j, &r_start, &r_end) { resource_size_t start, end; start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start); @@ -276,15 +285,26 @@ arch_initcall(reserve_memblock_reserved_regions); u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; -void __init setup_arch(char **cmdline_p) +u64 cpu_logical_map(unsigned int cpu) +{ + return __cpu_logical_map[cpu]; +} + +void __init __no_sanitize_address setup_arch(char **cmdline_p) { - init_mm.start_code = (unsigned long) _text; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = (unsigned long) _end; + setup_initial_init_mm(_stext, _etext, _edata, _end); *cmdline_p = boot_command_line; + kaslr_init(); + + /* + * If know now we are going to need KPTI then use non-global + * mappings from the start, avoiding the cost of rewriting + * everything later. + */ + arm64_use_ng_mappings = kaslr_requires_kpti(); + early_fixmap_init(); early_ioremap_init(); @@ -297,6 +317,8 @@ void __init setup_arch(char **cmdline_p) jump_label_init(); parse_early_param(); + dynamic_scs_init(); + /* * Unmask asynchronous aborts and fiq after bringing up possible * earlycon. (Report possible System Errors once we can report this @@ -312,6 +334,14 @@ void __init setup_arch(char **cmdline_p) xen_early_init(); efi_init(); + + if (!efi_enabled(EFI_BOOT)) { + if ((u64)_text % MIN_KIMG_ALIGN) + pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!"); + WARN_TAINT(mmu_enabled_at_boot, TAINT_FIRMWARE_WORKAROUND, + FW_BUG "Booted with MMU enabled!"); + } + arm64_memblock_init(); paging_init(); @@ -337,12 +367,12 @@ void __init setup_arch(char **cmdline_p) else psci_acpi_init(); - cpu_read_bootcpu_ops(); + init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); /* Init percpu seeds for random tags after cpus are set up. */ - kasan_init_tags(); + kasan_init_sw_tags(); #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* @@ -350,12 +380,9 @@ void __init setup_arch(char **cmdline_p) * faults in case uaccess_enable() is inadvertently called by the init * thread. */ - init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page); + init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); #endif -#ifdef CONFIG_VT - conswitchp = &dummy_con; -#endif if (boot_args[1] || boot_args[2] || boot_args[3]) { pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n" "\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n" @@ -367,34 +394,20 @@ void __init setup_arch(char **cmdline_p) static inline bool cpu_can_disable(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU - if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_can_disable) - return cpu_ops[cpu]->cpu_can_disable(cpu); + const struct cpu_operations *ops = get_cpu_ops(cpu); + + if (ops && ops->cpu_can_disable) + return ops->cpu_can_disable(cpu); #endif return false; } -static int __init topology_init(void) +bool arch_cpu_is_hotpluggable(int num) { - int i; - - for_each_online_node(i) - register_one_node(i); - - for_each_possible_cpu(i) { - struct cpu *cpu = &per_cpu(cpu_data.cpu, i); - cpu->hotpluggable = cpu_can_disable(i); - register_cpu(cpu, i); - } - - return 0; + return cpu_can_disable(num); } -subsys_initcall(topology_init); -/* - * Dump out kernel offset information on panic. - */ -static int dump_kernel_offset(struct notifier_block *self, unsigned long v, - void *p) +static void dump_kernel_offset(void) { const unsigned long offset = kaslr_offset(); @@ -405,17 +418,33 @@ static int dump_kernel_offset(struct notifier_block *self, unsigned long v, } else { pr_emerg("Kernel Offset: disabled\n"); } +} + +static int arm64_panic_block_dump(struct notifier_block *self, + unsigned long v, void *p) +{ + dump_kernel_offset(); + dump_cpu_features(); + dump_mem_limit(); return 0; } -static struct notifier_block kernel_offset_notifier = { - .notifier_call = dump_kernel_offset +static struct notifier_block arm64_panic_block = { + .notifier_call = arm64_panic_block_dump }; -static int __init register_kernel_offset_dumper(void) +static int __init register_arm64_panic_block(void) { atomic_notifier_chain_register(&panic_notifier_list, - &kernel_offset_notifier); + &arm64_panic_block); + return 0; +} +device_initcall(register_arm64_panic_block); + +static int __init check_mmu_enabled_at_boot(void) +{ + if (!efi_enabled(EFI_BOOT) && mmu_enabled_at_boot) + panic("Non-EFI boot detected with MMU and caches enabled"); return 0; } -__initcall(register_kernel_offset_dumper); +device_initcall_sync(check_mmu_enabled_at_boot); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index dd2cdc0d5be2..0e8beb3349ea 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -11,24 +11,25 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/signal.h> -#include <linux/personality.h> #include <linux/freezer.h> #include <linux/stddef.h> #include <linux/uaccess.h> #include <linux/sizes.h> #include <linux/string.h> -#include <linux/tracehook.h> +#include <linux/resume_user_mode.h> #include <linux/ratelimit.h> #include <linux/syscalls.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/elf.h> +#include <asm/exception.h> #include <asm/cacheflush.h> #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> #include <asm/ptrace.h> +#include <asm/syscall.h> #include <asm/signal32.h> #include <asm/traps.h> #include <asm/vdso.h> @@ -56,6 +57,9 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; unsigned long sve_offset; + unsigned long tpidr2_offset; + unsigned long za_offset; + unsigned long zt_offset; unsigned long extra_offset; unsigned long end_offset; }; @@ -90,7 +94,7 @@ static size_t sigframe_size(struct rt_sigframe_user_layout const *user) * not taken into account. This limit is not a guarantee and is * NOT ABI. */ -#define SIGFRAME_MAXSZ SZ_64K +#define SIGFRAME_MAXSZ SZ_256K static int __sigframe_alloc(struct rt_sigframe_user_layout *user, unsigned long *offset, size_t size, bool extend) @@ -167,6 +171,19 @@ static void __user *apply_user_offset( return base + offset; } +struct user_ctxs { + struct fpsimd_context __user *fpsimd; + u32 fpsimd_size; + struct sve_context __user *sve; + u32 sve_size; + struct tpidr2_context __user *tpidr2; + u32 tpidr2_size; + struct za_context __user *za; + u32 za_size; + struct zt_context __user *zt; + u32 zt_size; +}; + static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) { struct user_fpsimd_state const *fpsimd = @@ -185,27 +202,23 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) return err ? -EFAULT : 0; } -static int restore_fpsimd_context(struct fpsimd_context __user *ctx) +static int restore_fpsimd_context(struct user_ctxs *user) { struct user_fpsimd_state fpsimd; - __u32 magic, size; int err = 0; - /* check the magic/size information */ - __get_user_error(magic, &ctx->head.magic, err); - __get_user_error(size, &ctx->head.size, err); - if (err) - return -EFAULT; - if (magic != FPSIMD_MAGIC || size != sizeof(struct fpsimd_context)) + /* check the size information */ + if (user->fpsimd_size != sizeof(struct fpsimd_context)) return -EINVAL; /* copy the FP and status/control registers */ - err = __copy_from_user(fpsimd.vregs, ctx->vregs, + err = __copy_from_user(fpsimd.vregs, &(user->fpsimd->vregs), sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &ctx->fpsr, err); - __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); + __get_user_error(fpsimd.fpsr, &(user->fpsimd->fpsr), err); + __get_user_error(fpsimd.fpcr, &(user->fpsimd->fpcr), err); clear_thread_flag(TIF_SVE); + current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ if (!err) @@ -215,22 +228,23 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) } -struct user_ctxs { - struct fpsimd_context __user *fpsimd; - struct sve_context __user *sve; -}; - #ifdef CONFIG_ARM64_SVE static int preserve_sve_context(struct sve_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; - unsigned int vl = current->thread.sve_vl; + u16 flags = 0; + unsigned int vl = task_get_sve_vl(current); unsigned int vq = 0; - if (test_thread_flag(TIF_SVE)) + if (thread_sm_enabled(¤t->thread)) { + vl = task_get_sme_vl(current); vq = sve_vq_from_vl(vl); + flags |= SVE_SIG_FLAG_SM; + } else if (test_thread_flag(TIF_SVE)) { + vq = sve_vq_from_vl(vl); + } memset(reserved, 0, sizeof(reserved)); @@ -238,13 +252,15 @@ static int preserve_sve_context(struct sve_context __user *ctx) __put_user_error(round_up(SVE_SIG_CONTEXT_SIZE(vq), 16), &ctx->head.size, err); __put_user_error(vl, &ctx->vl, err); + __put_user_error(flags, &ctx->flags, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { /* * This assumes that the SVE state has already been saved to - * the task struct by calling preserve_fpsimd_context(). + * the task struct by calling the function + * fpsimd_signal_preserve_current_state(). */ err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, @@ -256,25 +272,49 @@ static int preserve_sve_context(struct sve_context __user *ctx) static int restore_sve_fpsimd_context(struct user_ctxs *user) { - int err; - unsigned int vq; + int err = 0; + unsigned int vl, vq; struct user_fpsimd_state fpsimd; - struct sve_context sve; + u16 user_vl, flags; - if (__copy_from_user(&sve, user->sve, sizeof(sve))) - return -EFAULT; + if (user->sve_size < sizeof(*user->sve)) + return -EINVAL; + + __get_user_error(user_vl, &(user->sve->vl), err); + __get_user_error(flags, &(user->sve->flags), err); + if (err) + return err; + + if (flags & SVE_SIG_FLAG_SM) { + if (!system_supports_sme()) + return -EINVAL; - if (sve.vl != current->thread.sve_vl) + vl = task_get_sme_vl(current); + } else { + /* + * A SME only system use SVE for streaming mode so can + * have a SVE formatted context with a zero VL and no + * payload data. + */ + if (!system_supports_sve() && !system_supports_sme()) + return -EINVAL; + + vl = task_get_sve_vl(current); + } + + if (user_vl != vl) return -EINVAL; - if (sve.head.size <= sizeof(*user->sve)) { + if (user->sve_size == sizeof(*user->sve)) { clear_thread_flag(TIF_SVE); + current->thread.svcr &= ~SVCR_SM_MASK; + current->thread.fp_type = FP_STATE_FPSIMD; goto fpsimd_only; } - vq = sve_vq_from_vl(sve.vl); + vq = sve_vq_from_vl(vl); - if (sve.head.size < SVE_SIG_CONTEXT_SIZE(vq)) + if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq)) return -EINVAL; /* @@ -287,7 +327,12 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) fpsimd_flush_task_state(current); /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ - sve_alloc(current); + sve_alloc(current, true); + if (!current->thread.sve_state) { + clear_thread_flag(TIF_SVE); + return -ENOMEM; + } + err = __copy_from_user(current->thread.sve_state, (char __user const *)user->sve + SVE_SIG_REGS_OFFSET, @@ -295,7 +340,11 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (err) return -EFAULT; - set_thread_flag(TIF_SVE); + if (flags & SVE_SIG_FLAG_SM) + current->thread.svcr |= SVCR_SM_MASK; + else + set_thread_flag(TIF_SVE); + current->thread.fp_type = FP_STATE_SVE; fpsimd_only: /* copy the FP and status/control registers */ @@ -314,12 +363,216 @@ fpsimd_only: #else /* ! CONFIG_ARM64_SVE */ -/* Turn any non-optimised out attempts to use these into a link error: */ +static int restore_sve_fpsimd_context(struct user_ctxs *user) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} + +/* Turn any non-optimised out attempts to use this into a link error: */ extern int preserve_sve_context(void __user *ctx); -extern int restore_sve_fpsimd_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int preserve_tpidr2_context(struct tpidr2_context __user *ctx) +{ + int err = 0; + + current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + + __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(current->thread.tpidr2_el0, &ctx->tpidr2, err); + + return err; +} + +static int restore_tpidr2_context(struct user_ctxs *user) +{ + u64 tpidr2_el0; + int err = 0; + + if (user->tpidr2_size != sizeof(*user->tpidr2)) + return -EINVAL; + + __get_user_error(tpidr2_el0, &user->tpidr2->tpidr2, err); + if (!err) + write_sysreg_s(tpidr2_el0, SYS_TPIDR2_EL0); + + return err; +} + +static int preserve_za_context(struct za_context __user *ctx) +{ + int err = 0; + u16 reserved[ARRAY_SIZE(ctx->__reserved)]; + unsigned int vl = task_get_sme_vl(current); + unsigned int vq; + + if (thread_za_enabled(¤t->thread)) + vq = sve_vq_from_vl(vl); + else + vq = 0; + + memset(reserved, 0, sizeof(reserved)); + + __put_user_error(ZA_MAGIC, &ctx->head.magic, err); + __put_user_error(round_up(ZA_SIG_CONTEXT_SIZE(vq), 16), + &ctx->head.size, err); + __put_user_error(vl, &ctx->vl, err); + BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); + err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); + + if (vq) { + /* + * This assumes that the ZA state has already been saved to + * the task struct by calling the function + * fpsimd_signal_preserve_current_state(). + */ + err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, + current->thread.sme_state, + ZA_SIG_REGS_SIZE(vq)); + } + + return err ? -EFAULT : 0; +} + +static int restore_za_context(struct user_ctxs *user) +{ + int err = 0; + unsigned int vq; + u16 user_vl; + + if (user->za_size < sizeof(*user->za)) + return -EINVAL; + + __get_user_error(user_vl, &(user->za->vl), err); + if (err) + return err; + + if (user_vl != task_get_sme_vl(current)) + return -EINVAL; + + if (user->za_size == sizeof(*user->za)) { + current->thread.svcr &= ~SVCR_ZA_MASK; + return 0; + } + + vq = sve_vq_from_vl(user_vl); + + if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq)) + return -EINVAL; + + /* + * Careful: we are about __copy_from_user() directly into + * thread.sme_state with preemption enabled, so protection is + * needed to prevent a racing context switch from writing stale + * registers back over the new data. + */ + + fpsimd_flush_task_state(current); + /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ + + sme_alloc(current, true); + if (!current->thread.sme_state) { + current->thread.svcr &= ~SVCR_ZA_MASK; + clear_thread_flag(TIF_SME); + return -ENOMEM; + } + + err = __copy_from_user(current->thread.sme_state, + (char __user const *)user->za + + ZA_SIG_REGS_OFFSET, + ZA_SIG_REGS_SIZE(vq)); + if (err) + return -EFAULT; + + set_thread_flag(TIF_SME); + current->thread.svcr |= SVCR_ZA_MASK; + + return 0; +} + +static int preserve_zt_context(struct zt_context __user *ctx) +{ + int err = 0; + u16 reserved[ARRAY_SIZE(ctx->__reserved)]; + + if (WARN_ON(!thread_za_enabled(¤t->thread))) + return -EINVAL; + + memset(reserved, 0, sizeof(reserved)); + + __put_user_error(ZT_MAGIC, &ctx->head.magic, err); + __put_user_error(round_up(ZT_SIG_CONTEXT_SIZE(1), 16), + &ctx->head.size, err); + __put_user_error(1, &ctx->nregs, err); + BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); + err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); + + /* + * This assumes that the ZT state has already been saved to + * the task struct by calling the function + * fpsimd_signal_preserve_current_state(). + */ + err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET, + thread_zt_state(¤t->thread), + ZT_SIG_REGS_SIZE(1)); + + return err ? -EFAULT : 0; +} + +static int restore_zt_context(struct user_ctxs *user) +{ + int err; + u16 nregs; + + /* ZA must be restored first for this check to be valid */ + if (!thread_za_enabled(¤t->thread)) + return -EINVAL; + + if (user->zt_size != ZT_SIG_CONTEXT_SIZE(1)) + return -EINVAL; + + if (__copy_from_user(&nregs, &(user->zt->nregs), sizeof(nregs))) + return -EFAULT; + + if (nregs != 1) + return -EINVAL; + + /* + * Careful: we are about __copy_from_user() directly into + * thread.zt_state with preemption enabled, so protection is + * needed to prevent a racing context switch from writing stale + * registers back over the new data. + */ + + fpsimd_flush_task_state(current); + /* From now, fpsimd_thread_switch() won't touch ZT in thread state */ + + err = __copy_from_user(thread_zt_state(¤t->thread), + (char __user const *)user->zt + + ZT_SIG_REGS_OFFSET, + ZT_SIG_REGS_SIZE(1)); + if (err) + return -EFAULT; + + return 0; +} + +#else /* ! CONFIG_ARM64_SME */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_tpidr2_context(void __user *ctx); +extern int restore_tpidr2_context(struct user_ctxs *user); +extern int preserve_za_context(void __user *ctx); +extern int restore_za_context(struct user_ctxs *user); +extern int preserve_zt_context(void __user *ctx); +extern int restore_zt_context(struct user_ctxs *user); + +#endif /* ! CONFIG_ARM64_SME */ static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) @@ -334,6 +587,9 @@ static int parse_user_sigframe(struct user_ctxs *user, user->fpsimd = NULL; user->sve = NULL; + user->tpidr2 = NULL; + user->za = NULL; + user->zt = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -371,13 +627,13 @@ static int parse_user_sigframe(struct user_ctxs *user, goto done; case FPSIMD_MAGIC: - if (user->fpsimd) + if (!system_supports_fpsimd()) goto invalid; - - if (size < sizeof(*user->fpsimd)) + if (user->fpsimd) goto invalid; user->fpsimd = (struct fpsimd_context __user *)head; + user->fpsimd_size = size; break; case ESR_MAGIC: @@ -385,16 +641,47 @@ static int parse_user_sigframe(struct user_ctxs *user, break; case SVE_MAGIC: - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) goto invalid; if (user->sve) goto invalid; - if (size < sizeof(*user->sve)) + user->sve = (struct sve_context __user *)head; + user->sve_size = size; + break; + + case TPIDR2_MAGIC: + if (!system_supports_tpidr2()) goto invalid; - user->sve = (struct sve_context __user *)head; + if (user->tpidr2) + goto invalid; + + user->tpidr2 = (struct tpidr2_context __user *)head; + user->tpidr2_size = size; + break; + + case ZA_MAGIC: + if (!system_supports_sme()) + goto invalid; + + if (user->za) + goto invalid; + + user->za = (struct za_context __user *)head; + user->za_size = size; + break; + + case ZT_MAGIC: + if (!system_supports_sme2()) + goto invalid; + + if (user->zt) + goto invalid; + + user->zt = (struct zt_context __user *)head; + user->zt_size = size; break; case EXTRA_MAGIC: @@ -506,20 +793,25 @@ static int restore_sigframe(struct pt_regs *regs, if (err == 0) err = parse_user_sigframe(&user, sf); - if (err == 0) { + if (err == 0 && system_supports_fpsimd()) { if (!user.fpsimd) return -EINVAL; - if (user.sve) { - if (!system_supports_sve()) - return -EINVAL; - + if (user.sve) err = restore_sve_fpsimd_context(&user); - } else { - err = restore_fpsimd_context(user.fpsimd); - } + else + err = restore_fpsimd_context(&user); } + if (err == 0 && system_supports_tpidr2() && user.tpidr2) + err = restore_tpidr2_context(&user); + + if (err == 0 && system_supports_sme() && user.za) + err = restore_za_context(&user); + + if (err == 0 && system_supports_sme2() && user.zt) + err = restore_zt_context(&user); + return err; } @@ -568,10 +860,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, { int err; - err = sigframe_alloc(user, &user->fpsimd_offset, - sizeof(struct fpsimd_context)); - if (err) - return err; + if (system_supports_fpsimd()) { + err = sigframe_alloc(user, &user->fpsimd_offset, + sizeof(struct fpsimd_context)); + if (err) + return err; + } /* fault information, if valid */ if (add_all || current->thread.fault_code) { @@ -581,14 +875,15 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } - if (system_supports_sve()) { + if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; - if (add_all || test_thread_flag(TIF_SVE)) { - int vl = sve_max_vl; + if (add_all || test_thread_flag(TIF_SVE) || + thread_sm_enabled(¤t->thread)) { + int vl = max(sve_max_vl(), sme_max_vl()); if (!add_all) - vl = current->thread.sve_vl; + vl = thread_get_cur_vl(¤t->thread); vq = sve_vq_from_vl(vl); } @@ -599,6 +894,40 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } + if (system_supports_tpidr2()) { + err = sigframe_alloc(user, &user->tpidr2_offset, + sizeof(struct tpidr2_context)); + if (err) + return err; + } + + if (system_supports_sme()) { + unsigned int vl; + unsigned int vq = 0; + + if (add_all) + vl = sme_max_vl(); + else + vl = task_get_sme_vl(current); + + if (thread_za_enabled(¤t->thread)) + vq = sve_vq_from_vl(vl); + + err = sigframe_alloc(user, &user->za_offset, + ZA_SIG_CONTEXT_SIZE(vq)); + if (err) + return err; + } + + if (system_supports_sme2()) { + if (add_all || thread_za_enabled(¤t->thread)) { + err = sigframe_alloc(user, &user->zt_offset, + ZT_SIG_CONTEXT_SIZE(1)); + if (err) + return err; + } + } + return sigframe_alloc_end(user); } @@ -623,7 +952,7 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); - if (err == 0) { + if (err == 0 && system_supports_fpsimd()) { struct fpsimd_context __user *fpsimd_ctx = apply_user_offset(user, user->fpsimd_offset); err |= preserve_fpsimd_context(fpsimd_ctx); @@ -639,13 +968,35 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } - /* Scalable Vector Extension state, if present */ - if (system_supports_sve() && err == 0 && user->sve_offset) { + /* Scalable Vector Extension state (including streaming), if present */ + if ((system_supports_sve() || system_supports_sme()) && + err == 0 && user->sve_offset) { struct sve_context __user *sve_ctx = apply_user_offset(user, user->sve_offset); err |= preserve_sve_context(sve_ctx); } + /* TPIDR2 if supported */ + if (system_supports_tpidr2() && err == 0) { + struct tpidr2_context __user *tpidr2_ctx = + apply_user_offset(user, user->tpidr2_offset); + err |= preserve_tpidr2_context(tpidr2_ctx); + } + + /* ZA state if present */ + if (system_supports_sme() && err == 0 && user->za_offset) { + struct za_context __user *za_ctx = + apply_user_offset(user, user->za_offset); + err |= preserve_za_context(za_ctx); + } + + /* ZT state if present */ + if (system_supports_sme2() && err == 0 && user->zt_offset) { + struct zt_context __user *zt_ctx = + apply_user_offset(user, user->zt_offset); + err |= preserve_zt_context(zt_ctx); + } + if (err == 0 && user->extra_offset) { char __user *sfp = (char __user *)user->sigframe; char __user *userp = @@ -730,6 +1081,44 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->regs[29] = (unsigned long)&user->next_frame->fp; regs->pc = (unsigned long)ka->sa.sa_handler; + /* + * Signal delivery is a (wacky) indirect function call in + * userspace, so simulate the same setting of BTYPE as a BLR + * <register containing the signal handler entry point>. + * Signal delivery to a location in a PROT_BTI guarded page + * that is not a function entry point will now trigger a + * SIGILL in userspace. + * + * If the signal handler entry point is not in a PROT_BTI + * guarded page, this is harmless. + */ + if (system_supports_bti()) { + regs->pstate &= ~PSR_BTYPE_MASK; + regs->pstate |= PSR_BTYPE_C; + } + + /* TCO (Tag Check Override) always cleared for signal handlers */ + regs->pstate &= ~PSR_TCO_BIT; + + /* Signal handlers are invoked with ZA and streaming mode disabled */ + if (system_supports_sme()) { + /* + * If we were in streaming mode the saved register + * state was SVE but we will exit SM and use the + * FPSIMD register state - flush the saved FPSIMD + * register state in case it gets loaded. + */ + if (current->thread.svcr & SVCR_SM_MASK) { + memset(¤t->thread.uw.fpsimd_state, 0, + sizeof(current->thread.uw.fpsimd_state)); + current->thread.fp_type = FP_STATE_FPSIMD; + } + + current->thread.svcr &= ~(SVCR_ZA_MASK | + SVCR_SM_MASK); + sme_smstop(); + } + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else @@ -782,7 +1171,6 @@ static void setup_restart_syscall(struct pt_regs *regs) */ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { - struct task_struct *tsk = current; sigset_t *oldset = sigmask_to_save(); int usig = ksig->sig; int ret; @@ -806,14 +1194,8 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) */ ret |= !valid_user_regs(®s->user_regs, current); - /* - * Fast forward the stepping logic so we step into the signal - * handler. - */ - if (!ret) - user_fastforward_single_step(tsk); - - signal_setup_done(ret, ksig, 0); + /* Step into the signal handler if we are stepping */ + signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP)); } /* @@ -875,7 +1257,7 @@ static void do_signal(struct pt_regs *regs) retval == -ERESTART_RESTARTBLOCK || (retval == -ERESTARTSYS && !(ksig.ka.sa.sa_flags & SA_RESTART)))) { - regs->regs[0] = -EINTR; + syscall_set_return_value(current, regs, -EINTR, 0); regs->pc = continue_addr; } @@ -896,20 +1278,9 @@ static void do_signal(struct pt_regs *regs) restore_saved_sigmask(); } -asmlinkage void do_notify_resume(struct pt_regs *regs, - unsigned long thread_flags) +void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { - /* - * The assembly code enters us with IRQs off, but it hasn't - * informed the tracing code of that for efficiency reasons. - * Update the trace code with the current status. - */ - trace_hardirqs_off(); - do { - /* Check valid user FS if needed */ - addr_limit_user_check(); - if (thread_flags & _TIF_NEED_RESCHED) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); @@ -921,21 +1292,24 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, if (thread_flags & _TIF_UPROBE) uprobe_notify_resume(regs); - if (thread_flags & _TIF_SIGPENDING) + if (thread_flags & _TIF_MTE_ASYNC_FAULT) { + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + send_sig_fault(SIGSEGV, SEGV_MTEAERR, + (void __user *)NULL, current); + } + + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); - if (thread_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } + if (thread_flags & _TIF_NOTIFY_RESUME) + resume_user_mode_work(regs); if (thread_flags & _TIF_FOREIGN_FPSTATE) fpsimd_restore_current_state(); } local_daif_mask(); - thread_flags = READ_ONCE(current_thread_info()->flags); + thread_flags = read_thread_flags(); } while (thread_flags & _TIF_WORK_MASK); } @@ -963,3 +1337,43 @@ void __init minsigstksz_setup(void) round_up(sizeof(struct frame_record), 16) + 16; /* max alignment padding */ } + +/* + * Compile-time assertions for siginfo_t offsets. Check NSIG* as well, as + * changes likely come with new fields that should be added below. + */ +static_assert(NSIGILL == 11); +static_assert(NSIGFPE == 15); +static_assert(NSIGSEGV == 10); +static_assert(NSIGBUS == 5); +static_assert(NSIGTRAP == 6); +static_assert(NSIGCHLD == 6); +static_assert(NSIGSYS == 2); +static_assert(sizeof(siginfo_t) == 128); +static_assert(__alignof__(siginfo_t) == 8); +static_assert(offsetof(siginfo_t, si_signo) == 0x00); +static_assert(offsetof(siginfo_t, si_errno) == 0x04); +static_assert(offsetof(siginfo_t, si_code) == 0x08); +static_assert(offsetof(siginfo_t, si_pid) == 0x10); +static_assert(offsetof(siginfo_t, si_uid) == 0x14); +static_assert(offsetof(siginfo_t, si_tid) == 0x10); +static_assert(offsetof(siginfo_t, si_overrun) == 0x14); +static_assert(offsetof(siginfo_t, si_status) == 0x18); +static_assert(offsetof(siginfo_t, si_utime) == 0x20); +static_assert(offsetof(siginfo_t, si_stime) == 0x28); +static_assert(offsetof(siginfo_t, si_value) == 0x18); +static_assert(offsetof(siginfo_t, si_int) == 0x18); +static_assert(offsetof(siginfo_t, si_ptr) == 0x18); +static_assert(offsetof(siginfo_t, si_addr) == 0x10); +static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18); +static_assert(offsetof(siginfo_t, si_lower) == 0x20); +static_assert(offsetof(siginfo_t, si_upper) == 0x28); +static_assert(offsetof(siginfo_t, si_pkey) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); +static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); +static_assert(offsetof(siginfo_t, si_band) == 0x10); +static_assert(offsetof(siginfo_t, si_fd) == 0x18); +static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); +static_assert(offsetof(siginfo_t, si_syscall) == 0x18); +static_assert(offsetof(siginfo_t, si_arch) == 0x1c); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 12a585386c2f..bbd542704730 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -46,8 +46,6 @@ struct compat_aux_sigframe { unsigned long end_magic; } __attribute__((__aligned__(8))); -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set) { compat_sigset_t cset; @@ -190,10 +188,8 @@ static int compat_restore_sigframe(struct pt_regs *regs, unsigned long psr; err = get_sigset_t(&set, &sf->uc.uc_sigmask); - if (err == 0) { - sigdelsetmask(&set, ~_BLOCKABLE); + if (err == 0) set_current_blocked(&set); - } __get_user_error(regs->regs[0], &sf->uc.uc_mcontext.arm_r0, err); __get_user_error(regs->regs[1], &sf->uc.uc_mcontext.arm_r1, err); @@ -223,7 +219,7 @@ static int compat_restore_sigframe(struct pt_regs *regs, err |= !valid_user_regs(®s->user_regs, current); aux = (struct compat_aux_sigframe __user *) sf->uc.uc_regspace; - if (err == 0) + if (err == 0 && system_supports_fpsimd()) err |= compat_restore_vfp_context(&aux->vfp); return err; @@ -342,38 +338,13 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, retcode = ptr_to_compat(ka->sa.sa_restorer); } else { /* Set up sigreturn pointer */ -#ifdef CONFIG_COMPAT_VDSO - void *vdso_base = current->mm->context.vdso; - void *vdso_trampoline; - - if (ka->sa.sa_flags & SA_SIGINFO) { - if (thumb) { - vdso_trampoline = VDSO_SYMBOL(vdso_base, - compat_rt_sigreturn_thumb); - } else { - vdso_trampoline = VDSO_SYMBOL(vdso_base, - compat_rt_sigreturn_arm); - } - } else { - if (thumb) { - vdso_trampoline = VDSO_SYMBOL(vdso_base, - compat_sigreturn_thumb); - } else { - vdso_trampoline = VDSO_SYMBOL(vdso_base, - compat_sigreturn_arm); - } - } - - retcode = ptr_to_compat(vdso_trampoline) + thumb; -#else unsigned int idx = thumb << 1; if (ka->sa.sa_flags & SA_SIGINFO) idx += 3; - retcode = (unsigned long)current->mm->context.vdso + + retcode = (unsigned long)current->mm->context.sigpage + (idx << 2) + thumb; -#endif } regs->regs[0] = usig; @@ -419,7 +390,7 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf, aux = (struct compat_aux_sigframe __user *) sf->uc.uc_regspace; - if (err == 0) + if (err == 0 && system_supports_fpsimd()) err |= compat_preserve_vfp_context(&aux->vfp); __put_user_error(0, &aux->end_magic, err); @@ -482,3 +453,43 @@ void compat_setup_restart_syscall(struct pt_regs *regs) { regs->regs[7] = __NR_compat_restart_syscall; } + +/* + * Compile-time assertions for siginfo_t offsets. Check NSIG* as well, as + * changes likely come with new fields that should be added below. + */ +static_assert(NSIGILL == 11); +static_assert(NSIGFPE == 15); +static_assert(NSIGSEGV == 10); +static_assert(NSIGBUS == 5); +static_assert(NSIGTRAP == 6); +static_assert(NSIGCHLD == 6); +static_assert(NSIGSYS == 2); +static_assert(sizeof(compat_siginfo_t) == 128); +static_assert(__alignof__(compat_siginfo_t) == 4); +static_assert(offsetof(compat_siginfo_t, si_signo) == 0x00); +static_assert(offsetof(compat_siginfo_t, si_errno) == 0x04); +static_assert(offsetof(compat_siginfo_t, si_code) == 0x08); +static_assert(offsetof(compat_siginfo_t, si_pid) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_uid) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_tid) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_overrun) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_status) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_utime) == 0x18); +static_assert(offsetof(compat_siginfo_t, si_stime) == 0x1c); +static_assert(offsetof(compat_siginfo_t, si_value) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_int) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_ptr) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_addr) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_addr_lsb) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_lower) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_upper) == 0x18); +static_assert(offsetof(compat_siginfo_t, si_pkey) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_data) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_flags) == 0x18); +static_assert(offsetof(compat_siginfo_t, si_band) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_fd) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_call_addr) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_syscall) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_arch) == 0x14); diff --git a/arch/arm64/kernel/sigreturn32.S b/arch/arm64/kernel/sigreturn32.S index 475d30d471ac..ccbd4aab4ba4 100644 --- a/arch/arm64/kernel/sigreturn32.S +++ b/arch/arm64/kernel/sigreturn32.S @@ -15,6 +15,7 @@ #include <asm/unistd.h> + .section .rodata .globl __aarch32_sigret_code_start __aarch32_sigret_code_start: diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index f5b04dd8a710..2aa5129d8253 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -3,6 +3,7 @@ #include <linux/linkage.h> #include <asm/asm-offsets.h> #include <asm/assembler.h> +#include <asm/smp.h> .text /* @@ -61,7 +62,7 @@ * * x0 = struct sleep_stack_data area */ -ENTRY(__cpu_suspend_enter) +SYM_FUNC_START(__cpu_suspend_enter) stp x29, lr, [x0, #SLEEP_STACK_DATA_CALLEE_REGS] stp x19, x20, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+16] stp x21, x22, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+32] @@ -94,22 +95,31 @@ ENTRY(__cpu_suspend_enter) ldp x29, lr, [sp], #16 mov x0, #1 ret -ENDPROC(__cpu_suspend_enter) +SYM_FUNC_END(__cpu_suspend_enter) - .pushsection ".idmap.text", "awx" -ENTRY(cpu_resume) - bl el2_setup // if in EL2 drop to EL1 cleanly + .pushsection ".idmap.text", "a" +SYM_CODE_START(cpu_resume) + mov x0, xzr + bl init_kernel_el + mov x19, x0 // preserve boot mode +#if VA_BITS > 48 + ldr_l x0, vabits_actual +#endif bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir + adrp x2, idmap_pg_dir bl __enable_mmu ldr x8, =_cpu_resume br x8 -ENDPROC(cpu_resume) +SYM_CODE_END(cpu_resume) .ltorg .popsection -ENTRY(_cpu_resume) +SYM_FUNC_START(_cpu_resume) + mov x0, x19 + bl finalise_el2 + mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address @@ -132,7 +142,7 @@ ENTRY(_cpu_resume) */ bl cpu_do_resume -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) mov x0, sp bl kasan_unpoison_task_stack_below #endif @@ -145,4 +155,4 @@ ENTRY(_cpu_resume) ldp x29, lr, [x29] mov x0, #0 ret -ENDPROC(_cpu_resume) +SYM_FUNC_END(_cpu_resume) diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S index 54655273d1e0..487381164ff6 100644 --- a/arch/arm64/kernel/smccc-call.S +++ b/arch/arm64/kernel/smccc-call.S @@ -7,21 +7,48 @@ #include <asm/asm-offsets.h> #include <asm/assembler.h> +#include <asm/thread_info.h> + +/* + * If we have SMCCC v1.3 and (as is likely) no SVE state in + * the registers then set the SMCCC hint bit to say there's no + * need to preserve it. Do this by directly adjusting the SMCCC + * function value which is already stored in x0 ready to be called. + */ +SYM_FUNC_START(__arm_smccc_sve_check) + + ldr_l x16, smccc_has_sve_hint + cbz x16, 2f + + get_current_task x16 + ldr x16, [x16, #TSK_TI_FLAGS] + tbnz x16, #TIF_FOREIGN_FPSTATE, 1f // Any live FP state? + tbnz x16, #TIF_SVE, 2f // Does that state include SVE? + +1: orr x0, x0, ARM_SMCCC_1_3_SVE_HINT + +2: ret +SYM_FUNC_END(__arm_smccc_sve_check) +EXPORT_SYMBOL(__arm_smccc_sve_check) .macro SMCCC instr - .cfi_startproc + stp x29, x30, [sp, #-16]! + mov x29, sp +alternative_if ARM64_SVE + bl __arm_smccc_sve_check +alternative_else_nop_endif \instr #0 - ldr x4, [sp] + ldr x4, [sp, #16] stp x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS] stp x2, x3, [x4, #ARM_SMCCC_RES_X2_OFFS] - ldr x4, [sp, #8] + ldr x4, [sp, #24] cbz x4, 1f /* no quirk structure */ ldr x9, [x4, #ARM_SMCCC_QUIRK_ID_OFFS] cmp x9, #ARM_SMCCC_QUIRK_QCOM_A6 b.ne 1f str x6, [x4, ARM_SMCCC_QUIRK_STATE_OFFS] -1: ret - .cfi_endproc +1: ldp x29, x30, [sp], #16 + ret .endm /* @@ -30,9 +57,9 @@ * unsigned long a6, unsigned long a7, struct arm_smccc_res *res, * struct arm_smccc_quirk *quirk) */ -ENTRY(__arm_smccc_smc) +SYM_FUNC_START(__arm_smccc_smc) SMCCC smc -ENDPROC(__arm_smccc_smc) +SYM_FUNC_END(__arm_smccc_smc) EXPORT_SYMBOL(__arm_smccc_smc) /* @@ -41,7 +68,64 @@ EXPORT_SYMBOL(__arm_smccc_smc) * unsigned long a6, unsigned long a7, struct arm_smccc_res *res, * struct arm_smccc_quirk *quirk) */ -ENTRY(__arm_smccc_hvc) +SYM_FUNC_START(__arm_smccc_hvc) SMCCC hvc -ENDPROC(__arm_smccc_hvc) +SYM_FUNC_END(__arm_smccc_hvc) EXPORT_SYMBOL(__arm_smccc_hvc) + + .macro SMCCC_1_2 instr + /* Save `res` and free a GPR that won't be clobbered */ + stp x1, x19, [sp, #-16]! + + /* Ensure `args` won't be clobbered while loading regs in next step */ + mov x19, x0 + + /* Load the registers x0 - x17 from the struct arm_smccc_1_2_regs */ + ldp x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS] + ldp x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS] + ldp x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS] + ldp x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS] + ldp x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS] + ldp x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS] + ldp x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS] + ldp x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS] + ldp x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS] + + \instr #0 + + /* Load the `res` from the stack */ + ldr x19, [sp] + + /* Store the registers x0 - x17 into the result structure */ + stp x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS] + stp x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS] + stp x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS] + stp x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS] + stp x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS] + stp x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS] + stp x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS] + stp x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS] + stp x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS] + + /* Restore original x19 */ + ldp xzr, x19, [sp], #16 + ret +.endm + +/* + * void arm_smccc_1_2_hvc(const struct arm_smccc_1_2_regs *args, + * struct arm_smccc_1_2_regs *res); + */ +SYM_FUNC_START(arm_smccc_1_2_hvc) + SMCCC_1_2 hvc +SYM_FUNC_END(arm_smccc_1_2_hvc) +EXPORT_SYMBOL(arm_smccc_1_2_hvc) + +/* + * void arm_smccc_1_2_smc(const struct arm_smccc_1_2_regs *args, + * struct arm_smccc_1_2_regs *res); + */ +SYM_FUNC_START(arm_smccc_1_2_smc) + SMCCC_1_2 smc +SYM_FUNC_END(arm_smccc_1_2_smc) +EXPORT_SYMBOL(arm_smccc_1_2_smc) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index d4ed9a19d8fe..4ced34f62dab 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -30,8 +30,11 @@ #include <linux/completion.h> #include <linux/of.h> #include <linux/irq_work.h> +#include <linux/kernel_stat.h> #include <linux/kexec.h> +#include <linux/kgdb.h> #include <linux/kvm_host.h> +#include <linux/nmi.h> #include <asm/alternative.h> #include <asm/atomic.h> @@ -43,8 +46,6 @@ #include <asm/kvm_mmu.h> #include <asm/mmu_context.h> #include <asm/numa.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/processor.h> #include <asm/smp_plat.h> #include <asm/sections.h> @@ -52,7 +53,6 @@ #include <asm/ptrace.h> #include <asm/virt.h> -#define CREATE_TRACE_POINTS #include <trace/events/ipi.h> DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); @@ -65,7 +65,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); */ struct secondary_data secondary_data; /* Number of CPUs which aren't online, but looping in kernel text. */ -int cpus_stuck_in_kernel; +static int cpus_stuck_in_kernel; enum ipi_msg_type { IPI_RESCHEDULE, @@ -74,10 +74,24 @@ enum ipi_msg_type { IPI_CPU_CRASH_STOP, IPI_TIMER, IPI_IRQ_WORK, - IPI_WAKEUP + NR_IPI, + /* + * Any enum >= NR_IPI and < MAX_IPI is special and not tracable + * with trace_ipi_* + */ + IPI_CPU_BACKTRACE = NR_IPI, + IPI_KGDB_ROUNDUP, + MAX_IPI }; +static int ipi_irq_base __ro_after_init; +static int nr_ipi __ro_after_init = NR_IPI; +static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init; + +static void ipi_setup(int cpu); + #ifdef CONFIG_HOTPLUG_CPU +static void ipi_teardown(int cpu); static int op_cpu_kill(unsigned int cpu); #else static inline int op_cpu_kill(unsigned int cpu) @@ -93,8 +107,10 @@ static inline int op_cpu_kill(unsigned int cpu) */ static int boot_secondary(unsigned int cpu, struct task_struct *idle) { - if (cpu_ops[cpu]->cpu_boot) - return cpu_ops[cpu]->cpu_boot(cpu); + const struct cpu_operations *ops = get_cpu_ops(cpu); + + if (ops->cpu_boot) + return ops->cpu_boot(cpu); return -EOPNOTSUPP; } @@ -111,67 +127,58 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * page tables. */ secondary_data.task = idle; - secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); - __flush_dcache_area(&secondary_data, sizeof(secondary_data)); - /* - * Now bring the CPU into our world. - */ + /* Now bring the CPU into our world */ ret = boot_secondary(cpu, idle); - if (ret == 0) { - /* - * CPU was successfully started, wait for it to come online or - * time out. - */ - wait_for_completion_timeout(&cpu_running, - msecs_to_jiffies(5000)); - - if (!cpu_online(cpu)) { - pr_crit("CPU%u: failed to come online\n", cpu); - ret = -EIO; - } - } else { + if (ret) { pr_err("CPU%u: failed to boot: %d\n", cpu, ret); return ret; } + /* + * CPU was successfully started, wait for it to come online or + * time out. + */ + wait_for_completion_timeout(&cpu_running, + msecs_to_jiffies(5000)); + if (cpu_online(cpu)) + return 0; + + pr_crit("CPU%u: failed to come online\n", cpu); secondary_data.task = NULL; - secondary_data.stack = NULL; - __flush_dcache_area(&secondary_data, sizeof(secondary_data)); status = READ_ONCE(secondary_data.status); - if (ret && status) { + if (status == CPU_MMU_OFF) + status = READ_ONCE(__early_cpu_boot_status); - if (status == CPU_MMU_OFF) - status = READ_ONCE(__early_cpu_boot_status); - - switch (status & CPU_BOOT_STATUS_MASK) { - default: - pr_err("CPU%u: failed in unknown state : 0x%lx\n", - cpu, status); - cpus_stuck_in_kernel++; - break; - case CPU_KILL_ME: - if (!op_cpu_kill(cpu)) { - pr_crit("CPU%u: died during early boot\n", cpu); - break; - } - pr_crit("CPU%u: may not have shut down cleanly\n", cpu); - /* Fall through */ - case CPU_STUCK_IN_KERNEL: - pr_crit("CPU%u: is stuck in kernel\n", cpu); - if (status & CPU_STUCK_REASON_52_BIT_VA) - pr_crit("CPU%u: does not support 52-bit VAs\n", cpu); - if (status & CPU_STUCK_REASON_NO_GRAN) - pr_crit("CPU%u: does not support %luK granule \n", cpu, PAGE_SIZE / SZ_1K); - cpus_stuck_in_kernel++; + switch (status & CPU_BOOT_STATUS_MASK) { + default: + pr_err("CPU%u: failed in unknown state : 0x%lx\n", + cpu, status); + cpus_stuck_in_kernel++; + break; + case CPU_KILL_ME: + if (!op_cpu_kill(cpu)) { + pr_crit("CPU%u: died during early boot\n", cpu); break; - case CPU_PANIC_KERNEL: - panic("CPU%u detected unsupported configuration\n", cpu); } + pr_crit("CPU%u: may not have shut down cleanly\n", cpu); + fallthrough; + case CPU_STUCK_IN_KERNEL: + pr_crit("CPU%u: is stuck in kernel\n", cpu); + if (status & CPU_STUCK_REASON_52_BIT_VA) + pr_crit("CPU%u: does not support 52-bit VAs\n", cpu); + if (status & CPU_STUCK_REASON_NO_GRAN) { + pr_crit("CPU%u: does not support %luK granule\n", + cpu, PAGE_SIZE / SZ_1K); + } + cpus_stuck_in_kernel++; + break; + case CPU_PANIC_KERNEL: + panic("CPU%u detected unsupported configuration\n", cpu); } - return ret; + return -EIO; } static void init_gic_priority_masking(void) @@ -184,6 +191,7 @@ static void init_gic_priority_masking(void) cpuflags = read_sysreg(daif); WARN_ON(!(cpuflags & PSR_I_BIT)); + WARN_ON(!(cpuflags & PSR_F_BIT)); gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); } @@ -196,10 +204,8 @@ asmlinkage notrace void secondary_start_kernel(void) { u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; struct mm_struct *mm = &init_mm; - unsigned int cpu; - - cpu = task_cpu(current); - set_my_cpu_offset(per_cpu_offset(cpu)); + const struct cpu_operations *ops; + unsigned int cpu = smp_processor_id(); /* * All kernel threads share the same mm context; grab a @@ -217,7 +223,7 @@ asmlinkage notrace void secondary_start_kernel(void) if (system_uses_irq_prio_masking()) init_gic_priority_masking(); - preempt_disable(); + rcutree_report_cpu_starting(cpu); trace_hardirqs_off(); /* @@ -227,20 +233,23 @@ asmlinkage notrace void secondary_start_kernel(void) */ check_local_cpu_capabilities(); - if (cpu_ops[cpu]->cpu_postboot) - cpu_ops[cpu]->cpu_postboot(); + ops = get_cpu_ops(cpu); + if (ops->cpu_postboot) + ops->cpu_postboot(); /* * Log the CPU info before it is marked online and might get read. */ cpuinfo_store_cpu(); + store_cpu_topology(cpu); /* * Enable GIC and timers. */ notify_cpu_starting(cpu); - store_cpu_topology(cpu); + ipi_setup(cpu); + numa_add_cpu(cpu); /* @@ -266,19 +275,21 @@ asmlinkage notrace void secondary_start_kernel(void) #ifdef CONFIG_HOTPLUG_CPU static int op_cpu_disable(unsigned int cpu) { + const struct cpu_operations *ops = get_cpu_ops(cpu); + /* * If we don't have a cpu_die method, abort before we reach the point * of no return. CPU0 may not have an cpu_ops, so test for it. */ - if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_die) + if (!ops || !ops->cpu_die) return -EOPNOTSUPP; /* * We may need to abort a hot unplug for some other mechanism-specific * reason. */ - if (cpu_ops[cpu]->cpu_disable) - return cpu_ops[cpu]->cpu_disable(cpu); + if (ops->cpu_disable) + return ops->cpu_disable(cpu); return 0; } @@ -303,6 +314,7 @@ int __cpu_disable(void) * and we must not schedule until we're ready to give up the cpu. */ set_cpu_online(cpu, false); + ipi_teardown(cpu); /* * OK - migrate IRQs away from this CPU @@ -314,30 +326,28 @@ int __cpu_disable(void) static int op_cpu_kill(unsigned int cpu) { + const struct cpu_operations *ops = get_cpu_ops(cpu); + /* * If we have no means of synchronising with the dying CPU, then assume * that it is really dead. We can only wait for an arbitrary length of * time and hope that it's dead, so let's skip the wait and just hope. */ - if (!cpu_ops[cpu]->cpu_kill) + if (!ops->cpu_kill) return 0; - return cpu_ops[cpu]->cpu_kill(cpu); + return ops->cpu_kill(cpu); } /* - * called on the thread which is asking for a CPU to be shutdown - - * waits until shutdown has completed, or it is timed out. + * Called on the thread which is asking for a CPU to be shutdown after the + * shutdown completed. */ -void __cpu_die(unsigned int cpu) +void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { int err; - if (!cpu_wait_death(cpu, 5)) { - pr_crit("CPU%u: cpu didn't die\n", cpu); - return; - } - pr_notice("CPU%u: shutdown\n", cpu); + pr_debug("CPU%u: shutdown\n", cpu); /* * Now that the dying CPU is beyond the point of no return w.r.t. @@ -354,33 +364,44 @@ void __cpu_die(unsigned int cpu) * Called from the idle thread for the CPU which has been shutdown. * */ -void cpu_die(void) +void __noreturn cpu_die(void) { unsigned int cpu = smp_processor_id(); + const struct cpu_operations *ops = get_cpu_ops(cpu); idle_task_exit(); local_daif_mask(); - /* Tell __cpu_die() that this CPU is now safe to dispose of */ - (void)cpu_report_death(); + /* Tell cpuhp_bp_sync_dead() that this CPU is now safe to dispose of */ + cpuhp_ap_report_dead(); /* * Actually shutdown the CPU. This must never fail. The specific hotplug * mechanism must perform all required cache maintenance to ensure that * no dirty lines are lost in the process of shutting down the CPU. */ - cpu_ops[cpu]->cpu_die(cpu); + ops->cpu_die(cpu); BUG(); } #endif +static void __cpu_try_die(int cpu) +{ +#ifdef CONFIG_HOTPLUG_CPU + const struct cpu_operations *ops = get_cpu_ops(cpu); + + if (ops && ops->cpu_die) + ops->cpu_die(cpu); +#endif +} + /* * Kill the calling secondary CPU, early in bringup before it is turned * online. */ -void cpu_die_early(void) +void __noreturn cpu_die_early(void) { int cpu = smp_processor_id(); @@ -388,13 +409,13 @@ void cpu_die_early(void) /* Mark this CPU absent */ set_cpu_present(cpu, 0); + rcutree_report_cpu_dead(); + + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) { + update_cpu_boot_status(CPU_KILL_ME); + __cpu_try_die(cpu); + } -#ifdef CONFIG_HOTPLUG_CPU - update_cpu_boot_status(CPU_KILL_ME); - /* Check if we can park ourselves */ - if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_die) - cpu_ops[cpu]->cpu_die(cpu); -#endif update_cpu_boot_status(CPU_STUCK_IN_KERNEL); cpu_park_loop(); @@ -409,61 +430,38 @@ static void __init hyp_mode_check(void) "CPU: CPUs started in inconsistent modes"); else pr_info("CPU: All CPU(s) started at EL1\n"); - if (IS_ENABLED(CONFIG_KVM_ARM_HOST)) + if (IS_ENABLED(CONFIG_KVM) && !is_kernel_in_hyp_mode()) { kvm_compute_layout(); + kvm_apply_hyp_relocations(); + } } void __init smp_cpus_done(unsigned int max_cpus) { pr_info("SMP: Total of %d processors activated.\n", num_online_cpus()); - setup_cpu_features(); hyp_mode_check(); - apply_alternatives_all(); + setup_system_features(); + setup_user_features(); mark_linear_text_alias_ro(); } void __init smp_prepare_boot_cpu(void) { - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - cpuinfo_store_boot_cpu(); - /* - * We now know enough about the boot CPU to apply the - * alternatives that cannot wait until interrupt handling - * and/or scheduling is enabled. + * The runtime per-cpu areas have been allocated by + * setup_per_cpu_areas(), and CPU0's boot time per-cpu area will be + * freed shortly, so we must move over to the runtime per-cpu area. */ - apply_boot_alternatives(); + set_my_cpu_offset(per_cpu_offset(smp_processor_id())); + + cpuinfo_store_boot_cpu(); + setup_boot_cpu_features(); /* Conditionally switch to GIC PMR for interrupt masking */ if (system_uses_irq_prio_masking()) init_gic_priority_masking(); -} - -static u64 __init of_get_cpu_mpidr(struct device_node *dn) -{ - const __be32 *cell; - u64 hwid; - - /* - * A cpu node with missing "reg" property is - * considered invalid to build a cpu_logical_map - * entry. - */ - cell = of_get_property(dn, "reg", NULL); - if (!cell) { - pr_err("%pOF: missing reg property\n", dn); - return INVALID_HWID; - } - hwid = of_read_number(cell, of_n_addr_cells(dn)); - /* - * Non affinity bits must be set to 0 in the DT - */ - if (hwid & ~MPIDR_HWID_BITMASK) { - pr_err("%pOF: invalid reg property\n", dn); - return INVALID_HWID; - } - return hwid; + kasan_init_hw_tags(); } /* @@ -488,10 +486,13 @@ static bool __init is_mpidr_duplicate(unsigned int cpu, u64 hwid) */ static int __init smp_cpu_setup(int cpu) { - if (cpu_read_ops(cpu)) + const struct cpu_operations *ops; + + if (init_cpu_ops(cpu)) return -ENODEV; - if (cpu_ops[cpu]->cpu_init(cpu)) + ops = get_cpu_ops(cpu); + if (ops->cpu_init(cpu)) return -ENODEV; set_cpu_possible(cpu, true); @@ -509,6 +510,7 @@ struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu) { return &cpu_madt_gicc[cpu]; } +EXPORT_SYMBOL_GPL(acpi_cpu_get_madt_gicc); /* * acpi_map_gic_cpu_interface - parse processor MADT entry @@ -521,7 +523,7 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) { u64 hwid = processor->arm_mpidr; - if (!(processor->flags & ACPI_MADT_ENABLED)) { + if (!acpi_gicc_is_usable(processor)) { pr_debug("skipping disabled CPU entry with 0x%llx MPIDR\n", hwid); return; } @@ -552,7 +554,7 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) return; /* map the logical cpu id to cpu MPIDR */ - cpu_logical_map(cpu_count) = hwid; + set_cpu_logical_map(cpu_count, hwid); cpu_madt_gicc[cpu_count] = *processor; @@ -626,9 +628,9 @@ static void __init of_parse_and_init_cpus(void) struct device_node *dn; for_each_of_cpu_node(dn) { - u64 hwid = of_get_cpu_mpidr(dn); + u64 hwid = of_get_cpu_hwid(dn, 0); - if (hwid == INVALID_HWID) + if (hwid & ~MPIDR_HWID_BITMASK) goto next; if (is_mpidr_duplicate(cpu_count, hwid)) { @@ -666,7 +668,7 @@ static void __init of_parse_and_init_cpus(void) goto next; pr_debug("cpu logical map 0x%llx\n", hwid); - cpu_logical_map(cpu_count) = hwid; + set_cpu_logical_map(cpu_count, hwid); early_map_cpu_to_node(cpu_count, of_node_to_nid(dn)); next: @@ -707,13 +709,14 @@ void __init smp_init_cpus(void) for (i = 1; i < nr_cpu_ids; i++) { if (cpu_logical_map(i) != INVALID_HWID) { if (smp_cpu_setup(i)) - cpu_logical_map(i) = INVALID_HWID; + set_cpu_logical_map(i, INVALID_HWID); } } } void __init smp_prepare_cpus(unsigned int max_cpus) { + const struct cpu_operations *ops; int err; unsigned int cpu; unsigned int this_cpu; @@ -744,10 +747,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (cpu == smp_processor_id()) continue; - if (!cpu_ops[cpu]) + ops = get_cpu_ops(cpu); + if (!ops) continue; - err = cpu_ops[cpu]->cpu_prepare(cpu); + err = ops->cpu_prepare(cpu); if (err) continue; @@ -756,31 +760,20 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } } -void (*__smp_cross_call)(const struct cpumask *, unsigned int); - -void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int)) -{ - __smp_cross_call = fn; -} - static const char *ipi_types[NR_IPI] __tracepoint_string = { -#define S(x,s) [x] = s - S(IPI_RESCHEDULE, "Rescheduling interrupts"), - S(IPI_CALL_FUNC, "Function call interrupts"), - S(IPI_CPU_STOP, "CPU stop interrupts"), - S(IPI_CPU_CRASH_STOP, "CPU stop (for crash dump) interrupts"), - S(IPI_TIMER, "Timer broadcast interrupts"), - S(IPI_IRQ_WORK, "IRQ work interrupts"), - S(IPI_WAKEUP, "CPU wake-up interrupts"), + [IPI_RESCHEDULE] = "Rescheduling interrupts", + [IPI_CALL_FUNC] = "Function call interrupts", + [IPI_CPU_STOP] = "CPU stop interrupts", + [IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts", + [IPI_TIMER] = "Timer broadcast interrupts", + [IPI_IRQ_WORK] = "IRQ work interrupts", }; -static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) -{ - trace_ipi_raise(target, ipi_types[ipinr]); - __smp_cross_call(target, ipinr); -} +static void smp_cross_call(const struct cpumask *target, unsigned int ipinr); + +unsigned long irq_err_count; -void show_ipi_list(struct seq_file *p, int prec) +int arch_show_interrupts(struct seq_file *p, int prec) { unsigned int cpu, i; @@ -788,21 +781,12 @@ void show_ipi_list(struct seq_file *p, int prec) seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i, prec >= 4 ? " " : ""); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", - __get_irq_stat(cpu, ipi_irqs[i])); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu)); seq_printf(p, " %s\n", ipi_types[i]); } -} - -u64 smp_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = 0; - int i; - for (i = 0; i < NR_IPI; i++) - sum += __get_irq_stat(cpu, ipi_irqs[i]); - - return sum; + seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count); + return 0; } void arch_send_call_function_ipi_mask(const struct cpumask *mask) @@ -815,22 +799,14 @@ void arch_send_call_function_single_ipi(int cpu) smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC); } -#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL -void arch_send_wakeup_ipi_mask(const struct cpumask *mask) -{ - smp_cross_call(mask, IPI_WAKEUP); -} -#endif - #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { - if (__smp_cross_call) - smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); + smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); } #endif -static void local_cpu_stop(void) +static void __noreturn local_cpu_stop(void) { set_cpu_online(smp_processor_id(), false); @@ -844,7 +820,7 @@ static void local_cpu_stop(void) * that cpu_online_mask gets correctly updated and smp_send_stop() can skip * CPUs that have already stopped themselves. */ -void panic_smp_self_stop(void) +void __noreturn panic_smp_self_stop(void) { local_cpu_stop(); } @@ -853,7 +829,7 @@ void panic_smp_self_stop(void) static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0); #endif -static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) +static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) { #ifdef CONFIG_KEXEC_CORE crash_save_cpu(regs, cpu); @@ -863,28 +839,57 @@ static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) local_irq_disable(); sdei_mask_local_cpu(); -#ifdef CONFIG_HOTPLUG_CPU - if (cpu_ops[cpu]->cpu_die) - cpu_ops[cpu]->cpu_die(cpu); -#endif + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) + __cpu_try_die(cpu); /* just in case */ cpu_park_loop(); +#else + BUG(); #endif } +static void arm64_backtrace_ipi(cpumask_t *mask) +{ + __ipi_send_mask(ipi_desc[IPI_CPU_BACKTRACE], mask); +} + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu) +{ + /* + * NOTE: though nmi_trigger_cpumask_backtrace() has "nmi_" in the name, + * nothing about it truly needs to be implemented using an NMI, it's + * just that it's _allowed_ to work with NMIs. If ipi_should_be_nmi() + * returned false our backtrace attempt will just use a regular IPI. + */ + nmi_trigger_cpumask_backtrace(mask, exclude_cpu, arm64_backtrace_ipi); +} + +#ifdef CONFIG_KGDB +void kgdb_roundup_cpus(void) +{ + int this_cpu = raw_smp_processor_id(); + int cpu; + + for_each_online_cpu(cpu) { + /* No need to roundup ourselves */ + if (cpu == this_cpu) + continue; + + __ipi_send_single(ipi_desc[IPI_KGDB_ROUNDUP], cpu); + } +} +#endif + /* * Main handler for inter-processor interrupts */ -void handle_IPI(int ipinr, struct pt_regs *regs) +static void do_handle_IPI(int ipinr) { unsigned int cpu = smp_processor_id(); - struct pt_regs *old_regs = set_irq_regs(regs); - if ((unsigned)ipinr < NR_IPI) { - trace_ipi_entry_rcuidle(ipi_types[ipinr]); - __inc_irq_stat(cpu, ipi_irqs[ipinr]); - } + if ((unsigned)ipinr < NR_IPI) + trace_ipi_entry(ipi_types[ipinr]); switch (ipinr) { case IPI_RESCHEDULE: @@ -892,21 +897,16 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; case IPI_CALL_FUNC: - irq_enter(); generic_smp_call_function_interrupt(); - irq_exit(); break; case IPI_CPU_STOP: - irq_enter(); local_cpu_stop(); - irq_exit(); break; case IPI_CPU_CRASH_STOP: if (IS_ENABLED(CONFIG_KEXEC_CORE)) { - irq_enter(); - ipi_cpu_crash_stop(cpu, regs); + ipi_cpu_crash_stop(cpu, get_irq_regs()); unreachable(); } @@ -914,27 +914,27 @@ void handle_IPI(int ipinr, struct pt_regs *regs) #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST case IPI_TIMER: - irq_enter(); tick_receive_broadcast(); - irq_exit(); break; #endif #ifdef CONFIG_IRQ_WORK case IPI_IRQ_WORK: - irq_enter(); irq_work_run(); - irq_exit(); break; #endif -#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL - case IPI_WAKEUP: - WARN_ONCE(!acpi_parking_protocol_valid(cpu), - "CPU%u: Wake-up IPI outside the ACPI parking protocol\n", - cpu); + case IPI_CPU_BACKTRACE: + /* + * NOTE: in some cases this _won't_ be NMI context. See the + * comment in arch_trigger_cpumask_backtrace(). + */ + nmi_cpu_backtrace(get_irq_regs()); + break; + + case IPI_KGDB_ROUNDUP: + kgdb_nmicallback(cpu, get_irq_regs()); break; -#endif default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); @@ -942,15 +942,121 @@ void handle_IPI(int ipinr, struct pt_regs *regs) } if ((unsigned)ipinr < NR_IPI) - trace_ipi_exit_rcuidle(ipi_types[ipinr]); - set_irq_regs(old_regs); + trace_ipi_exit(ipi_types[ipinr]); +} + +static irqreturn_t ipi_handler(int irq, void *data) +{ + do_handle_IPI(irq - ipi_irq_base); + return IRQ_HANDLED; +} + +static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) +{ + trace_ipi_raise(target, ipi_types[ipinr]); + __ipi_send_mask(ipi_desc[ipinr], target); +} + +static bool ipi_should_be_nmi(enum ipi_msg_type ipi) +{ + if (!system_uses_irq_prio_masking()) + return false; + + switch (ipi) { + case IPI_CPU_STOP: + case IPI_CPU_CRASH_STOP: + case IPI_CPU_BACKTRACE: + case IPI_KGDB_ROUNDUP: + return true; + default: + return false; + } +} + +static void ipi_setup(int cpu) +{ + int i; + + if (WARN_ON_ONCE(!ipi_irq_base)) + return; + + for (i = 0; i < nr_ipi; i++) { + if (ipi_should_be_nmi(i)) { + prepare_percpu_nmi(ipi_irq_base + i); + enable_percpu_nmi(ipi_irq_base + i, 0); + } else { + enable_percpu_irq(ipi_irq_base + i, 0); + } + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static void ipi_teardown(int cpu) +{ + int i; + + if (WARN_ON_ONCE(!ipi_irq_base)) + return; + + for (i = 0; i < nr_ipi; i++) { + if (ipi_should_be_nmi(i)) { + disable_percpu_nmi(ipi_irq_base + i); + teardown_percpu_nmi(ipi_irq_base + i); + } else { + disable_percpu_irq(ipi_irq_base + i); + } + } +} +#endif + +void __init set_smp_ipi_range(int ipi_base, int n) +{ + int i; + + WARN_ON(n < MAX_IPI); + nr_ipi = min(n, MAX_IPI); + + for (i = 0; i < nr_ipi; i++) { + int err; + + if (ipi_should_be_nmi(i)) { + err = request_percpu_nmi(ipi_base + i, ipi_handler, + "IPI", &cpu_number); + WARN(err, "Could not request IPI %d as NMI, err=%d\n", + i, err); + } else { + err = request_percpu_irq(ipi_base + i, ipi_handler, + "IPI", &cpu_number); + WARN(err, "Could not request IPI %d as IRQ, err=%d\n", + i, err); + } + + ipi_desc[i] = irq_to_desc(ipi_base + i); + irq_set_status_flags(ipi_base + i, IRQ_HIDDEN); + } + + ipi_irq_base = ipi_base; + + /* Setup the boot CPU immediately */ + ipi_setup(smp_processor_id()); } -void smp_send_reschedule(int cpu) +void arch_smp_send_reschedule(int cpu) { smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE); } +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +void arch_send_wakeup_ipi(unsigned int cpu) +{ + /* + * We use a scheduler IPI to wake the CPU as this avoids the need for a + * dedicated IPI and we can safely handle spurious scheduler IPIs. + */ + smp_send_reschedule(cpu); +} +#endif + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST void tick_broadcast(const struct cpumask *mask) { @@ -958,11 +1064,22 @@ void tick_broadcast(const struct cpumask *mask) } #endif +/* + * The number of CPUs online, not counting this CPU (which may not be + * fully online and so not counted in num_online_cpus()). + */ +static inline unsigned int num_other_online_cpus(void) +{ + unsigned int this_cpu_online = cpu_online(smp_processor_id()); + + return num_online_cpus() - this_cpu_online; +} + void smp_send_stop(void) { unsigned long timeout; - if (num_online_cpus() > 1) { + if (num_other_online_cpus()) { cpumask_t mask; cpumask_copy(&mask, cpu_online_mask); @@ -975,10 +1092,10 @@ void smp_send_stop(void) /* Wait up to one second for other CPUs to stop */ timeout = USEC_PER_SEC; - while (num_online_cpus() > 1 && timeout--) + while (num_other_online_cpus() && timeout--) udelay(1); - if (num_online_cpus() > 1) + if (num_other_online_cpus()) pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", cpumask_pr_args(cpu_online_mask)); @@ -1001,15 +1118,17 @@ void crash_smp_send_stop(void) cpus_stopped = 1; - if (num_online_cpus() == 1) { - sdei_mask_local_cpu(); - return; - } + /* + * If this cpu is the only one alive at this point in time, online or + * not, there are no stop messages to be sent around, so just back out. + */ + if (num_other_online_cpus() == 0) + goto skip_ipi; cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); - atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + atomic_set(&waiting_for_crash_ipi, num_other_online_cpus()); pr_crit("SMP: stopping secondary CPUs\n"); smp_cross_call(&mask, IPI_CPU_CRASH_STOP); @@ -1023,7 +1142,9 @@ void crash_smp_send_stop(void) pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", cpumask_pr_args(&mask)); +skip_ipi: sdei_mask_local_cpu(); + sdei_handler_abort(); } bool smp_crash_stop_failed(void) @@ -1032,20 +1153,13 @@ bool smp_crash_stop_failed(void) } #endif -/* - * not supported here - */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - static bool have_cpu_die(void) { #ifdef CONFIG_HOTPLUG_CPU int any_cpu = raw_smp_processor_id(); + const struct cpu_operations *ops = get_cpu_ops(any_cpu); - if (cpu_ops[any_cpu] && cpu_ops[any_cpu]->cpu_die) + if (ops && ops->cpu_die) return true; #endif return false; @@ -1055,5 +1169,6 @@ bool cpus_are_stuck_in_kernel(void) { bool smp_spin_tables = (num_possible_cpus() > 1 && !have_cpu_die()); - return !!cpus_stuck_in_kernel || smp_spin_tables; + return !!cpus_stuck_in_kernel || smp_spin_tables || + is_protected_kvm_enabled(); } diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index c8a3fee00c11..49029eace3ad 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -19,7 +19,7 @@ #include <asm/smp_plat.h> extern void secondary_holding_pen(void); -volatile unsigned long __section(.mmuoff.data.read) +volatile unsigned long __section(".mmuoff.data.read") secondary_holding_pen_release = INVALID_HWID; static phys_addr_t cpu_release_addr[NR_CPUS]; @@ -36,7 +36,7 @@ static void write_pen_release(u64 val) unsigned long size = sizeof(secondary_holding_pen_release); secondary_holding_pen_release = val; - __flush_dcache_area(start, size); + dcache_clean_inval_poc((unsigned long)start, (unsigned long)start + size); } @@ -66,6 +66,7 @@ static int smp_spin_table_cpu_init(unsigned int cpu) static int smp_spin_table_cpu_prepare(unsigned int cpu) { __le64 __iomem *release_addr; + phys_addr_t pa_holding_pen = __pa_symbol(secondary_holding_pen); if (!cpu_release_addr[cpu]) return -ENODEV; @@ -83,14 +84,15 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) /* * We write the release address as LE regardless of the native - * endianess of the kernel. Therefore, any boot-loaders that + * endianness of the kernel. Therefore, any boot-loaders that * read this address need to convert this address to the - * boot-loader's endianess before jumping. This is mandated by + * boot-loader's endianness before jumping. This is mandated by * the boot protocol. */ - writeq_relaxed(__pa_symbol(secondary_holding_pen), release_addr); - __flush_dcache_area((__force void *)release_addr, - sizeof(*release_addr)); + writeq_relaxed(pa_holding_pen, release_addr); + dcache_clean_inval_poc((__force unsigned long)release_addr, + (__force unsigned long)release_addr + + sizeof(*release_addr)); /* * Send an event to wake up the secondary CPU. diff --git a/arch/arm64/kernel/ssbd.c b/arch/arm64/kernel/ssbd.c deleted file mode 100644 index 52cfc6148355..000000000000 --- a/arch/arm64/kernel/ssbd.c +++ /dev/null @@ -1,129 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2018 ARM Ltd, All Rights Reserved. - */ - -#include <linux/compat.h> -#include <linux/errno.h> -#include <linux/prctl.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/thread_info.h> - -#include <asm/cpufeature.h> - -static void ssbd_ssbs_enable(struct task_struct *task) -{ - u64 val = is_compat_thread(task_thread_info(task)) ? - PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; - - task_pt_regs(task)->pstate |= val; -} - -static void ssbd_ssbs_disable(struct task_struct *task) -{ - u64 val = is_compat_thread(task_thread_info(task)) ? - PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; - - task_pt_regs(task)->pstate &= ~val; -} - -/* - * prctl interface for SSBD - */ -static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) -{ - int state = arm64_get_ssbd_state(); - - /* Unsupported */ - if (state == ARM64_SSBD_UNKNOWN) - return -EINVAL; - - /* Treat the unaffected/mitigated state separately */ - if (state == ARM64_SSBD_MITIGATED) { - switch (ctrl) { - case PR_SPEC_ENABLE: - return -EPERM; - case PR_SPEC_DISABLE: - case PR_SPEC_FORCE_DISABLE: - return 0; - } - } - - /* - * Things are a bit backward here: the arm64 internal API - * *enables the mitigation* when the userspace API *disables - * speculation*. So much fun. - */ - switch (ctrl) { - case PR_SPEC_ENABLE: - /* If speculation is force disabled, enable is not allowed */ - if (state == ARM64_SSBD_FORCE_ENABLE || - task_spec_ssb_force_disable(task)) - return -EPERM; - task_clear_spec_ssb_disable(task); - clear_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_enable(task); - break; - case PR_SPEC_DISABLE: - if (state == ARM64_SSBD_FORCE_DISABLE) - return -EPERM; - task_set_spec_ssb_disable(task); - set_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_disable(task); - break; - case PR_SPEC_FORCE_DISABLE: - if (state == ARM64_SSBD_FORCE_DISABLE) - return -EPERM; - task_set_spec_ssb_disable(task); - task_set_spec_ssb_force_disable(task); - set_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_disable(task); - break; - default: - return -ERANGE; - } - - return 0; -} - -int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, - unsigned long ctrl) -{ - switch (which) { - case PR_SPEC_STORE_BYPASS: - return ssbd_prctl_set(task, ctrl); - default: - return -ENODEV; - } -} - -static int ssbd_prctl_get(struct task_struct *task) -{ - switch (arm64_get_ssbd_state()) { - case ARM64_SSBD_UNKNOWN: - return -EINVAL; - case ARM64_SSBD_FORCE_ENABLE: - return PR_SPEC_DISABLE; - case ARM64_SSBD_KERNEL: - if (task_spec_ssb_force_disable(task)) - return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; - if (task_spec_ssb_disable(task)) - return PR_SPEC_PRCTL | PR_SPEC_DISABLE; - return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - case ARM64_SSBD_FORCE_DISABLE: - return PR_SPEC_ENABLE; - default: - return PR_SPEC_NOT_AFFECTED; - } -} - -int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) -{ - switch (which) { - case PR_SPEC_STORE_BYPASS: - return ssbd_prctl_get(task); - default: - return -ENODEV; - } -} diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index a336cb124320..7f88028a00c0 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -5,6 +5,7 @@ * Copyright (C) 2012 ARM Ltd. */ #include <linux/kernel.h> +#include <linux/efi.h> #include <linux/export.h> #include <linux/ftrace.h> #include <linux/kprobes.h> @@ -13,200 +14,287 @@ #include <linux/sched/task_stack.h> #include <linux/stacktrace.h> +#include <asm/efi.h> #include <asm/irq.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> /* - * AArch64 PCS assigns the frame pointer to x29. + * Kernel unwind state * - * A simple function prologue looks like this: - * sub sp, sp, #0x10 - * stp x29, x30, [sp] - * mov x29, sp - * - * A simple function epilogue looks like this: - * mov sp, x29 - * ldp x29, x30, [sp] - * add sp, sp, #0x10 + * @common: Common unwind state. + * @task: The task being unwound. + * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance + * associated with the most recently encountered replacement lr + * value. */ +struct kunwind_state { + struct unwind_state common; + struct task_struct *task; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif +}; + +static __always_inline void +kunwind_init(struct kunwind_state *state, + struct task_struct *task) +{ + unwind_init_common(&state->common); + state->task = task; +} /* - * Unwind from one frame record (A) to the next frame record (B). + * Start an unwind from a pt_regs. * - * We terminate early if the location of B indicates a malformed chain of frame - * records (e.g. a cycle), determined based on the location and fp value of A - * and the location (but not the fp value) of B. + * The unwind will begin at the PC within the regs. + * + * The regs must be on a stack currently owned by the calling task. */ -int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) +static __always_inline void +kunwind_init_from_regs(struct kunwind_state *state, + struct pt_regs *regs) { - unsigned long fp = frame->fp; - struct stack_info info; + kunwind_init(state, current); - if (fp & 0xf) - return -EINVAL; + state->common.fp = regs->regs[29]; + state->common.pc = regs->pc; +} - if (!tsk) - tsk = current; +/* + * Start an unwind from a caller. + * + * The unwind will begin at the caller of whichever function this is inlined + * into. + * + * The function which invokes this must be noinline. + */ +static __always_inline void +kunwind_init_from_caller(struct kunwind_state *state) +{ + kunwind_init(state, current); - if (!on_accessible_stack(tsk, fp, &info)) - return -EINVAL; - - if (test_bit(info.type, frame->stacks_done)) - return -EINVAL; - - /* - * As stacks grow downward, any valid record on the same stack must be - * at a strictly higher address than the prior record. - * - * Stacks can nest in several valid orders, e.g. - * - * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL - * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW - * - * ... but the nesting itself is strict. Once we transition from one - * stack to another, it's never valid to unwind back to that first - * stack. - */ - if (info.type == frame->prev_type) { - if (fp <= frame->prev_fp) - return -EINVAL; - } else { - set_bit(frame->prev_type, frame->stacks_done); - } + state->common.fp = (unsigned long)__builtin_frame_address(1); + state->common.pc = (unsigned long)__builtin_return_address(0); +} + +/* + * Start an unwind from a blocked task. + * + * The unwind will begin at the blocked tasks saved PC (i.e. the caller of + * cpu_switch_to()). + * + * The caller should ensure the task is blocked in cpu_switch_to() for the + * duration of the unwind, or the unwind will be bogus. It is never valid to + * call this for the current task. + */ +static __always_inline void +kunwind_init_from_task(struct kunwind_state *state, + struct task_struct *task) +{ + kunwind_init(state, task); - /* - * Record this frame record's values and location. The prev_fp and - * prev_type are only meaningful to the next unwind_frame() invocation. - */ - frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); - frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); - frame->prev_fp = fp; - frame->prev_type = info.type; + state->common.fp = thread_saved_fp(task); + state->common.pc = thread_saved_pc(task); +} +static __always_inline int +kunwind_recover_return_address(struct kunwind_state *state) +{ #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (tsk->ret_stack && - (frame->pc == (unsigned long)return_to_handler)) { - struct ftrace_ret_stack *ret_stack; - /* - * This is a case where function graph tracer has - * modified a return address (LR) in a stack frame - * to hook a function return. - * So replace it to an original value. - */ - ret_stack = ftrace_graph_get_ret_stack(tsk, frame->graph++); - if (WARN_ON_ONCE(!ret_stack)) + if (state->task->ret_stack && + (state->common.pc == (unsigned long)return_to_handler)) { + unsigned long orig_pc; + orig_pc = ftrace_graph_ret_addr(state->task, NULL, + state->common.pc, + (void *)state->common.fp); + if (WARN_ON_ONCE(state->common.pc == orig_pc)) return -EINVAL; - frame->pc = ret_stack->ret; + state->common.pc = orig_pc; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - /* - * Frames created upon entry from EL0 have NULL FP and PC values, so - * don't bother reporting these. Frames created by __noreturn functions - * might have a valid FP even if PC is bogus, so only terminate where - * both are NULL. - */ - if (!frame->fp && !frame->pc) - return -EINVAL; +#ifdef CONFIG_KRETPROBES + if (is_kretprobe_trampoline(state->common.pc)) { + unsigned long orig_pc; + orig_pc = kretprobe_find_ret_addr(state->task, + (void *)state->common.fp, + &state->kr_cur); + state->common.pc = orig_pc; + } +#endif /* CONFIG_KRETPROBES */ return 0; } -NOKPROBE_SYMBOL(unwind_frame); -void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data) +/* + * Unwind from one frame record (A) to the next frame record (B). + * + * We terminate early if the location of B indicates a malformed chain of frame + * records (e.g. a cycle), determined based on the location and fp value of A + * and the location (but not the fp value) of B. + */ +static __always_inline int +kunwind_next(struct kunwind_state *state) { + struct task_struct *tsk = state->task; + unsigned long fp = state->common.fp; + int err; + + /* Final frame; nothing to unwind */ + if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) + return -ENOENT; + + err = unwind_next_frame_record(&state->common); + if (err) + return err; + + state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc); + + return kunwind_recover_return_address(state); +} + +typedef bool (*kunwind_consume_fn)(const struct kunwind_state *state, void *cookie); + +static __always_inline void +do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state, + void *cookie) +{ + if (kunwind_recover_return_address(state)) + return; + while (1) { int ret; - if (fn(frame, data)) + if (!consume_state(state, cookie)) break; - ret = unwind_frame(tsk, frame); + ret = kunwind_next(state); if (ret < 0) break; } } -NOKPROBE_SYMBOL(walk_stackframe); -#ifdef CONFIG_STACKTRACE -struct stack_trace_data { - struct stack_trace *trace; - unsigned int no_sched_functions; - unsigned int skip; -}; +/* + * Per-cpu stacks are only accessible when unwinding the current task in a + * non-preemptible context. + */ +#define STACKINFO_CPU(name) \ + ({ \ + ((task == current) && !preemptible()) \ + ? stackinfo_get_##name() \ + : stackinfo_get_unknown(); \ + }) + +/* + * SDEI stacks are only accessible when unwinding the current task in an NMI + * context. + */ +#define STACKINFO_SDEI(name) \ + ({ \ + ((task == current) && in_nmi()) \ + ? stackinfo_get_sdei_##name() \ + : stackinfo_get_unknown(); \ + }) -static int save_trace(struct stackframe *frame, void *d) +#define STACKINFO_EFI \ + ({ \ + ((task == current) && current_in_efi()) \ + ? stackinfo_get_efi() \ + : stackinfo_get_unknown(); \ + }) + +static __always_inline void +kunwind_stack_walk(kunwind_consume_fn consume_state, + void *cookie, struct task_struct *task, + struct pt_regs *regs) { - struct stack_trace_data *data = d; - struct stack_trace *trace = data->trace; - unsigned long addr = frame->pc; - - if (data->no_sched_functions && in_sched_functions(addr)) - return 0; - if (data->skip) { - data->skip--; - return 0; + struct stack_info stacks[] = { + stackinfo_get_task(task), + STACKINFO_CPU(irq), +#if defined(CONFIG_VMAP_STACK) + STACKINFO_CPU(overflow), +#endif +#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_ARM_SDE_INTERFACE) + STACKINFO_SDEI(normal), + STACKINFO_SDEI(critical), +#endif +#ifdef CONFIG_EFI + STACKINFO_EFI, +#endif + }; + struct kunwind_state state = { + .common = { + .stacks = stacks, + .nr_stacks = ARRAY_SIZE(stacks), + }, + }; + + if (regs) { + if (task != current) + return; + kunwind_init_from_regs(&state, regs); + } else if (task == current) { + kunwind_init_from_caller(&state); + } else { + kunwind_init_from_task(&state, task); } - trace->entries[trace->nr_entries++] = addr; + do_kunwind(&state, consume_state, cookie); +} - return trace->nr_entries >= trace->max_entries; +struct kunwind_consume_entry_data { + stack_trace_consume_fn consume_entry; + void *cookie; +}; + +static bool +arch_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + struct kunwind_consume_entry_data *data = cookie; + return data->consume_entry(data->cookie, state->common.pc); } -void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) +noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) { - struct stack_trace_data data; - struct stackframe frame; + struct kunwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; - data.trace = trace; - data.skip = trace->skip; - data.no_sched_functions = 0; + kunwind_stack_walk(arch_kunwind_consume_entry, &data, task, regs); +} - start_backtrace(&frame, regs->regs[29], regs->pc); - walk_stackframe(current, &frame, save_trace, &data); +static bool dump_backtrace_entry(void *arg, unsigned long where) +{ + char *loglvl = arg; + printk("%s %pSb\n", loglvl, (void *)where); + return true; } -EXPORT_SYMBOL_GPL(save_stack_trace_regs); -static noinline void __save_stack_trace(struct task_struct *tsk, - struct stack_trace *trace, unsigned int nosched) +void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { - struct stack_trace_data data; - struct stackframe frame; + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); - if (!try_get_task_stack(tsk)) + if (regs && user_mode(regs)) return; - data.trace = trace; - data.skip = trace->skip; - data.no_sched_functions = nosched; + if (!tsk) + tsk = current; - if (tsk != current) { - start_backtrace(&frame, thread_saved_fp(tsk), - thread_saved_pc(tsk)); - } else { - /* We don't want this function nor the caller */ - data.skip += 2; - start_backtrace(&frame, - (unsigned long)__builtin_frame_address(0), - (unsigned long)__save_stack_trace); - } + if (!try_get_task_stack(tsk)) + return; - walk_stackframe(tsk, &frame, save_trace, &data); + printk("%sCall trace:\n", loglvl); + arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); put_task_stack(tsk); } -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { - __save_stack_trace(tsk, trace, 1); + dump_backtrace(NULL, tsk, loglvl); + barrier(); } - -void save_stack_trace(struct stack_trace *trace) -{ - __save_stack_trace(current, trace, 0); -} - -EXPORT_SYMBOL_GPL(save_stack_trace); -#endif diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 9405d1b7f4b0..eca4d0435211 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -3,13 +3,16 @@ #include <linux/percpu.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/pgtable.h> +#include <linux/cpuidle.h> #include <asm/alternative.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> +#include <asm/cpuidle.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/exec.h> -#include <asm/pgtable.h> +#include <asm/mte.h> #include <asm/memory.h> #include <asm/mmu_context.h> #include <asm/smp_plat.h> @@ -41,6 +44,8 @@ void notrace __cpu_suspend_exit(void) { unsigned int cpu = smp_processor_id(); + mte_suspend_exit(); + /* * We are resuming from reset with the idmap active in TTBR0_EL1. * We must uninstall the idmap and restore the expected MMU @@ -50,14 +55,15 @@ void notrace __cpu_suspend_exit(void) /* Restore CnP bit in TTBR1_EL1 */ if (system_supports_cnp()) - cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); + cpu_enable_swapper_cnp(); /* * PSTATE was not saved over suspend/resume, re-enable any detected * features that might not have been set correctly. */ + if (alternative_has_cap_unlikely(ARM64_HAS_DIT)) + set_pstate_dit(1); __uaccess_enable_hw_pan(); - uao_thread_switch(current); /* * Restore HW breakpoint registers to sane values @@ -72,8 +78,10 @@ void notrace __cpu_suspend_exit(void) * have turned the mitigation on. If the user has forcefully * disabled it, make sure their wishes are obeyed. */ - if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) - arm64_set_ssbd_mitigation(false); + spectre_v4_enable_mitigation(NULL); + + /* Restore additional feature-specific configuration */ + ptrauth_suspend_exit(); } /* @@ -88,21 +96,46 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) int ret = 0; unsigned long flags; struct sleep_stack_data state; + struct arm_cpuidle_irq_context context; + + /* + * Some portions of CPU state (e.g. PSTATE.{PAN,DIT}) are initialized + * before alternatives are patched, but are only restored by + * __cpu_suspend_exit() after alternatives are patched. To avoid + * accidentally losing these bits we must not attempt to suspend until + * after alternatives have been patched. + */ + WARN_ON(!system_capabilities_finalized()); + + /* Report any MTE async fault before going to suspend */ + mte_suspend_enter(); /* * From this point debug exceptions are disabled to prevent * updates to mdscr register (saved and restored along with * general purpose registers) from kernel debuggers. + * + * Strictly speaking the trace_hardirqs_off() here is superfluous, + * hardirqs should be firmly off by now. This really ought to use + * something like raw_local_daif_save(). */ flags = local_daif_save(); /* - * Function graph tracer state gets incosistent when the kernel + * Function graph tracer state gets inconsistent when the kernel * calls functions that never return (aka suspend finishers) hence * disable graph tracing during their execution. */ pause_graph_tracing(); + /* + * Switch to using DAIF.IF instead of PMR in order to reliably + * resume if we're using pseudo-NMIs. + */ + arm_cpuidle_save_irq_context(&context); + + ct_cpuidle_enter(); + if (__cpu_suspend_enter(&state)) { /* Call the suspend finisher */ ret = fn(arg); @@ -116,16 +149,21 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) */ if (!ret) ret = -EOPNOTSUPP; + + ct_cpuidle_exit(); } else { + ct_cpuidle_exit(); __cpu_suspend_exit(); } + arm_cpuidle_restore_irq_context(&context); + unpause_graph_tracing(); /* * Restore pstate flags. OS lock and mdscr have been already * restored, so from this point onwards, debugging is fully - * renabled if it was enabled when core started shutdown. + * reenabled if it was enabled when core started shutdown. */ local_daif_restore(flags); diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 3c18c2454089..4a609e9b65de 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -9,7 +9,6 @@ #include <linux/compat.h> #include <linux/cpufeature.h> -#include <linux/personality.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/slab.h> @@ -32,7 +31,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end) if (fatal_signal_pending(current)) return 0; - if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) { /* * The workaround requires an inner-shareable tlbi. * We pick the reserved-ASID to minimise the impact. @@ -41,7 +40,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end) dsb(ish); } - ret = __flush_cache_user_range(start, start + chunk); + ret = caches_clean_inval_user_pou(start, start + chunk); if (ret) return ret; @@ -68,7 +67,7 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags) */ long compat_arm_syscall(struct pt_regs *regs, int scno) { - void __user *addr; + unsigned long addr; switch (scno) { /* @@ -111,10 +110,9 @@ long compat_arm_syscall(struct pt_regs *regs, int scno) break; } - addr = (void __user *)instruction_pointer(regs) - - (compat_thumb_mode(regs) ? 2 : 4); + addr = instruction_pointer(regs) - (compat_thumb_mode(regs) ? 2 : 4); arm64_notify_die("Oops - bad compat syscall(2)", regs, - SIGILL, ILL_ILLTRP, addr, scno); + SIGILL, ILL_ILLTRP, addr, 0); return 0; } diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 9a9d98a443fc..9a70d9746b66 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -5,10 +5,11 @@ #include <linux/errno.h> #include <linux/nospec.h> #include <linux/ptrace.h> +#include <linux/randomize_kstack.h> #include <linux/syscalls.h> -#include <asm/daifflags.h> #include <asm/debug-monitors.h> +#include <asm/exception.h> #include <asm/fpsimd.h> #include <asm/syscall.h> #include <asm/thread_info.h> @@ -42,6 +43,8 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, { long ret; + add_random_kstack_offset(); + if (scno < sc_nr) { syscall_fn_t syscall_fn; syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; @@ -50,7 +53,20 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, ret = do_ni_syscall(regs, scno); } - regs->regs[0] = ret; + syscall_set_return_value(current, regs, 0, ret); + + /* + * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), + * but not enough for arm64 stack utilization comfort. To keep + * reasonable stack head room, reduce the maximum offset to 9 bits. + * + * The actual entropy will be further reduced by the compiler when + * applying stack alignment constraints: the AAPCS mandates a + * 16-byte (i.e. 4-bit) aligned SP at function boundaries. + * + * The resulting 5 bits of entropy is seen in SP[8:4]. + */ + choose_random_kstack_offset(get_random_u16() & 0x1FF); } static inline bool has_syscall_work(unsigned long flags) @@ -58,54 +74,60 @@ static inline bool has_syscall_work(unsigned long flags) return unlikely(flags & _TIF_SYSCALL_WORK); } -int syscall_trace_enter(struct pt_regs *regs); -void syscall_trace_exit(struct pt_regs *regs); - -#ifdef CONFIG_ARM64_ERRATUM_1463225 -DECLARE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); - -static void cortex_a76_erratum_1463225_svc_handler(void) -{ - u32 reg, val; - - if (!unlikely(test_thread_flag(TIF_SINGLESTEP))) - return; - - if (!unlikely(this_cpu_has_cap(ARM64_WORKAROUND_1463225))) - return; - - __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 1); - reg = read_sysreg(mdscr_el1); - val = reg | DBG_MDSCR_SS | DBG_MDSCR_KDE; - write_sysreg(val, mdscr_el1); - asm volatile("msr daifclr, #8"); - isb(); - - /* We will have taken a single-step exception by this point */ - - write_sysreg(reg, mdscr_el1); - __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 0); -} -#else -static void cortex_a76_erratum_1463225_svc_handler(void) { } -#endif /* CONFIG_ARM64_ERRATUM_1463225 */ - static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, const syscall_fn_t syscall_table[]) { - unsigned long flags = current_thread_info()->flags; + unsigned long flags = read_thread_flags(); regs->orig_x0 = regs->regs[0]; regs->syscallno = scno; - cortex_a76_erratum_1463225_svc_handler(); - local_daif_restore(DAIF_PROCCTX); - user_exit(); + /* + * BTI note: + * The architecture does not guarantee that SPSR.BTYPE is zero + * on taking an SVC, so we could return to userspace with a + * non-zero BTYPE after the syscall. + * + * This shouldn't matter except when userspace is explicitly + * doing something stupid, such as setting PROT_BTI on a page + * that lacks conforming BTI/PACIxSP instructions, falling + * through from one executable page to another with differing + * PROT_BTI, or messing with BTYPE via ptrace: in such cases, + * userspace should not be surprised if a SIGILL occurs on + * syscall return. + * + * So, don't touch regs->pstate & PSR_BTYPE_MASK here. + * (Similarly for HVC and SMC elsewhere.) + */ + + if (flags & _TIF_MTE_ASYNC_FAULT) { + /* + * Process the asynchronous tag check fault before the actual + * syscall. do_notify_resume() will send a signal to userspace + * before the syscall is restarted. + */ + syscall_set_return_value(current, regs, -ERESTARTNOINTR, 0); + return; + } if (has_syscall_work(flags)) { - /* set default errno for user-issued syscall(-1) */ + /* + * The de-facto standard way to skip a system call using ptrace + * is to set the system call to -1 (NO_SYSCALL) and set x0 to a + * suitable error code for consumption by userspace. However, + * this cannot be distinguished from a user-issued syscall(-1) + * and so we must set x0 to -ENOSYS here in case the tracer doesn't + * issue the skip and we fall into trace_exit with x0 preserved. + * + * This is slightly odd because it also means that if a tracer + * sets the system call number to -1 but does not initialise x0, + * then x0 will be preserved for all system calls apart from a + * user-issued syscall(-1). However, requesting a skip and not + * setting the return value is unlikely to do anything sensible + * anyway. + */ if (scno == NO_SYSCALL) - regs->regs[0] = -ENOSYS; + syscall_set_return_value(current, regs, -ENOSYS, 0); scno = syscall_trace_enter(regs); if (scno == NO_SYSCALL) goto trace_exit; @@ -119,49 +141,22 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, * exit regardless, as the old entry assembly did. */ if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - local_daif_mask(); - flags = current_thread_info()->flags; - if (!has_syscall_work(flags)) { - /* - * We're off to userspace, where interrupts are - * always enabled after we restore the flags from - * the SPSR. - */ - trace_hardirqs_on(); + flags = read_thread_flags(); + if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP)) return; - } - local_daif_restore(DAIF_PROCCTX); } trace_exit: syscall_trace_exit(regs); } -static inline void sve_user_discard(void) -{ - if (!system_supports_sve()) - return; - - clear_thread_flag(TIF_SVE); - - /* - * task_fpsimd_load() won't be called to update CPACR_EL1 in - * ret_to_user unless TIF_FOREIGN_FPSTATE is still set, which only - * happens if a context switch or kernel_neon_begin() or context - * modification (sigreturn, ptrace) intervenes. - * So, ensure that CPACR_EL1 is already correct for the fast-path case. - */ - sve_user_disable(); -} - -void el0_svc_handler(struct pt_regs *regs) +void do_el0_svc(struct pt_regs *regs) { - sve_user_discard(); el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table); } #ifdef CONFIG_COMPAT -void el0_svc_compat_handler(struct pt_regs *regs) +void do_el0_svc_compat(struct pt_regs *regs) { el0_svc_common(regs, regs->regs[7], __NR_compat_syscalls, compat_sys_call_table); diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 73f06d4b3aae..b5855eb7435d 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -18,36 +18,37 @@ #include <linux/timex.h> #include <linux/errno.h> #include <linux/profile.h> +#include <linux/stacktrace.h> #include <linux/syscore_ops.h> #include <linux/timer.h> #include <linux/irq.h> #include <linux/delay.h> #include <linux/clocksource.h> -#include <linux/clk-provider.h> +#include <linux/of_clk.h> #include <linux/acpi.h> #include <clocksource/arm_arch_timer.h> #include <asm/thread_info.h> -#include <asm/stacktrace.h> #include <asm/paravirt.h> -unsigned long profile_pc(struct pt_regs *regs) +static bool profile_pc_cb(void *arg, unsigned long pc) { - struct stackframe frame; + unsigned long *prof_pc = arg; - if (!in_lock_functions(regs->pc)) - return regs->pc; + if (in_lock_functions(pc)) + return true; + *prof_pc = pc; + return false; +} - start_backtrace(&frame, regs->regs[29], regs->pc); +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long prof_pc = 0; - do { - int ret = unwind_frame(NULL, &frame); - if (ret < 0) - return 0; - } while (in_lock_functions(frame.pc)); + arch_stack_walk(profile_pc_cb, &prof_pc, current, regs); - return frame.pc; + return prof_pc; } EXPORT_SYMBOL(profile_pc); diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index fa9528dfd0ce..1a2c72f3e7f8 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -14,6 +14,7 @@ #include <linux/acpi.h> #include <linux/arch_topology.h> #include <linux/cacheinfo.h> +#include <linux/cpufreq.h> #include <linux/init.h> #include <linux/percpu.h> @@ -21,44 +22,6 @@ #include <asm/cputype.h> #include <asm/topology.h> -void store_cpu_topology(unsigned int cpuid) -{ - struct cpu_topology *cpuid_topo = &cpu_topology[cpuid]; - u64 mpidr; - - if (cpuid_topo->package_id != -1) - goto topology_populated; - - mpidr = read_cpuid_mpidr(); - - /* Uniprocessor systems can rely on default topology values */ - if (mpidr & MPIDR_UP_BITMASK) - return; - - /* Create cpu topology mapping based on MPIDR. */ - if (mpidr & MPIDR_MT_BITMASK) { - /* Multiprocessor system : Multi-threads per core */ - cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1); - cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) | - MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8; - } else { - /* Multiprocessor system : Single-thread per core */ - cpuid_topo->thread_id = -1; - cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) | - MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 | - MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16; - } - - pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n", - cpuid, cpuid_topo->package_id, cpuid_topo->core_id, - cpuid_topo->thread_id, mpidr); - -topology_populated: - update_siblings_masks(cpuid); -} - #ifdef CONFIG_ACPI static bool __init acpi_cpu_is_threaded(int cpu) { @@ -86,8 +49,6 @@ int __init parse_acpi_topology(void) return 0; for_each_possible_cpu(cpu) { - int i, cache_id; - topology_id = find_acpi_cpu_topology(cpu, 0); if (topology_id < 0) return topology_id; @@ -100,24 +61,288 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].thread_id = -1; cpu_topology[cpu].core_id = topology_id; } + topology_id = find_acpi_cpu_topology_cluster(cpu); + cpu_topology[cpu].cluster_id = topology_id; topology_id = find_acpi_cpu_topology_package(cpu); cpu_topology[cpu].package_id = topology_id; + } + + return 0; +} +#endif - i = acpi_find_last_cache_level(cpu); +#ifdef CONFIG_ARM64_AMU_EXTN +#define read_corecnt() read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0) +#define read_constcnt() read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0) +#else +#define read_corecnt() (0UL) +#define read_constcnt() (0UL) +#endif - if (i > 0) { - /* - * this is the only part of cpu_topology that has - * a direct relationship with the cache topology - */ - cache_id = find_acpi_cpu_cache_topology(cpu, i); - if (cache_id > 0) - cpu_topology[cpu].llc_id = cache_id; - } +#undef pr_fmt +#define pr_fmt(fmt) "AMU: " fmt + +/* + * Ensure that amu_scale_freq_tick() will return SCHED_CAPACITY_SCALE until + * the CPU capacity and its associated frequency have been correctly + * initialized. + */ +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT); +static DEFINE_PER_CPU(u64, arch_const_cycles_prev); +static DEFINE_PER_CPU(u64, arch_core_cycles_prev); +static cpumask_var_t amu_fie_cpus; + +void update_freq_counters_refs(void) +{ + this_cpu_write(arch_core_cycles_prev, read_corecnt()); + this_cpu_write(arch_const_cycles_prev, read_constcnt()); +} + +static inline bool freq_counters_valid(int cpu) +{ + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) + return false; + + if (!cpu_has_amu_feat(cpu)) { + pr_debug("CPU%d: counters are not supported.\n", cpu); + return false; + } + + if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) || + !per_cpu(arch_core_cycles_prev, cpu))) { + pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); + return false; } + return true; +} + +void freq_inv_set_max_ratio(int cpu, u64 max_rate) +{ + u64 ratio, ref_rate = arch_timer_get_rate(); + + if (unlikely(!max_rate || !ref_rate)) { + WARN_ONCE(1, "CPU%d: invalid maximum or reference frequency.\n", + cpu); + return; + } + + /* + * Pre-compute the fixed ratio between the frequency of the constant + * reference counter and the maximum frequency of the CPU. + * + * ref_rate + * arch_max_freq_scale = ---------- * SCHED_CAPACITY_SCALE² + * max_rate + * + * We use a factor of 2 * SCHED_CAPACITY_SHIFT -> SCHED_CAPACITY_SCALE² + * in order to ensure a good resolution for arch_max_freq_scale for + * very low reference frequencies (down to the KHz range which should + * be unlikely). + */ + ratio = ref_rate << (2 * SCHED_CAPACITY_SHIFT); + ratio = div64_u64(ratio, max_rate); + if (!ratio) { + WARN_ONCE(1, "Reference frequency too low.\n"); + return; + } + + WRITE_ONCE(per_cpu(arch_max_freq_scale, cpu), (unsigned long)ratio); +} + +static void amu_scale_freq_tick(void) +{ + u64 prev_core_cnt, prev_const_cnt; + u64 core_cnt, const_cnt, scale; + + prev_const_cnt = this_cpu_read(arch_const_cycles_prev); + prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + + update_freq_counters_refs(); + + const_cnt = this_cpu_read(arch_const_cycles_prev); + core_cnt = this_cpu_read(arch_core_cycles_prev); + + if (unlikely(core_cnt <= prev_core_cnt || + const_cnt <= prev_const_cnt)) + return; + + /* + * /\core arch_max_freq_scale + * scale = ------- * -------------------- + * /\const SCHED_CAPACITY_SCALE + * + * See validate_cpu_freq_invariance_counters() for details on + * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. + */ + scale = core_cnt - prev_core_cnt; + scale *= this_cpu_read(arch_max_freq_scale); + scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, + const_cnt - prev_const_cnt); + + scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); + this_cpu_write(arch_freq_scale, (unsigned long)scale); +} + +static struct scale_freq_data amu_sfd = { + .source = SCALE_FREQ_SOURCE_ARCH, + .set_freq_scale = amu_scale_freq_tick, +}; + +static void amu_fie_setup(const struct cpumask *cpus) +{ + int cpu; + + /* We are already set since the last insmod of cpufreq driver */ + if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + return; + + for_each_cpu(cpu, cpus) { + if (!freq_counters_valid(cpu)) + return; + } + + cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); + + topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); + + pr_debug("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(cpus)); +} + +static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + + if (val == CPUFREQ_CREATE_POLICY) + amu_fie_setup(policy->related_cpus); + + /* + * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU + * counters don't have any dependency on cpufreq driver once we have + * initialized AMU support and enabled invariance. The AMU counters will + * keep on working just fine in the absence of the cpufreq driver, and + * for the CPUs for which there are no counters available, the last set + * value of arch_freq_scale will remain valid as that is the frequency + * those CPUs are running at. + */ + return 0; } -#endif +static struct notifier_block init_amu_fie_notifier = { + .notifier_call = init_amu_fie_callback, +}; + +static int __init init_amu_fie(void) +{ + int ret; + + if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) + return -ENOMEM; + + ret = cpufreq_register_notifier(&init_amu_fie_notifier, + CPUFREQ_POLICY_NOTIFIER); + if (ret) + free_cpumask_var(amu_fie_cpus); + + return ret; +} +core_initcall(init_amu_fie); + +#ifdef CONFIG_ACPI_CPPC_LIB +#include <acpi/cppc_acpi.h> +static void cpu_read_corecnt(void *val) +{ + /* + * A value of 0 can be returned if the current CPU does not support AMUs + * or if the counter is disabled for this CPU. A return value of 0 at + * counter read is properly handled as an error case by the users of the + * counter. + */ + *(u64 *)val = read_corecnt(); +} + +static void cpu_read_constcnt(void *val) +{ + /* + * Return 0 if the current CPU is affected by erratum 2457168. A value + * of 0 is also returned if the current CPU does not support AMUs or if + * the counter is disabled. A return value of 0 at counter read is + * properly handled as an error case by the users of the counter. + */ + *(u64 *)val = this_cpu_has_cap(ARM64_WORKAROUND_2457168) ? + 0UL : read_constcnt(); +} + +static inline +int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val) +{ + /* + * Abort call on counterless CPU or when interrupts are + * disabled - can lead to deadlock in smp sync call. + */ + if (!cpu_has_amu_feat(cpu)) + return -EOPNOTSUPP; + + if (WARN_ON_ONCE(irqs_disabled())) + return -EPERM; + + smp_call_function_single(cpu, func, val, 1); + + return 0; +} + +/* + * Refer to drivers/acpi/cppc_acpi.c for the description of the functions + * below. + */ +bool cpc_ffh_supported(void) +{ + int cpu = get_cpu_with_amu_feat(); + + /* + * FFH is considered supported if there is at least one present CPU that + * supports AMUs. Using FFH to read core and reference counters for CPUs + * that do not support AMUs, have counters disabled or that are affected + * by errata, will result in a return value of 0. + * + * This is done to allow any enabled and valid counters to be read + * through FFH, knowing that potentially returning 0 as counter value is + * properly handled by the users of these counters. + */ + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) + return false; + + return true; +} + +int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val) +{ + int ret = -EOPNOTSUPP; + + switch ((u64)reg->address) { + case 0x0: + ret = counters_read_on_cpu(cpu, cpu_read_corecnt, val); + break; + case 0x1: + ret = counters_read_on_cpu(cpu, cpu_read_constcnt, val); + break; + } + + if (!ret) { + *val &= GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + *val >>= reg->bit_offset; + } + + return ret; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_ACPI_CPPC_LIB */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 73caf35c2262..215e6d7f2df8 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -9,7 +9,6 @@ #include <linux/bug.h> #include <linux/context_tracking.h> #include <linux/signal.h> -#include <linux/personality.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/spinlock.h> @@ -19,6 +18,7 @@ #include <linux/module.h> #include <linux/kexec.h> #include <linux/delay.h> +#include <linux/efi.h> #include <linux/init.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> @@ -27,134 +27,167 @@ #include <linux/syscalls.h> #include <linux/mm_types.h> #include <linux/kasan.h> +#include <linux/ubsan.h> +#include <linux/cfi.h> #include <asm/atomic.h> #include <asm/bug.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> +#include <asm/efi.h> #include <asm/esr.h> +#include <asm/exception.h> +#include <asm/extable.h> #include <asm/insn.h> #include <asm/kprobes.h> +#include <asm/patching.h> #include <asm/traps.h> #include <asm/smp.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> -#include <asm/exception.h> #include <asm/system_misc.h> #include <asm/sysreg.h> -static const char *handler[]= { - "Synchronous Abort", - "IRQ", - "FIQ", - "Error" -}; +static bool __kprobes __check_eq(unsigned long pstate) +{ + return (pstate & PSR_Z_BIT) != 0; +} -int show_unhandled_signals = 0; +static bool __kprobes __check_ne(unsigned long pstate) +{ + return (pstate & PSR_Z_BIT) == 0; +} -static void dump_backtrace_entry(unsigned long where) +static bool __kprobes __check_cs(unsigned long pstate) { - printk(" %pS\n", (void *)where); + return (pstate & PSR_C_BIT) != 0; } -static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) +static bool __kprobes __check_cc(unsigned long pstate) { - unsigned long addr = instruction_pointer(regs); - char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; - int i; + return (pstate & PSR_C_BIT) == 0; +} - if (user_mode(regs)) - return; +static bool __kprobes __check_mi(unsigned long pstate) +{ + return (pstate & PSR_N_BIT) != 0; +} - for (i = -4; i < 1; i++) { - unsigned int val, bad; +static bool __kprobes __check_pl(unsigned long pstate) +{ + return (pstate & PSR_N_BIT) == 0; +} - bad = aarch64_insn_read(&((u32 *)addr)[i], &val); +static bool __kprobes __check_vs(unsigned long pstate) +{ + return (pstate & PSR_V_BIT) != 0; +} - if (!bad) - p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); - else { - p += sprintf(p, "bad PC value"); - break; - } - } +static bool __kprobes __check_vc(unsigned long pstate) +{ + return (pstate & PSR_V_BIT) == 0; +} - printk("%sCode: %s\n", lvl, str); +static bool __kprobes __check_hi(unsigned long pstate) +{ + pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ + return (pstate & PSR_C_BIT) != 0; } -void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static bool __kprobes __check_ls(unsigned long pstate) { - struct stackframe frame; - int skip = 0; + pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ + return (pstate & PSR_C_BIT) == 0; +} - pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); +static bool __kprobes __check_ge(unsigned long pstate) +{ + pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + return (pstate & PSR_N_BIT) == 0; +} - if (regs) { - if (user_mode(regs)) - return; - skip = 1; - } +static bool __kprobes __check_lt(unsigned long pstate) +{ + pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + return (pstate & PSR_N_BIT) != 0; +} - if (!tsk) - tsk = current; +static bool __kprobes __check_gt(unsigned long pstate) +{ + /*PSR_N_BIT ^= PSR_V_BIT */ + unsigned long temp = pstate ^ (pstate << 3); - if (!try_get_task_stack(tsk)) - return; + temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ + return (temp & PSR_N_BIT) == 0; +} - if (tsk == current) { - start_backtrace(&frame, - (unsigned long)__builtin_frame_address(0), - (unsigned long)dump_backtrace); - } else { - /* - * task blocked in __switch_to - */ - start_backtrace(&frame, - thread_saved_fp(tsk), - thread_saved_pc(tsk)); - } +static bool __kprobes __check_le(unsigned long pstate) +{ + /*PSR_N_BIT ^= PSR_V_BIT */ + unsigned long temp = pstate ^ (pstate << 3); - printk("Call trace:\n"); - do { - /* skip until specified stack frame */ - if (!skip) { - dump_backtrace_entry(frame.pc); - } else if (frame.fp == regs->regs[29]) { - skip = 0; - /* - * Mostly, this is the case where this function is - * called in panic/abort. As exception handler's - * stack frame does not contain the corresponding pc - * at which an exception has taken place, use regs->pc - * instead. - */ - dump_backtrace_entry(regs->pc); - } - } while (!unwind_frame(tsk, &frame)); + temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ + return (temp & PSR_N_BIT) != 0; +} - put_task_stack(tsk); +static bool __kprobes __check_al(unsigned long pstate) +{ + return true; } -void show_stack(struct task_struct *tsk, unsigned long *sp) +/* + * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that + * it behaves identically to 0b1110 ("al"). + */ +pstate_check_t * const aarch32_opcode_cond_checks[16] = { + __check_eq, __check_ne, __check_cs, __check_cc, + __check_mi, __check_pl, __check_vs, __check_vc, + __check_hi, __check_ls, __check_ge, __check_lt, + __check_gt, __check_le, __check_al, __check_al +}; + +int show_unhandled_signals = 0; + +static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) { - dump_backtrace(NULL, tsk); - barrier(); + unsigned long addr = instruction_pointer(regs); + char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; + int i; + + if (user_mode(regs)) + return; + + for (i = -4; i < 1; i++) { + unsigned int val, bad; + + bad = aarch64_insn_read(&((u32 *)addr)[i], &val); + + if (!bad) + p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); + else + p += sprintf(p, i == 0 ? "(????????) " : "???????? "); + } + + printk("%sCode: %s\n", lvl, str); } #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" +#elif defined(CONFIG_PREEMPT_RT) +#define S_PREEMPT " PREEMPT_RT" #else #define S_PREEMPT "" #endif + #define S_SMP " SMP" -static int __die(const char *str, int err, struct pt_regs *regs) +static int __die(const char *str, long err, struct pt_regs *regs) { static int die_counter; int ret; - pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n", + pr_emerg("Internal error: %s: %016lx [#%d]" S_PREEMPT S_SMP "\n", str, err, ++die_counter); /* trap and error numbers are mostly meaningless on ARM */ @@ -175,7 +208,7 @@ static DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. */ -void die(const char *str, struct pt_regs *regs, int err) +void die(const char *str, struct pt_regs *regs, long err) { int ret; unsigned long flags; @@ -196,14 +229,14 @@ void die(const char *str, struct pt_regs *regs, int err) oops_exit(); if (in_interrupt()) - panic("Fatal exception in interrupt"); + panic("%s: Fatal exception in interrupt", str); if (panic_on_oops) - panic("Fatal exception"); + panic("%s: Fatal exception", str); raw_spin_unlock_irqrestore(&die_lock, flags); if (ret != NOTIFY_STOP) - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } static void arm64_show_signal(int signo, const char *str) @@ -211,7 +244,7 @@ static void arm64_show_signal(int signo, const char *str) static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct task_struct *tsk = current; - unsigned int esr = tsk->thread.fault_code; + unsigned long esr = tsk->thread.fault_code; struct pt_regs *regs = task_pt_regs(tsk); /* Leave if the signal won't be shown */ @@ -222,7 +255,7 @@ static void arm64_show_signal(int signo, const char *str) pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk)); if (esr) - pr_cont("%s, ESR 0x%08x, ", esr_get_class_string(esr), esr); + pr_cont("%s, ESR 0x%016lx, ", esr_get_class_string(esr), esr); pr_cont("%s", str); print_vma_addr(KERN_CONT " in ", regs->pc); @@ -230,102 +263,133 @@ static void arm64_show_signal(int signo, const char *str) __show_regs(regs); } -void arm64_force_sig_fault(int signo, int code, void __user *addr, +void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str) { arm64_show_signal(signo, str); if (signo == SIGKILL) force_sig(SIGKILL); else - force_sig_fault(signo, code, addr); + force_sig_fault(signo, code, (void __user *)far); } -void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, +void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { arm64_show_signal(SIGBUS, str); - force_sig_mceerr(code, addr, lsb); + force_sig_mceerr(code, (void __user *)far, lsb); } -void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, +void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str) { arm64_show_signal(SIGTRAP, str); - force_sig_ptrace_errno_trap(errno, addr); + force_sig_ptrace_errno_trap(errno, (void __user *)far); } void arm64_notify_die(const char *str, struct pt_regs *regs, - int signo, int sicode, void __user *addr, - int err) + int signo, int sicode, unsigned long far, + unsigned long err) { if (user_mode(regs)) { WARN_ON(regs != current_pt_regs()); current->thread.fault_address = 0; current->thread.fault_code = err; - arm64_force_sig_fault(signo, sicode, addr, str); + arm64_force_sig_fault(signo, sicode, far, str); } else { die(str, regs, err); } } -void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) +#ifdef CONFIG_COMPAT +#define PSTATE_IT_1_0_SHIFT 25 +#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) +#define PSTATE_IT_7_2_SHIFT 10 +#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) + +static u32 compat_get_it_state(struct pt_regs *regs) { - regs->pc += size; + u32 it, pstate = regs->pstate; - /* - * If we were single stepping, we want to get the step exception after - * we return from the trap. - */ - if (user_mode(regs)) - user_fastforward_single_step(current); + it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; + it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; + + return it; } -static LIST_HEAD(undef_hook); -static DEFINE_RAW_SPINLOCK(undef_lock); +static void compat_set_it_state(struct pt_regs *regs, u32 it) +{ + u32 pstate_it; + + pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; + pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; + + regs->pstate &= ~PSR_AA32_IT_MASK; + regs->pstate |= pstate_it; +} -void register_undef_hook(struct undef_hook *hook) +static void advance_itstate(struct pt_regs *regs) { - unsigned long flags; + u32 it; + + /* ARM mode */ + if (!(regs->pstate & PSR_AA32_T_BIT) || + !(regs->pstate & PSR_AA32_IT_MASK)) + return; + + it = compat_get_it_state(regs); - raw_spin_lock_irqsave(&undef_lock, flags); - list_add(&hook->node, &undef_hook); - raw_spin_unlock_irqrestore(&undef_lock, flags); + /* + * If this is the last instruction of the block, wipe the IT + * state. Otherwise advance it. + */ + if (!(it & 7)) + it = 0; + else + it = (it & 0xe0) | ((it << 1) & 0x1f); + + compat_set_it_state(regs, it); } +#else +static void advance_itstate(struct pt_regs *regs) +{ +} +#endif -void unregister_undef_hook(struct undef_hook *hook) +void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) { - unsigned long flags; + regs->pc += size; + + /* + * If we were single stepping, we want to get the step exception after + * we return from the trap. + */ + if (user_mode(regs)) + user_fastforward_single_step(current); - raw_spin_lock_irqsave(&undef_lock, flags); - list_del(&hook->node); - raw_spin_unlock_irqrestore(&undef_lock, flags); + if (compat_user_mode(regs)) + advance_itstate(regs); + else + regs->pstate &= ~PSR_BTYPE_MASK; } -static int call_undef_hook(struct pt_regs *regs) +static int user_insn_read(struct pt_regs *regs, u32 *insnp) { - struct undef_hook *hook; - unsigned long flags; u32 instr; - int (*fn)(struct pt_regs *regs, u32 instr) = NULL; - void __user *pc = (void __user *)instruction_pointer(regs); + unsigned long pc = instruction_pointer(regs); - if (!user_mode(regs)) { - __le32 instr_le; - if (probe_kernel_address((__force __le32 *)pc, instr_le)) - goto exit; - instr = le32_to_cpu(instr_le); - } else if (compat_thumb_mode(regs)) { + if (compat_thumb_mode(regs)) { /* 16-bit Thumb instruction */ __le16 instr_le; if (get_user(instr_le, (__le16 __user *)pc)) - goto exit; + return -EFAULT; instr = le16_to_cpu(instr_le); if (aarch32_insn_is_wide(instr)) { u32 instr2; if (get_user(instr_le, (__le16 __user *)(pc + 2))) - goto exit; + return -EFAULT; instr2 = le16_to_cpu(instr_le); instr = (instr << 16) | instr2; } @@ -333,22 +397,15 @@ static int call_undef_hook(struct pt_regs *regs) /* 32-bit ARM instruction */ __le32 instr_le; if (get_user(instr_le, (__le32 __user *)pc)) - goto exit; + return -EFAULT; instr = le32_to_cpu(instr_le); } - raw_spin_lock_irqsave(&undef_lock, flags); - list_for_each_entry(hook, &undef_hook, node) - if ((instr & hook->instr_mask) == hook->instr_val && - (regs->pstate & hook->pstate_mask) == hook->pstate_val) - fn = hook->fn; - - raw_spin_unlock_irqrestore(&undef_lock, flags); -exit: - return fn ? fn(regs, instr) : 1; + *insnp = instr; + return 0; } -void force_signal_inject(int signal, int code, unsigned long address) +void force_signal_inject(int signal, int code, unsigned long address, unsigned long err) { const char *desc; struct pt_regs *regs = current_pt_regs(); @@ -374,7 +431,7 @@ void force_signal_inject(int signal, int code, unsigned long address) signal = SIGKILL; } - arm64_notify_die(desc, regs, signal, code, (void __user *)address, 0); + arm64_notify_die(desc, regs, signal, code, address, err); } /* @@ -384,32 +441,92 @@ void arm64_notify_segfault(unsigned long addr) { int code; - down_read(¤t->mm->mmap_sem); - if (find_vma(current->mm, addr) == NULL) + mmap_read_lock(current->mm); + if (find_vma(current->mm, untagged_addr(addr)) == NULL) code = SEGV_MAPERR; else code = SEGV_ACCERR; - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); - force_signal_inject(SIGSEGV, code, addr); + force_signal_inject(SIGSEGV, code, addr, 0); } -void do_undefinstr(struct pt_regs *regs) +void do_el0_undef(struct pt_regs *regs, unsigned long esr) { + u32 insn; + /* check for AArch32 breakpoint instructions */ if (!aarch32_break_handler(regs)) return; - if (call_undef_hook(regs) == 0) + if (user_insn_read(regs, &insn)) + goto out_err; + + if (try_emulate_mrs(regs, insn)) + return; + + if (try_emulate_armv8_deprecated(regs, insn)) return; - BUG_ON(!user_mode(regs)); - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); +out_err: + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); +} + +void do_el1_undef(struct pt_regs *regs, unsigned long esr) +{ + u32 insn; + + if (aarch64_insn_read((void *)regs->pc, &insn)) + goto out_err; + + if (try_emulate_el1_ssbs(regs, insn)) + return; + +out_err: + die("Oops - Undefined instruction", regs, esr); +} + +void do_el0_bti(struct pt_regs *regs) +{ + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); +} + +void do_el1_bti(struct pt_regs *regs, unsigned long esr) +{ + if (efi_runtime_fixup_exception(regs, "BTI violation")) { + regs->pstate &= ~PSR_BTYPE_MASK; + return; + } + die("Oops - BTI", regs, esr); +} + +void do_el0_fpac(struct pt_regs *regs, unsigned long esr) +{ + force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); +} + +void do_el1_fpac(struct pt_regs *regs, unsigned long esr) +{ + /* + * Unexpected FPAC exception in the kernel: kill the task before it + * does any more harm. + */ + die("Oops - FPAC", regs, esr); +} + +void do_el0_mops(struct pt_regs *regs, unsigned long esr) +{ + arm64_mops_reset_regs(®s->user_regs, esr); + + /* + * If single stepping then finish the step before executing the + * prologue instruction. + */ + user_fastforward_single_step(current); } -NOKPROBE_SYMBOL(do_undefinstr); #define __user_cache_maint(insn, address, res) \ - if (address >= user_addr_max()) { \ + if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ } else { \ uaccess_ttbr0_enable(); \ @@ -417,25 +534,21 @@ NOKPROBE_SYMBOL(do_undefinstr); "1: " insn ", %1\n" \ " mov %w0, #0\n" \ "2:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "3: mov %w0, %w2\n" \ - " b 2b\n" \ - " .popsection\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) \ : "=r" (res) \ - : "r" (address), "i" (-EFAULT)); \ + : "r" (address)); \ uaccess_ttbr0_disable(); \ } -static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) +static void user_cache_maint_handler(unsigned long esr, struct pt_regs *regs) { - unsigned long address; + unsigned long tagged_address, address; int rt = ESR_ELx_SYS64_ISS_RT(esr); int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; int ret = 0; - address = untagged_addr(pt_regs_read_reg(regs, rt)); + tagged_address = pt_regs_read_reg(regs, rt); + address = untagged_addr(tagged_address); switch (crm) { case ESR_ELx_SYS64_ISS_CRM_DC_CVAU: /* DC CVAU, gets promoted */ @@ -457,28 +570,28 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) __user_cache_maint("ic ivau", address, ret); break; default: - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } if (ret) - arm64_notify_segfault(address); + arm64_notify_segfault(tagged_address); else arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void ctr_read_handler(unsigned int esr, struct pt_regs *regs) +static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); unsigned long val = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0); - if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) { /* Hide DIC so that we can trap the unnecessary maintenance...*/ - val &= ~BIT(CTR_DIC_SHIFT); + val &= ~BIT(CTR_EL0_DIC_SHIFT); /* ... and fake IminLine to reduce the number of traps. */ - val &= ~CTR_IMINLINE_MASK; - val |= (PAGE_SHIFT - 2) & CTR_IMINLINE_MASK; + val &= ~CTR_EL0_IminLine_MASK; + val |= (PAGE_SHIFT - 2) & CTR_EL0_IminLine_MASK; } pt_regs_write_reg(regs, rt, val); @@ -486,7 +599,7 @@ static void ctr_read_handler(unsigned int esr, struct pt_regs *regs) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs) +static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); @@ -494,7 +607,7 @@ static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) +static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); @@ -502,7 +615,7 @@ static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void mrs_handler(unsigned int esr, struct pt_regs *regs) +static void mrs_handler(unsigned long esr, struct pt_regs *regs) { u32 sysreg, rt; @@ -510,18 +623,18 @@ static void mrs_handler(unsigned int esr, struct pt_regs *regs) sysreg = esr_sys64_to_sysreg(esr); if (do_emulate_mrs(regs, sysreg, rt) != 0) - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } -static void wfi_handler(unsigned int esr, struct pt_regs *regs) +static void wfi_handler(unsigned long esr, struct pt_regs *regs) { arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } struct sys64_hook { - unsigned int esr_mask; - unsigned int esr_val; - void (*handler)(unsigned int esr, struct pt_regs *regs); + unsigned long esr_mask; + unsigned long esr_val; + void (*handler)(unsigned long esr, struct pt_regs *regs); }; static const struct sys64_hook sys64_hooks[] = { @@ -543,6 +656,12 @@ static const struct sys64_hook sys64_hooks[] = { .handler = cntvct_read_handler, }, { + /* Trap read access to CNTVCTSS_EL0 */ + .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, + .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCTSS, + .handler = cntvct_read_handler, + }, + { /* Trap read access to CNTFRQ_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTFRQ, @@ -563,35 +682,8 @@ static const struct sys64_hook sys64_hooks[] = { {}, }; - #ifdef CONFIG_COMPAT -#define PSTATE_IT_1_0_SHIFT 25 -#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) -#define PSTATE_IT_7_2_SHIFT 10 -#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) - -static u32 compat_get_it_state(struct pt_regs *regs) -{ - u32 it, pstate = regs->pstate; - - it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; - it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; - - return it; -} - -static void compat_set_it_state(struct pt_regs *regs, u32 it) -{ - u32 pstate_it; - - pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; - pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; - - regs->pstate &= ~PSR_AA32_IT_MASK; - regs->pstate |= pstate_it; -} - -static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs) +static bool cp15_cond_valid(unsigned long esr, struct pt_regs *regs) { int cond; @@ -611,42 +703,12 @@ static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs) return aarch32_opcode_cond_checks[cond](regs->pstate); } -static void advance_itstate(struct pt_regs *regs) -{ - u32 it; - - /* ARM mode */ - if (!(regs->pstate & PSR_AA32_T_BIT) || - !(regs->pstate & PSR_AA32_IT_MASK)) - return; - - it = compat_get_it_state(regs); - - /* - * If this is the last instruction of the block, wipe the IT - * state. Otherwise advance it. - */ - if (!(it & 7)) - it = 0; - else - it = (it & 0xe0) | ((it << 1) & 0x1f); - - compat_set_it_state(regs, it); -} - -static void arm64_compat_skip_faulting_instruction(struct pt_regs *regs, - unsigned int sz) -{ - advance_itstate(regs); - arm64_skip_faulting_instruction(regs, sz); -} - -static void compat_cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) +static void compat_cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { int reg = (esr & ESR_ELx_CP15_32_ISS_RT_MASK) >> ESR_ELx_CP15_32_ISS_RT_SHIFT; pt_regs_write_reg(regs, reg, arch_timer_get_rate()); - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); } static const struct sys64_hook cp15_32_hooks[] = { @@ -658,7 +720,7 @@ static const struct sys64_hook cp15_32_hooks[] = { {}, }; -static void compat_cntvct_read_handler(unsigned int esr, struct pt_regs *regs) +static void compat_cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = (esr & ESR_ELx_CP15_64_ISS_RT_MASK) >> ESR_ELx_CP15_64_ISS_RT_SHIFT; int rt2 = (esr & ESR_ELx_CP15_64_ISS_RT2_MASK) >> ESR_ELx_CP15_64_ISS_RT2_SHIFT; @@ -666,7 +728,7 @@ static void compat_cntvct_read_handler(unsigned int esr, struct pt_regs *regs) pt_regs_write_reg(regs, rt, lower_32_bits(val)); pt_regs_write_reg(regs, rt2, upper_32_bits(val)); - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); } static const struct sys64_hook cp15_64_hooks[] = { @@ -675,10 +737,15 @@ static const struct sys64_hook cp15_64_hooks[] = { .esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCT, .handler = compat_cntvct_read_handler, }, + { + .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK, + .esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCTSS, + .handler = compat_cntvct_read_handler, + }, {}, }; -void do_cp15instr(unsigned int esr, struct pt_regs *regs) +void do_el0_cp15(unsigned long esr, struct pt_regs *regs) { const struct sys64_hook *hook, *hook_base; @@ -687,7 +754,7 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs) * There is no T16 variant of a CP access, so we * always advance PC by 4 bytes. */ - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); return; } @@ -699,7 +766,7 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs) hook_base = cp15_64_hooks; break; default: - do_undefinstr(regs); + do_el0_undef(regs, esr); return; } @@ -714,12 +781,11 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs) * EL0. Fall back to our usual undefined instruction handler * so that we handle these consistently. */ - do_undefinstr(regs); + do_el0_undef(regs, esr); } -NOKPROBE_SYMBOL(do_cp15instr); #endif -void do_sysinstr(unsigned int esr, struct pt_regs *regs) +void do_el0_sys(unsigned long esr, struct pt_regs *regs) { const struct sys64_hook *hook; @@ -734,9 +800,8 @@ void do_sysinstr(unsigned int esr, struct pt_regs *regs) * back to our usual undefined instruction handler so that we handle * these consistently. */ - do_undefinstr(regs); + do_el0_undef(regs, esr); } -NOKPROBE_SYMBOL(do_sysinstr); static const char *esr_class_str[] = { [0 ... ESR_ELx_EC_MAX] = "UNRECOGNIZED EC", @@ -750,6 +815,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS", [ESR_ELx_EC_PAC] = "PAC", [ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC", + [ESR_ELx_EC_BTI] = "BTI", [ESR_ELx_EC_ILL] = "PSTATE.IL", [ESR_ELx_EC_SVC32] = "SVC (AArch32)", [ESR_ELx_EC_HVC32] = "HVC (AArch32)", @@ -760,6 +826,8 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_SYS64] = "MSR/MRS (AArch64)", [ESR_ELx_EC_SVE] = "SVE", [ESR_ELx_EC_ERET] = "ERET/ERETAA/ERETAB", + [ESR_ELx_EC_FPAC] = "FPAC", + [ESR_ELx_EC_SME] = "SME", [ESR_ELx_EC_IMP_DEF] = "EL3 IMP DEF", [ESR_ELx_EC_IABT_LOW] = "IABT (lower EL)", [ESR_ELx_EC_IABT_CUR] = "IABT (current EL)", @@ -767,6 +835,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_DABT_LOW] = "DABT (lower EL)", [ESR_ELx_EC_DABT_CUR] = "DABT (current EL)", [ESR_ELx_EC_SP_ALIGN] = "SP Alignment", + [ESR_ELx_EC_MOPS] = "MOPS", [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)", [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)", [ESR_ELx_EC_SERROR] = "SError", @@ -781,34 +850,18 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_BRK64] = "BRK (AArch64)", }; -const char *esr_get_class_string(u32 esr) +const char *esr_get_class_string(unsigned long esr) { return esr_class_str[ESR_ELx_EC(esr)]; } /* - * bad_mode handles the impossible case in the exception vector. This is always - * fatal. - */ -asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr) -{ - console_verbose(); - - pr_crit("Bad mode in %s handler detected on CPU%d, code 0x%08x -- %s\n", - handler[reason], smp_processor_id(), esr, - esr_get_class_string(esr)); - - local_daif_mask(); - panic("bad mode"); -} - -/* * bad_el0_sync handles unexpected, but potentially recoverable synchronous - * exceptions taken from EL0. Unlike bad_mode, this returns. + * exceptions taken from EL0. */ -void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) +void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr) { - void __user *pc = (void __user *)instruction_pointer(regs); + unsigned long pc = instruction_pointer(regs); current->thread.fault_address = 0; current->thread.fault_code = esr; @@ -822,24 +875,22 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); -asmlinkage void handle_bad_stack(struct pt_regs *regs) +void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far) { unsigned long tsk_stk = (unsigned long)current->stack; unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); - unsigned int esr = read_sysreg(esr_el1); - unsigned long far = read_sysreg(far_el1); console_verbose(); pr_emerg("Insufficient stack space to handle exception!"); - pr_emerg("ESR: 0x%08x -- %s\n", esr, esr_get_class_string(esr)); + pr_emerg("ESR: 0x%016lx -- %s\n", esr, esr_get_class_string(esr)); pr_emerg("FAR: 0x%016lx\n", far); pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", tsk_stk, tsk_stk + THREAD_SIZE); pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", - irq_stk, irq_stk + THREAD_SIZE); + irq_stk, irq_stk + IRQ_STACK_SIZE); pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); @@ -854,11 +905,11 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs) } #endif -void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr) +void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr) { console_verbose(); - pr_crit("SError Interrupt on CPU%d, code 0x%08x -- %s\n", + pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n", smp_processor_id(), esr, esr_get_class_string(esr)); if (regs) __show_regs(regs); @@ -866,12 +917,11 @@ void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr) nmi_panic(regs, "Asynchronous SError Interrupt"); cpu_park_loop(); - unreachable(); } -bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) +bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr) { - u32 aet = arm64_ras_serror_get_severity(esr); + unsigned long aet = arm64_ras_serror_get_severity(esr); switch (aet) { case ESR_ELx_AET_CE: /* corrected error */ @@ -901,50 +951,15 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) } } -asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) +void do_serror(struct pt_regs *regs, unsigned long esr) { - const bool was_in_nmi = in_nmi(); - - if (!was_in_nmi) - nmi_enter(); - /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) arm64_serror_panic(regs, esr); - - if (!was_in_nmi) - nmi_exit(); -} - -asmlinkage void enter_from_user_mode(void) -{ - CT_WARN_ON(ct_state() != CONTEXT_USER); - user_exit_irqoff(); -} -NOKPROBE_SYMBOL(enter_from_user_mode); - -void __pte_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pte %016lx.\n", file, line, val); -} - -void __pmd_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pmd %016lx.\n", file, line, val); -} - -void __pud_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pud %016lx.\n", file, line, val); -} - -void __pgd_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pgd %016lx.\n", file, line, val); } /* GENERIC_BUG traps */ - +#ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { /* @@ -956,12 +971,13 @@ int is_valid_bugaddr(unsigned long addr) */ return 1; } +#endif -static int bug_handler(struct pt_regs *regs, unsigned int esr) +static int bug_handler(struct pt_regs *regs, unsigned long esr) { switch (report_bug(regs->pc, regs)) { case BUG_TRAP_TYPE_BUG: - die("Oops - BUG", regs, 0); + die("Oops - BUG", regs, esr); break; case BUG_TRAP_TYPE_WARN: @@ -982,6 +998,53 @@ static struct break_hook bug_break_hook = { .imm = BUG_BRK_IMM, }; +#ifdef CONFIG_CFI_CLANG +static int cfi_handler(struct pt_regs *regs, unsigned long esr) +{ + unsigned long target; + u32 type; + + target = pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TARGET, esr)); + type = (u32)pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TYPE, esr)); + + switch (report_cfi_failure(regs, regs->pc, &target, type)) { + case BUG_TRAP_TYPE_BUG: + die("Oops - CFI", regs, esr); + break; + + case BUG_TRAP_TYPE_WARN: + break; + + default: + return DBG_HOOK_ERROR; + } + + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + return DBG_HOOK_HANDLED; +} + +static struct break_hook cfi_break_hook = { + .fn = cfi_handler, + .imm = CFI_BRK_IMM_BASE, + .mask = CFI_BRK_IMM_MASK, +}; +#endif /* CONFIG_CFI_CLANG */ + +static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr) +{ + pr_err("%s generated an invalid instruction at %pS!\n", + "Kernel text patching", + (void *)instruction_pointer(regs)); + + /* We cannot handle this */ + return DBG_HOOK_ERROR; +} + +static struct break_hook fault_break_hook = { + .fn = reserved_fault_handler, + .imm = FAULT_BRK_IMM, +}; + #ifdef CONFIG_KASAN_SW_TAGS #define KASAN_ESR_RECOVER 0x20 @@ -989,12 +1052,12 @@ static struct break_hook bug_break_hook = { #define KASAN_ESR_SIZE_MASK 0x0f #define KASAN_ESR_SIZE(esr) (1 << ((esr) & KASAN_ESR_SIZE_MASK)) -static int kasan_handler(struct pt_regs *regs, unsigned int esr) +static int kasan_handler(struct pt_regs *regs, unsigned long esr) { bool recover = esr & KASAN_ESR_RECOVER; bool write = esr & KASAN_ESR_WRITE; size_t size = KASAN_ESR_SIZE(esr); - u64 addr = regs->regs[0]; + void *addr = (void *)regs->regs[0]; u64 pc = regs->pc; kasan_report(addr, size, write, pc); @@ -1014,7 +1077,7 @@ static int kasan_handler(struct pt_regs *regs, unsigned int esr) * This is something that might be fixed at some point in the future. */ if (!recover) - die("Oops - KASAN", regs, 0); + die("Oops - KASAN", regs, esr); /* If thread survives, skip over the brk instruction and continue: */ arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); @@ -1028,27 +1091,56 @@ static struct break_hook kasan_break_hook = { }; #endif +#ifdef CONFIG_UBSAN_TRAP +static int ubsan_handler(struct pt_regs *regs, unsigned long esr) +{ + die(report_ubsan_failure(regs, esr & UBSAN_BRK_MASK), regs, esr); + return DBG_HOOK_HANDLED; +} + +static struct break_hook ubsan_break_hook = { + .fn = ubsan_handler, + .imm = UBSAN_BRK_IMM, + .mask = UBSAN_BRK_MASK, +}; +#endif + +#define esr_comment(esr) ((esr) & ESR_ELx_BRK64_ISS_COMMENT_MASK) + /* * Initial handler for AArch64 BRK exceptions * This handler only used until debug_traps_init(). */ -int __init early_brk64(unsigned long addr, unsigned int esr, +int __init early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs) { +#ifdef CONFIG_CFI_CLANG + if ((esr_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE) + return cfi_handler(regs, esr) != DBG_HOOK_HANDLED; +#endif #ifdef CONFIG_KASAN_SW_TAGS - unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; - - if ((comment & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) + if ((esr_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif +#ifdef CONFIG_UBSAN_TRAP + if ((esr_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM) + return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED; +#endif return bug_handler(regs, esr) != DBG_HOOK_HANDLED; } -/* This registration must happen early, before debug_traps_init(). */ void __init trap_init(void) { register_kernel_break_hook(&bug_break_hook); +#ifdef CONFIG_CFI_CLANG + register_kernel_break_hook(&cfi_break_hook); +#endif + register_kernel_break_hook(&fault_break_hook); #ifdef CONFIG_KASAN_SW_TAGS register_kernel_break_hook(&kasan_break_hook); #endif +#ifdef CONFIG_UBSAN_TRAP + register_kernel_break_hook(&ubsan_break_hook); +#endif + debug_traps_init(); } diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso-wrap.S index d1414fee5274..c4b1990bf2be 100644 --- a/arch/arm64/kernel/vdso/vdso.S +++ b/arch/arm64/kernel/vdso-wrap.S @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/linkage.h> #include <linux/const.h> +#include <asm/assembler.h> #include <asm/page.h> .globl vdso_start, vdso_end @@ -19,3 +20,5 @@ vdso_start: vdso_end: .previous + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 354b11e27c07..5562daf38a22 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/signal.h> #include <linux/slab.h> +#include <linux/time_namespace.h> #include <linux/timekeeper_internal.h> #include <linux/vmalloc.h> #include <vdso/datapage.h> @@ -28,25 +29,18 @@ #include <asm/signal32.h> #include <asm/vdso.h> -extern char vdso_start[], vdso_end[]; -#ifdef CONFIG_COMPAT_VDSO -extern char vdso32_start[], vdso32_end[]; -#endif /* CONFIG_COMPAT_VDSO */ +enum vdso_abi { + VDSO_ABI_AA64, + VDSO_ABI_AA32, +}; -/* vdso_lookup arch_index */ -enum arch_vdso_type { - ARM64_VDSO = 0, -#ifdef CONFIG_COMPAT_VDSO - ARM64_VDSO32 = 1, -#endif /* CONFIG_COMPAT_VDSO */ +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, + VVAR_NR_PAGES, }; -#ifdef CONFIG_COMPAT_VDSO -#define VDSO_TYPES (ARM64_VDSO32 + 1) -#else -#define VDSO_TYPES (ARM64_VDSO + 1) -#endif /* CONFIG_COMPAT_VDSO */ -struct __vdso_abi { +struct vdso_abi_info { const char *name; const char *vdso_code_start; const char *vdso_code_end; @@ -57,14 +51,14 @@ struct __vdso_abi { struct vm_special_mapping *cm; }; -static struct __vdso_abi vdso_lookup[VDSO_TYPES] __ro_after_init = { - { +static struct vdso_abi_info vdso_info[] __ro_after_init = { + [VDSO_ABI_AA64] = { .name = "vdso", .vdso_code_start = vdso_start, .vdso_code_end = vdso_end, }, #ifdef CONFIG_COMPAT_VDSO - { + [VDSO_ABI_AA32] = { .name = "vdso32", .vdso_code_start = vdso32_start, .vdso_code_end = vdso32_end, @@ -81,72 +75,130 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static int __vdso_remap(enum arch_vdso_type arch_index, - const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) { - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - unsigned long vdso_size = vdso_lookup[arch_index].vdso_code_end - - vdso_lookup[arch_index].vdso_code_start; - - if (vdso_size != new_size) - return -EINVAL; - current->mm->context.vdso = (void *)new_vma->vm_start; return 0; } -static int __vdso_init(enum arch_vdso_type arch_index) +static int __init __vdso_init(enum vdso_abi abi) { int i; struct page **vdso_pagelist; unsigned long pfn; - if (memcmp(vdso_lookup[arch_index].vdso_code_start, "\177ELF", 4)) { + if (memcmp(vdso_info[abi].vdso_code_start, "\177ELF", 4)) { pr_err("vDSO is not a valid ELF object!\n"); return -EINVAL; } - vdso_lookup[arch_index].vdso_pages = ( - vdso_lookup[arch_index].vdso_code_end - - vdso_lookup[arch_index].vdso_code_start) >> + vdso_info[abi].vdso_pages = ( + vdso_info[abi].vdso_code_end - + vdso_info[abi].vdso_code_start) >> PAGE_SHIFT; - /* Allocate the vDSO pagelist, plus a page for the data. */ - vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages + 1, + vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages, sizeof(struct page *), GFP_KERNEL); if (vdso_pagelist == NULL) return -ENOMEM; - /* Grab the vDSO data page. */ - vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data)); + /* Grab the vDSO code pages. */ + pfn = sym_to_pfn(vdso_info[abi].vdso_code_start); + for (i = 0; i < vdso_info[abi].vdso_pages; i++) + vdso_pagelist[i] = pfn_to_page(pfn + i); - /* Grab the vDSO code pages. */ - pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start); + vdso_info[abi].cm->pages = vdso_pagelist; + + return 0; +} + +#ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return (struct vdso_data *)(vvar_page); +} + +/* + * The vvar mapping contains data for a specific time namespace, so when a task + * changes namespace we must unmap its vvar data for the old namespace. + * Subsequent faults will map in data for the new namespace. + * + * For more details see timens_setup_vdso_data(). + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (i = 0; i < vdso_lookup[arch_index].vdso_pages; i++) - vdso_pagelist[i + 1] = pfn_to_page(pfn + i); + mmap_read_lock(mm); - vdso_lookup[arch_index].dm->pages = &vdso_pagelist[0]; - vdso_lookup[arch_index].cm->pages = &vdso_pagelist[1]; + for_each_vma(vmi, vma) { + if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) + zap_vma_pages(vma); +#ifdef CONFIG_COMPAT_VDSO + if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm)) + zap_vma_pages(vma); +#endif + } + mmap_read_unlock(mm); return 0; } +#endif + +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long pfn; + + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + if (timens_page) + pfn = page_to_pfn(timens_page); + else + pfn = sym_to_pfn(vdso_data); + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = sym_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; + } -static int __setup_additional_pages(enum arch_vdso_type arch_index, + return vmf_insert_pfn(vma, vmf->address, pfn); +} + +static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, int uses_interp) { unsigned long vdso_base, vdso_text_len, vdso_mapping_len; + unsigned long gp_flags = 0; void *ret; - vdso_text_len = vdso_lookup[arch_index].vdso_pages << PAGE_SHIFT; + BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + + vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { @@ -154,18 +206,21 @@ static int __setup_additional_pages(enum arch_vdso_type arch_index, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, - VM_READ|VM_MAYREAD, - vdso_lookup[arch_index].dm); + ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, + VM_READ|VM_MAYREAD|VM_PFNMAP, + vdso_info[abi].dm); if (IS_ERR(ret)) goto up_fail; - vdso_base += PAGE_SIZE; + if (system_supports_bti_kernel()) + gp_flags = VM_ARM64_BTI; + + vdso_base += VVAR_NR_PAGES * PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, - VM_READ|VM_EXEC| + VM_READ|VM_EXEC|gp_flags| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso_lookup[arch_index].cm); + vdso_info[abi].cm); if (IS_ERR(ret)) goto up_fail; @@ -180,52 +235,42 @@ up_fail: /* * Create and map the vectors page for AArch32 tasks. */ -#ifdef CONFIG_COMPAT_VDSO -static int aarch32_vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) +enum aarch32_map { + AA32_MAP_VECTORS, /* kuser helpers */ + AA32_MAP_SIGPAGE, + AA32_MAP_VVAR, + AA32_MAP_VDSO, +}; + +static struct page *aarch32_vectors_page __ro_after_init; +static struct page *aarch32_sig_page __ro_after_init; + +static int aarch32_sigpage_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) { - return __vdso_remap(ARM64_VDSO32, sm, new_vma); + current->mm->context.sigpage = (void *)new_vma->vm_start; + + return 0; } -#endif /* CONFIG_COMPAT_VDSO */ -/* - * aarch32_vdso_pages: - * 0 - kuser helpers - * 1 - sigreturn code - * or (CONFIG_COMPAT_VDSO): - * 0 - kuser helpers - * 1 - vdso data - * 2 - vdso code - */ -#define C_VECTORS 0 -#ifdef CONFIG_COMPAT_VDSO -#define C_VVAR 1 -#define C_VDSO 2 -#define C_PAGES (C_VDSO + 1) -#else -#define C_SIGPAGE 1 -#define C_PAGES (C_SIGPAGE + 1) -#endif /* CONFIG_COMPAT_VDSO */ -static struct page *aarch32_vdso_pages[C_PAGES] __ro_after_init; -static struct vm_special_mapping aarch32_vdso_spec[C_PAGES] = { - { +static struct vm_special_mapping aarch32_vdso_maps[] = { + [AA32_MAP_VECTORS] = { .name = "[vectors]", /* ABI */ - .pages = &aarch32_vdso_pages[C_VECTORS], + .pages = &aarch32_vectors_page, }, -#ifdef CONFIG_COMPAT_VDSO - { + [AA32_MAP_SIGPAGE] = { + .name = "[sigpage]", /* ABI */ + .pages = &aarch32_sig_page, + .mremap = aarch32_sigpage_mremap, + }, + [AA32_MAP_VVAR] = { .name = "[vvar]", + .fault = vvar_fault, }, - { + [AA32_MAP_VDSO] = { .name = "[vdso]", - .mremap = aarch32_vdso_mremap, - }, -#else - { - .name = "[sigpage]", /* ABI */ - .pages = &aarch32_vdso_pages[C_SIGPAGE], + .mremap = vdso_mremap, }, -#endif /* CONFIG_COMPAT_VDSO */ }; static int aarch32_alloc_kuser_vdso_page(void) @@ -237,69 +282,59 @@ static int aarch32_alloc_kuser_vdso_page(void) if (!IS_ENABLED(CONFIG_KUSER_HELPERS)) return 0; - vdso_page = get_zeroed_page(GFP_ATOMIC); + vdso_page = get_zeroed_page(GFP_KERNEL); if (!vdso_page) return -ENOMEM; memcpy((void *)(vdso_page + 0x1000 - kuser_sz), __kuser_helper_start, kuser_sz); - aarch32_vdso_pages[C_VECTORS] = virt_to_page(vdso_page); - flush_dcache_page(aarch32_vdso_pages[C_VECTORS]); + aarch32_vectors_page = virt_to_page((void *)vdso_page); return 0; } -#ifdef CONFIG_COMPAT_VDSO -static int __aarch32_alloc_vdso_pages(void) -{ - int ret; - - vdso_lookup[ARM64_VDSO32].dm = &aarch32_vdso_spec[C_VVAR]; - vdso_lookup[ARM64_VDSO32].cm = &aarch32_vdso_spec[C_VDSO]; - - ret = __vdso_init(ARM64_VDSO32); - if (ret) - return ret; - - ret = aarch32_alloc_kuser_vdso_page(); - if (ret) { - unsigned long c_vvar = - (unsigned long)page_to_virt(aarch32_vdso_pages[C_VVAR]); - unsigned long c_vdso = - (unsigned long)page_to_virt(aarch32_vdso_pages[C_VDSO]); - - free_page(c_vvar); - free_page(c_vdso); - } - - return ret; -} -#else -static int __aarch32_alloc_vdso_pages(void) +#define COMPAT_SIGPAGE_POISON_WORD 0xe7fddef1 +static int aarch32_alloc_sigpage(void) { extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; - unsigned long sigpage; - int ret; + __le32 poison = cpu_to_le32(COMPAT_SIGPAGE_POISON_WORD); + void *sigpage; - sigpage = get_zeroed_page(GFP_ATOMIC); + sigpage = (void *)__get_free_page(GFP_KERNEL); if (!sigpage) return -ENOMEM; - memcpy((void *)sigpage, __aarch32_sigret_code_start, sigret_sz); - aarch32_vdso_pages[C_SIGPAGE] = virt_to_page(sigpage); - flush_dcache_page(aarch32_vdso_pages[C_SIGPAGE]); + memset32(sigpage, (__force u32)poison, PAGE_SIZE / sizeof(poison)); + memcpy(sigpage, __aarch32_sigret_code_start, sigret_sz); + aarch32_sig_page = virt_to_page(sigpage); + return 0; +} - ret = aarch32_alloc_kuser_vdso_page(); - if (ret) - free_page(sigpage); +static int __init __aarch32_alloc_vdso_pages(void) +{ - return ret; + if (!IS_ENABLED(CONFIG_COMPAT_VDSO)) + return 0; + + vdso_info[VDSO_ABI_AA32].dm = &aarch32_vdso_maps[AA32_MAP_VVAR]; + vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO]; + + return __vdso_init(VDSO_ABI_AA32); } -#endif /* CONFIG_COMPAT_VDSO */ static int __init aarch32_alloc_vdso_pages(void) { - return __aarch32_alloc_vdso_pages(); + int ret; + + ret = __aarch32_alloc_vdso_pages(); + if (ret) + return ret; + + ret = aarch32_alloc_sigpage(); + if (ret) + return ret; + + return aarch32_alloc_kuser_vdso_page(); } arch_initcall(aarch32_alloc_vdso_pages); @@ -317,12 +352,11 @@ static int aarch32_kuser_helpers_setup(struct mm_struct *mm) ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC, - &aarch32_vdso_spec[C_VECTORS]); + &aarch32_vdso_maps[AA32_MAP_VECTORS]); return PTR_ERR_OR_ZERO(ret); } -#ifndef CONFIG_COMPAT_VDSO static int aarch32_sigreturn_setup(struct mm_struct *mm) { unsigned long addr; @@ -341,63 +375,53 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) ret = _install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - &aarch32_vdso_spec[C_SIGPAGE]); + &aarch32_vdso_maps[AA32_MAP_SIGPAGE]); if (IS_ERR(ret)) goto out; - mm->context.vdso = (void *)addr; + mm->context.sigpage = (void *)addr; out: return PTR_ERR_OR_ZERO(ret); } -#endif /* !CONFIG_COMPAT_VDSO */ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; int ret; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; ret = aarch32_kuser_helpers_setup(mm); if (ret) goto out; -#ifdef CONFIG_COMPAT_VDSO - ret = __setup_additional_pages(ARM64_VDSO32, - mm, - bprm, - uses_interp); -#else - ret = aarch32_sigreturn_setup(mm); -#endif /* CONFIG_COMPAT_VDSO */ + if (IS_ENABLED(CONFIG_COMPAT_VDSO)) { + ret = __setup_additional_pages(VDSO_ABI_AA32, mm, bprm, + uses_interp); + if (ret) + goto out; + } + ret = aarch32_sigreturn_setup(mm); out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } #endif /* CONFIG_COMPAT */ -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - return __vdso_remap(ARM64_VDSO, sm, new_vma); -} +enum aarch64_map { + AA64_MAP_VVAR, + AA64_MAP_VDSO, +}; -/* - * aarch64_vdso_pages: - * 0 - vvar - * 1 - vdso - */ -#define A_VVAR 0 -#define A_VDSO 1 -#define A_PAGES (A_VDSO + 1) -static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = { - { +static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { + [AA64_MAP_VVAR] = { .name = "[vvar]", + .fault = vvar_fault, }, - { + [AA64_MAP_VDSO] = { .name = "[vdso]", .mremap = vdso_mremap, }, @@ -405,28 +429,23 @@ static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = { static int __init vdso_init(void) { - vdso_lookup[ARM64_VDSO].dm = &vdso_spec[A_VVAR]; - vdso_lookup[ARM64_VDSO].cm = &vdso_spec[A_VDSO]; + vdso_info[VDSO_ABI_AA64].dm = &aarch64_vdso_maps[AA64_MAP_VVAR]; + vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_maps[AA64_MAP_VDSO]; - return __vdso_init(ARM64_VDSO); + return __vdso_init(VDSO_ABI_AA64); } arch_initcall(vdso_init); -int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; int ret; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __setup_additional_pages(ARM64_VDSO, - mm, - bprm, - uses_interp); - - up_write(&mm->mmap_sem); + ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + mmap_write_unlock(mm); return ret; } diff --git a/arch/arm64/kernel/vdso/.gitignore b/arch/arm64/kernel/vdso/.gitignore index f8b69d84238e..652e31d82582 100644 --- a/arch/arm64/kernel/vdso/.gitignore +++ b/arch/arm64/kernel/vdso/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vdso.lds diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index dd2514bb1511..8818287f1095 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -6,9 +6,7 @@ # Heavily based on the vDSO Makefiles for other archs. # -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_AARCH64_JUMP_SLOT|R_AARCH64_GLOB_DAT|R_AARCH64_ABS64 +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile obj-vdso := vgettimeofday.o note.o sigreturn.o @@ -17,44 +15,49 @@ obj-vdso := vgettimeofday.o note.o sigreturn.o targets := $(obj-vdso) vdso.so vdso.so.dbg obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) -ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \ - --build-id -n -T +btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti -ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 -ccflags-y += -DDISABLE_BRANCH_PROFILING +# -Bsymbolic has been added for consistency with arm, the compat vDSO and +# potential future proofing if we end up with internal calls to the exported +# routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so +# preparation in build-time C")). +ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \ + -Bsymbolic --build-id=sha1 -n $(btildflags-y) + +ifdef CONFIG_LD_ORPHAN_WARN + ldflags-y += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) +endif -VDSO_LDFLAGS := -Bsymbolic +ldflags-y += -T -CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os -KBUILD_CFLAGS += $(DISABLE_LTO) +ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 +ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + +# -Wmissing-prototypes and -Wmissing-declarations are removed from +# the CFLAGS of vgettimeofday.c to make possible to build the +# kernel with CONFIG_WERROR enabled. +CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ + $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \ + $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ + -Wmissing-prototypes -Wmissing-declarations KASAN_SANITIZE := n +KCSAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y KCOV_INSTRUMENT := n -CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny +CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny -fasynchronous-unwind-tables ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y) endif -# Clang versions less than 8 do not support -mcmodel=tiny -ifeq ($(CONFIG_CC_IS_CLANG), y) - ifeq ($(shell test $(CONFIG_CLANG_VERSION) -lt 80000; echo $$?),0) - CFLAGS_REMOVE_vgettimeofday.o += -mcmodel=tiny - endif -endif - # Disable gcov profiling for VDSO code GCOV_PROFILE := n -obj-y += vdso.o -extra-y += vdso.lds +targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) -# Force dependency (incbin is bad) -$(obj)/vdso.o : $(obj)/vdso.so - # Link rule for the .so file, .lds has to be first $(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold_and_vdso_check) @@ -75,13 +78,3 @@ include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE # Actual build commands quiet_cmd_vdsold_and_vdso_check = LD $@ cmd_vdsold_and_vdso_check = $(cmd_ld); $(cmd_vdso_check) - -# Install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso.so: $(obj)/vdso.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso.so diff --git a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh index 0664acaf61ff..0387209d65b1 100755 --- a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh +++ b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh @@ -8,9 +8,9 @@ # Doing this inside the Makefile will break the $(filter-out) function, # causing Kbuild to rebuild the vdso-offsets header file every time. # -# Author: Will Deacon <will.deacon@arm.com +# Author: Will Deacon <will.deacon@arm.com> # LC_ALL=C sed -n -e 's/^00*/0/' -e \ -'s/^\([0-9a-fA-F]*\) . VDSO_\([a-zA-Z0-9_]*\)$/\#define vdso_offset_\2\t0x\1/p' +'s/^\([0-9a-fA-F]*\) . VDSO_\([a-zA-Z0-9_]*\)$/\#define vdso_offset_\2 0x\1/p' diff --git a/arch/arm64/kernel/vdso/note.S b/arch/arm64/kernel/vdso/note.S index 0ce6ec75a525..3d4e82290c80 100644 --- a/arch/arm64/kernel/vdso/note.S +++ b/arch/arm64/kernel/vdso/note.S @@ -12,9 +12,12 @@ #include <linux/version.h> #include <linux/elfnote.h> #include <linux/build-salt.h> +#include <asm/assembler.h> ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END BUILD_SALT + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/sigreturn.S b/arch/arm64/kernel/vdso/sigreturn.S index 0723aa398d6e..0e18729abc3b 100644 --- a/arch/arm64/kernel/vdso/sigreturn.S +++ b/arch/arm64/kernel/vdso/sigreturn.S @@ -1,7 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * Sigreturn trampoline for returning from a signal when the SA_RESTORER - * flag is not set. + * flag is not set. It serves primarily as a hall of shame for crappy + * unwinders and features an exciting but mysterious NOP instruction. + * + * It's also fragile as hell, so please think twice before changing anything + * in here. * * Copyright (C) 2012 ARM Limited * @@ -9,18 +13,68 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> #include <asm/unistd.h> .text - nop -ENTRY(__kernel_rt_sigreturn) - .cfi_startproc - .cfi_signal_frame - .cfi_def_cfa x29, 0 - .cfi_offset x29, 0 * 8 - .cfi_offset x30, 1 * 8 +/* + * NOTE!!! You may notice that all of the .cfi directives in this file have + * been commented out. This is because they have been shown to trigger segfaults + * in libgcc when unwinding out of a SIGCANCEL handler to invoke pthread + * cleanup handlers during the thread cancellation dance. By omitting the + * directives, we trigger an arm64-specific fallback path in the unwinder which + * recognises the signal frame and restores many of the registers directly from + * the sigcontext. Re-enabling the cfi directives here therefore needs to be + * much more comprehensive to reduce the risk of further regressions. + */ + +/* Ensure that the mysterious NOP can be associated with a function. */ +// .cfi_startproc + +/* + * .cfi_signal_frame causes the corresponding Frame Description Entry (FDE) in + * the .eh_frame section to be annotated as a signal frame. This allows DWARF + * unwinders (e.g. libstdc++) to implement _Unwind_GetIPInfo() and identify + * the next frame using the unmodified return address instead of subtracting 1, + * which may yield the wrong FDE. + */ +// .cfi_signal_frame + +/* + * Tell the unwinder where to locate the frame record linking back to the + * interrupted context. We don't provide unwind info for registers other than + * the frame pointer and the link register here; in practice, this is likely to + * be insufficient for unwinding in C/C++ based runtimes, especially without a + * means to restore the stack pointer. Thankfully, unwinders and debuggers + * already have baked-in strategies for attempting to unwind out of signals. + */ +// .cfi_def_cfa x29, 0 +// .cfi_offset x29, 0 * 8 +// .cfi_offset x30, 1 * 8 + +/* + * This mysterious NOP is required for some unwinders (e.g. libc++) that + * unconditionally subtract one from the result of _Unwind_GetIP() in order to + * identify the calling function. + * Hack borrowed from arch/powerpc/kernel/vdso64/sigtramp.S. + */ + nop // Mysterious NOP + +/* + * GDB, libgcc and libunwind rely on being able to identify the sigreturn + * instruction sequence to unwind from signal handlers. We cannot, therefore, + * use SYM_FUNC_START() here, as it will emit a BTI C instruction and break the + * unwinder. Thankfully, this function is only ever called from a RET and so + * omitting the landing pad is perfectly fine. + */ +SYM_CODE_START(__kernel_rt_sigreturn) +// PLEASE DO NOT MODIFY mov x8, #__NR_rt_sigreturn +// PLEASE DO NOT MODIFY svc #0 - .cfi_endproc -ENDPROC(__kernel_rt_sigreturn) +// PLEASE DO NOT MODIFY +// .cfi_endproc +SYM_CODE_END(__kernel_rt_sigreturn) + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 7ad2d3a0cd48..45354f2ddf70 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -11,13 +11,17 @@ #include <linux/const.h> #include <asm/page.h> #include <asm/vdso.h> +#include <asm-generic/vmlinux.lds.h> OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") OUTPUT_ARCH(aarch64) SECTIONS { - PROVIDE(_vdso_data = . - PAGE_SIZE); + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -28,6 +32,13 @@ SECTIONS .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } + /* + * Discard .note.gnu.property sections which are unused and have + * different alignment requirement from vDSO note sections. + */ + /DISCARD/ : { + *(.note.GNU-stack .note.gnu.property) + } .note : { *(.note.*) } :text :note . = ALIGN(16); @@ -37,20 +48,35 @@ SECTIONS PROVIDE (_etext = .); PROVIDE (etext = .); - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text + . = ALIGN(4); + .altinstructions : { + *(.altinstructions) + } .dynamic : { *(.dynamic) } :text :dynamic - .rodata : { *(.rodata*) } :text + .rela.dyn : ALIGN(8) { *(.rela .rela*) } + + .rodata : { + *(.rodata*) + *(.got) + *(.got.plt) + *(.plt) + *(.plt.*) + *(.iplt) + *(.igot .igot.plt) + } :text _end = .; PROVIDE(end = .); + DWARF_DEBUG + ELF_DETAILS + /DISCARD/ : { - *(.note.GNU-stack) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) + *(.eh_frame .eh_frame_hdr) } } @@ -63,7 +89,6 @@ PHDRS text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr PT_GNU_EH_FRAME; } /* diff --git a/arch/arm64/kernel/vdso/vgettimeofday.c b/arch/arm64/kernel/vdso/vgettimeofday.c index 747635501a14..9941c5b04f15 100644 --- a/arch/arm64/kernel/vdso/vgettimeofday.c +++ b/arch/arm64/kernel/vdso/vgettimeofday.c @@ -5,8 +5,10 @@ * Copyright (C) 2018 ARM Limited * */ -#include <linux/time.h> -#include <linux/types.h> + +int __kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); +int __kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); +int __kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res); int __kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) diff --git a/arch/arm64/kernel/vdso32/vdso.S b/arch/arm64/kernel/vdso32-wrap.S index e72ac7bc4c04..e72ac7bc4c04 100644 --- a/arch/arm64/kernel/vdso32/vdso.S +++ b/arch/arm64/kernel/vdso32-wrap.S diff --git a/arch/arm64/kernel/vdso32/.gitignore b/arch/arm64/kernel/vdso32/.gitignore index 4fea950fa5ed..3542fa24e26b 100644 --- a/arch/arm64/kernel/vdso32/.gitignore +++ b/arch/arm64/kernel/vdso32/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only vdso.lds vdso.so.raw diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 76b327f88fbb..2266fcdff78a 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -3,26 +3,26 @@ # Makefile for vdso32 # -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32 include $(srctree)/lib/vdso/Makefile # Same as cc-*option, but using CC_COMPAT instead of CC ifeq ($(CONFIG_CC_IS_CLANG), y) CC_COMPAT ?= $(CC) +CC_COMPAT += --target=arm-linux-gnueabi else CC_COMPAT ?= $(CROSS_COMPILE_COMPAT)gcc endif +ifeq ($(CONFIG_LD_IS_LLD), y) +LD_COMPAT ?= $(LD) +else +LD_COMPAT ?= $(CROSS_COMPILE_COMPAT)ld +endif + cc32-option = $(call try-run,\ $(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) cc32-disable-warning = $(call try-run,\ $(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) -cc32-ldoption = $(call try-run,\ - $(CC_COMPAT) $(1) -nostdlib -x c /dev/null -o "$$TMP",$(1),$(2)) -cc32-as-instr = $(call try-run,\ - printf "%b\n" "$(1)" | $(CC_COMPAT) $(VDSO_AFLAGS) -c -x assembler -o "$$TMP" -,$(2),$(3)) # We cannot use the global flags to compile the vDSO files, the main reason # being that the 32-bit compiler may be older than the main (64-bit) compiler @@ -32,16 +32,13 @@ cc32-as-instr = $(call try-run,\ # As a result we set our own flags here. # KBUILD_CPPFLAGS and NOSTDINC_FLAGS from top-level Makefile -VDSO_CPPFLAGS := -D__KERNEL__ -nostdinc -isystem $(shell $(CC_COMPAT) -print-file-name=include) +VDSO_CPPFLAGS := -DBUILD_VDSO -D__KERNEL__ -nostdinc +VDSO_CPPFLAGS += -isystem $(shell $(CC_COMPAT) -print-file-name=include 2>/dev/null) VDSO_CPPFLAGS += $(LINUXINCLUDE) # Common C and assembly flags # From top-level Makefile VDSO_CAFLAGS := $(VDSO_CPPFLAGS) -ifneq ($(shell $(CC_COMPAT) --version 2>&1 | head -n 1 | grep clang),) -VDSO_CAFLAGS += --target=$(notdir $(CROSS_COMPILE_COMPAT:%-=%)) -endif - VDSO_CAFLAGS += $(call cc32-option,-fno-PIE) ifdef CONFIG_DEBUG_INFO VDSO_CAFLAGS += -g @@ -59,13 +56,7 @@ endif # From arm vDSO Makefile VDSO_CAFLAGS += -fPIC -fno-builtin -fno-stack-protector VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING - - -# Try to compile for ARMv8. If the compiler is too old and doesn't support it, -# fall back to v7. There is no easy way to check for what architecture the code -# is being compiled, so define a macro specifying that (see arch/arm/Makefile). -VDSO_CAFLAGS += $(call cc32-option,-march=armv8-a -D__LINUX_ARM_ARCH__=8,\ - -march=armv7-a -D__LINUX_ARM_ARCH__=7) +VDSO_CAFLAGS += -march=armv8-a VDSO_CFLAGS := $(VDSO_CAFLAGS) VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 @@ -74,14 +65,13 @@ VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ -Werror-implicit-function-declaration \ -Wno-format-security \ - -std=gnu89 + -std=gnu11 VDSO_CFLAGS += -O2 # Some useful compiler-dependent flags from top-level Makefile -VDSO_CFLAGS += $(call cc32-option,-Wdeclaration-after-statement,) VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign) -VDSO_CFLAGS += $(call cc32-option,-fno-strict-overflow) +VDSO_CFLAGS += -fno-strict-overflow VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) -VDSO_CFLAGS += $(call cc32-option,-Werror=date-time) +VDSO_CFLAGS += -Werror=date-time VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) # The 32-bit compiler does not provide 128-bit integers, which are used in @@ -94,34 +84,32 @@ VDSO_CFLAGS += -D__uint128_t='void*' VDSO_CFLAGS += $(call cc32-disable-warning,shift-count-overflow) VDSO_CFLAGS += -Wno-int-to-pointer-cast +# Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is +# unreliable. +ifeq ($(CONFIG_THUMB2_COMPAT_VDSO), y) +VDSO_CFLAGS += -mthumb -fomit-frame-pointer +else +VDSO_CFLAGS += -marm +endif + VDSO_AFLAGS := $(VDSO_CAFLAGS) VDSO_AFLAGS += -D__ASSEMBLY__ -# Check for binutils support for dmb ishld -dmbinstr := $(call cc32-as-instr,dmb ishld,-DCONFIG_AS_DMB_ISHLD=1) - -VDSO_CFLAGS += $(dmbinstr) -VDSO_AFLAGS += $(dmbinstr) - -VDSO_LDFLAGS := $(VDSO_CPPFLAGS) # From arm vDSO Makefile -VDSO_LDFLAGS += -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1 -VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 -VDSO_LDFLAGS += -nostdlib -shared -mfloat-abi=soft -VDSO_LDFLAGS += -Wl,--hash-style=sysv -VDSO_LDFLAGS += -Wl,--build-id -VDSO_LDFLAGS += $(call cc32-ldoption,-fuse-ld=bfd) +VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1 +VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096 +VDSO_LDFLAGS += -shared --hash-style=sysv --build-id=sha1 +VDSO_LDFLAGS += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) # Borrow vdsomunge.c from the arm vDSO # We have to use a relative path because scripts/Makefile.host prefixes -# $(hostprogs-y) with $(obj) +# $(hostprogs) with $(obj) munge := ../../../arm/vdso/vdsomunge -hostprogs-y := $(munge) +hostprogs := $(munge) c-obj-vdso := note.o c-obj-vdso-gettimeofday := vgettimeofday.o -asm-obj-vdso := sigreturn.o ifneq ($(c-gettimeofday-y),) VDSO_CFLAGS_gettimeofday_o += -include $(c-gettimeofday-y) @@ -130,28 +118,24 @@ endif VDSO_CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os # Build rules -targets := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) vdso.so vdso.so.dbg vdso.so.raw +targets := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) vdso.so vdso32.so.dbg vdso.so.raw c-obj-vdso := $(addprefix $(obj)/, $(c-obj-vdso)) c-obj-vdso-gettimeofday := $(addprefix $(obj)/, $(c-obj-vdso-gettimeofday)) asm-obj-vdso := $(addprefix $(obj)/, $(asm-obj-vdso)) obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) -obj-y += vdso.o -extra-y += vdso.lds +targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) -# Force dependency (vdso.s includes vdso.so through incbin) -$(obj)/vdso.o: $(obj)/vdso.so - -include/generated/vdso32-offsets.h: $(obj)/vdso.so.dbg FORCE +include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE $(call if_changed,vdsosym) # Strip rule for vdso.so $(obj)/vdso.so: OBJCOPYFLAGS := -S -$(obj)/vdso.so: $(obj)/vdso.so.dbg FORCE +$(obj)/vdso.so: $(obj)/vdso32.so.dbg FORCE $(call if_changed,objcopy) -$(obj)/vdso.so.dbg: $(obj)/vdso.so.raw $(obj)/$(munge) FORCE +$(obj)/vdso32.so.dbg: $(obj)/vdso.so.raw $(obj)/$(munge) FORCE $(call if_changed,vdsomunge) # Link rule for the .so file, .lds has to be first @@ -171,8 +155,8 @@ quiet_cmd_vdsold_and_vdso_check = LD32 $@ cmd_vdsold_and_vdso_check = $(cmd_vdsold); $(cmd_vdso_check) quiet_cmd_vdsold = LD32 $@ - cmd_vdsold = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_LDFLAGS) \ - -Wl,-T $(filter %.lds,$^) $(filter %.o,$^) -o $@ + cmd_vdsold = $(LD_COMPAT) $(VDSO_LDFLAGS) \ + -T $(filter %.lds,$^) $(filter %.o,$^) -o $@ quiet_cmd_vdsocc = CC32 $@ cmd_vdsocc = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) -c -o $@ $< quiet_cmd_vdsocc_gettimeofday = CC32 $@ @@ -188,13 +172,3 @@ gen-vdsosym := $(srctree)/$(src)/../vdso/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ # The AArch64 nm should be able to read an AArch32 binary cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ - -# Install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/vdso32.so - -vdso.so: $(obj)/vdso.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso.so diff --git a/arch/arm64/kernel/vdso32/sigreturn.S b/arch/arm64/kernel/vdso32/sigreturn.S deleted file mode 100644 index 1a81277c2d09..000000000000 --- a/arch/arm64/kernel/vdso32/sigreturn.S +++ /dev/null @@ -1,62 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This file provides both A32 and T32 versions, in accordance with the - * arm sigreturn code. - * - * Copyright (C) 2018 ARM Limited - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> - -#define ARM_ENTRY(name) \ - ENTRY(name) - -#define ARM_ENDPROC(name) \ - .type name, %function; \ - END(name) - - .text - - .arm - .fnstart - .save {r0-r15} - .pad #COMPAT_SIGFRAME_REGS_OFFSET - nop -ARM_ENTRY(__kernel_sigreturn_arm) - mov r7, #__NR_compat_sigreturn - svc #0 - .fnend -ARM_ENDPROC(__kernel_sigreturn_arm) - - .fnstart - .save {r0-r15} - .pad #COMPAT_RT_SIGFRAME_REGS_OFFSET - nop -ARM_ENTRY(__kernel_rt_sigreturn_arm) - mov r7, #__NR_compat_rt_sigreturn - svc #0 - .fnend -ARM_ENDPROC(__kernel_rt_sigreturn_arm) - - .thumb - .fnstart - .save {r0-r15} - .pad #COMPAT_SIGFRAME_REGS_OFFSET - nop -ARM_ENTRY(__kernel_sigreturn_thumb) - mov r7, #__NR_compat_sigreturn - svc #0 - .fnend -ARM_ENDPROC(__kernel_sigreturn_thumb) - - .fnstart - .save {r0-r15} - .pad #COMPAT_RT_SIGFRAME_REGS_OFFSET - nop -ARM_ENTRY(__kernel_rt_sigreturn_thumb) - mov r7, #__NR_compat_rt_sigreturn - svc #0 - .fnend -ARM_ENDPROC(__kernel_rt_sigreturn_thumb) diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S index a3944927eaeb..8d95d7d35057 100644 --- a/arch/arm64/kernel/vdso32/vdso.lds.S +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -11,13 +11,17 @@ #include <linux/const.h> #include <asm/page.h> #include <asm/vdso.h> +#include <asm-generic/vmlinux.lds.h> OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") OUTPUT_ARCH(arm) SECTIONS { - PROVIDE_HIDDEN(_vdso_data = . - PAGE_SIZE); + PROVIDE_HIDDEN(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE); +#endif . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -32,12 +36,30 @@ SECTIONS .dynamic : { *(.dynamic) } :text :dynamic - .rodata : { *(.rodata*) } :text + .rodata : { + *(.rodata*) + *(.got) + *(.got.plt) + *(.plt) + *(.rel.iplt) + *(.iplt) + *(.igot.plt) + } :text - .text : { *(.text*) } :text =0xe7f001f2 + .text : { + *(.text*) + *(.glue_7) + *(.glue_7t) + *(.vfp11_veneer) + *(.v4_bx) + } :text =0xe7f001f2 - .got : { *(.got) } - .rel.plt : { *(.rel.plt) } + .rel.dyn : { *(.rel*) } + + .ARM.exidx : { *(.ARM.exidx*) } + DWARF_DEBUG + ELF_DETAILS + .ARM.attributes 0 : { *(.ARM.attributes) } /DISCARD/ : { *(.note.GNU-stack) @@ -64,19 +86,7 @@ VERSION __vdso_clock_gettime; __vdso_gettimeofday; __vdso_clock_getres; - __kernel_sigreturn_arm; - __kernel_sigreturn_thumb; - __kernel_rt_sigreturn_arm; - __kernel_rt_sigreturn_thumb; __vdso_clock_gettime64; local: *; }; } - -/* - * Make the sigreturn code visible to the kernel. - */ -VDSO_compat_sigreturn_arm = __kernel_sigreturn_arm; -VDSO_compat_sigreturn_thumb = __kernel_sigreturn_thumb; -VDSO_compat_rt_sigreturn_arm = __kernel_rt_sigreturn_arm; -VDSO_compat_rt_sigreturn_thumb = __kernel_rt_sigreturn_thumb; diff --git a/arch/arm64/kernel/vdso32/vgettimeofday.c b/arch/arm64/kernel/vdso32/vgettimeofday.c index 54fc1c2ce93f..29b4d8f61e39 100644 --- a/arch/arm64/kernel/vdso32/vgettimeofday.c +++ b/arch/arm64/kernel/vdso32/vgettimeofday.c @@ -5,26 +5,18 @@ * Copyright (C) 2018 ARM Limited * */ -#include <linux/time.h> -#include <linux/types.h> +#define BUILD_VDSO32_64 +#include <vdso/gettime.h> int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts) { - /* The checks below are required for ABI consistency with arm */ - if ((u32)ts >= TASK_SIZE_32) - return -EFAULT; - return __cvdso_clock_gettime32(clock, ts); } int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts) { - /* The checks below are required for ABI consistency with arm */ - if ((u32)ts >= TASK_SIZE_32) - return -EFAULT; - return __cvdso_clock_gettime(clock, ts); } @@ -37,10 +29,6 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, int __vdso_clock_getres(clockid_t clock_id, struct old_timespec32 *res) { - /* The checks below are required for ABI consistency with arm */ - if ((u32)res >= TASK_SIZE_32) - return -EFAULT; - return __cvdso_clock_getres_time32(clock_id, res); } diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 497f9675071d..3cd7e76cc562 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -5,42 +5,84 @@ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> */ -#define RO_EXCEPTION_TABLE_ALIGN 8 +#include <asm/hyp_image.h> +#ifdef CONFIG_KVM +#define HYPERVISOR_EXTABLE \ + . = ALIGN(SZ_8); \ + __start___kvm_ex_table = .; \ + *(__kvm_ex_table) \ + __stop___kvm_ex_table = .; + +#define HYPERVISOR_DATA_SECTIONS \ + HYP_SECTION_NAME(.rodata) : { \ + . = ALIGN(PAGE_SIZE); \ + __hyp_rodata_start = .; \ + *(HYP_SECTION_NAME(.data..ro_after_init)) \ + *(HYP_SECTION_NAME(.rodata)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_rodata_end = .; \ + } + +#define HYPERVISOR_PERCPU_SECTION \ + . = ALIGN(PAGE_SIZE); \ + HYP_SECTION_NAME(.data..percpu) : { \ + *(HYP_SECTION_NAME(.data..percpu)) \ + } + +#define HYPERVISOR_RELOC_SECTION \ + .hyp.reloc : ALIGN(4) { \ + __hyp_reloc_begin = .; \ + *(.hyp.reloc) \ + __hyp_reloc_end = .; \ + } + +#define BSS_FIRST_SECTIONS \ + __hyp_bss_start = .; \ + *(HYP_SECTION_NAME(.bss)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_bss_end = .; + +/* + * We require that __hyp_bss_start and __bss_start are aligned, and enforce it + * with an assertion. But the BSS_SECTION macro places an empty .sbss section + * between them, which can in some cases cause the linker to misalign them. To + * work around the issue, force a page alignment for __bss_start. + */ +#define SBSS_ALIGN PAGE_SIZE +#else /* CONFIG_KVM */ +#define HYPERVISOR_EXTABLE +#define HYPERVISOR_DATA_SECTIONS +#define HYPERVISOR_PERCPU_SECTION +#define HYPERVISOR_RELOC_SECTION +#define SBSS_ALIGN 0 +#endif + +#define RO_EXCEPTION_TABLE_ALIGN 4 +#define RUNTIME_DISCARD_EXIT #include <asm-generic/vmlinux.lds.h> #include <asm/cache.h> #include <asm/kernel-pgtable.h> -#include <asm/thread_info.h> +#include <asm/kexec.h> #include <asm/memory.h> #include <asm/page.h> -#include <asm/pgtable.h> #include "image.h" -/* .exit.text needed in case of alternative patching */ -#define ARM_EXIT_KEEP(x) x -#define ARM_EXIT_DISCARD(x) - OUTPUT_ARCH(aarch64) ENTRY(_text) jiffies = jiffies_64; #define HYPERVISOR_TEXT \ - /* \ - * Align to 4 KB so that \ - * a) the HYP vector table is at its minimum \ - * alignment of 2048 bytes \ - * b) the HYP init code will not cross a page \ - * boundary if its size does not exceed \ - * 4 KB (see related ASSERT() below) \ - */ \ - . = ALIGN(SZ_4K); \ + . = ALIGN(PAGE_SIZE); \ __hyp_idmap_text_start = .; \ *(.hyp.idmap.text) \ __hyp_idmap_text_end = .; \ __hyp_text_start = .; \ *(.hyp.text) \ + HYPERVISOR_EXTABLE \ + . = ALIGN(PAGE_SIZE); \ __hyp_text_end = .; #define IDMAP_TEXT \ @@ -51,7 +93,7 @@ jiffies = jiffies_64; #ifdef CONFIG_HIBERNATION #define HIBERNATE_TEXT \ - . = ALIGN(SZ_4K); \ + ALIGN_FUNCTION(); \ __hibernate_exit_text_start = .; \ *(.hibernate_exit.text) \ __hibernate_exit_text_end = .; @@ -59,21 +101,43 @@ jiffies = jiffies_64; #define HIBERNATE_TEXT #endif +#ifdef CONFIG_KEXEC_CORE +#define KEXEC_TEXT \ + ALIGN_FUNCTION(); \ + __relocate_new_kernel_start = .; \ + *(.kexec_relocate.text) \ + __relocate_new_kernel_end = .; +#else +#define KEXEC_TEXT +#endif + #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 #define TRAMP_TEXT \ . = ALIGN(PAGE_SIZE); \ __entry_tramp_text_start = .; \ *(.entry.tramp.text) \ . = ALIGN(PAGE_SIZE); \ - __entry_tramp_text_end = .; + __entry_tramp_text_end = .; \ + *(.entry.tramp.rodata) #else #define TRAMP_TEXT #endif +#ifdef CONFIG_UNWIND_TABLES +#define UNWIND_DATA_SECTIONS \ + .eh_frame : { \ + __eh_frame_start = .; \ + *(.eh_frame) \ + __eh_frame_end = .; \ + } +#else +#define UNWIND_DATA_SECTIONS +#endif + /* * The size of the PE/COFF section that covers the kernel image, which - * runs from stext to _edata, must be a round multiple of the PE/COFF - * FileAlignment, which we set to its minimum value of 0x200. 'stext' + * runs from _stext to _edata, must be a round multiple of the PE/COFF + * FileAlignment, which we set to its minimum value of 0x200. '_stext' * itself is 4 KB aligned, so padding out _edata to a 0x200 aligned * boundary should be sufficient. */ @@ -94,41 +158,29 @@ SECTIONS * matching the same input section name. There is no documented * order of matching. */ + DISCARDS /DISCARD/ : { - ARM_EXIT_DISCARD(EXIT_TEXT) - ARM_EXIT_DISCARD(EXIT_DATA) - EXIT_CALL - *(.discard) - *(.discard.*) *(.interp .dynamic) *(.dynsym .dynstr .hash .gnu.hash) - *(.eh_frame) } - . = KIMAGE_VADDR + TEXT_OFFSET; + . = KIMAGE_VADDR; .head.text : { _text = .; HEAD_TEXT } - .text : { /* Real text segment */ + .text : ALIGN(SEGMENT_ALIGN) { /* Real text segment */ _stext = .; /* Text and read-only data */ IRQENTRY_TEXT SOFTIRQENTRY_TEXT ENTRY_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT HYPERVISOR_TEXT - IDMAP_TEXT - HIBERNATE_TEXT - TRAMP_TEXT - *(.fixup) *(.gnu.warning) - . = ALIGN(16); - *(.got) /* Global offset table */ } . = ALIGN(SEGMENT_ALIGN); @@ -137,21 +189,39 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) + HYPERVISOR_DATA_SECTIONS + + .got : { *(.got) } + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the lazy dispatch entries. + */ + .got.plt : { *(.got.plt) } + ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, + "Unexpected GOT/PLT entries detected!") + + /* code sections that are never executed via the kernel mapping */ + .rodata.text : { + TRAMP_TEXT + HIBERNATE_TEXT + KEXEC_TEXT + IDMAP_TEXT + . = ALIGN(PAGE_SIZE); + } + idmap_pg_dir = .; - . += IDMAP_DIR_SIZE; + . += PAGE_SIZE; #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 tramp_pg_dir = .; . += PAGE_SIZE; #endif -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - reserved_ttbr0 = .; - . += RESERVED_TTBR0_SIZE; -#endif + reserved_pg_dir = .; + . += PAGE_SIZE; + swapper_pg_dir = .; . += PAGE_SIZE; - swapper_pg_end = .; . = ALIGN(SEGMENT_ALIGN); __init_begin = .; @@ -161,7 +231,7 @@ SECTIONS __exittext_begin = .; .exit.text : { - ARM_EXIT_KEEP(EXIT_TEXT) + EXIT_TEXT } __exittext_end = .; @@ -171,44 +241,46 @@ SECTIONS *(.altinstructions) __alt_instructions_end = .; } - .altinstr_replacement : { - *(.altinstr_replacement) - } - . = ALIGN(PAGE_SIZE); + UNWIND_DATA_SECTIONS + + . = ALIGN(SEGMENT_ALIGN); __inittext_end = .; __initdata_begin = .; + init_idmap_pg_dir = .; + . += INIT_IDMAP_DIR_SIZE; + init_idmap_pg_end = .; + .init.data : { INIT_DATA INIT_SETUP(16) INIT_CALLS CON_INITCALL INIT_RAM_FS - *(.init.rodata.* .init.bss) /* from the EFI stub */ + *(.init.altinstructions .init.bss) /* from the EFI stub */ } .exit.data : { - ARM_EXIT_KEEP(EXIT_DATA) + EXIT_DATA } PERCPU_SECTION(L1_CACHE_BYTES) + HYPERVISOR_PERCPU_SECTION + + HYPERVISOR_RELOC_SECTION .rela.dyn : ALIGN(8) { + __rela_start = .; *(.rela .rela*) + __rela_end = .; } - __rela_offset = ABSOLUTE(ADDR(.rela.dyn) - KIMAGE_VADDR); - __rela_size = SIZEOF(.rela.dyn); - -#ifdef CONFIG_RELR .relr.dyn : ALIGN(8) { + __relr_start = .; *(.relr.dyn) + __relr_end = .; } - __relr_offset = ABSOLUTE(ADDR(.relr.dyn) - KIMAGE_VADDR); - __relr_size = SIZEOF(.relr.dyn); -#endif - . = ALIGN(SEGMENT_ALIGN); __initdata_end = .; __init_end = .; @@ -239,40 +311,78 @@ SECTIONS __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin); _edata = .; - BSS_SECTION(0, 0, 0) + BSS_SECTION(SBSS_ALIGN, 0, 0) . = ALIGN(PAGE_SIZE); init_pg_dir = .; . += INIT_DIR_SIZE; init_pg_end = .; + . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS HEAD_SYMBOLS + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .plt : { + *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + + .data.rel.ro : { *(.data.rel.ro) } + ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") } #include "image-vars.h" /* - * The HYP init code and ID map text can't be longer than a page each, - * and should not cross a page boundary. + * The HYP init code and ID map text can't be longer than a page each. The + * former is page-aligned, but the latter may not be with 16K or 64K pages, so + * it should also not cross a page boundary. */ -ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, - "HYP init code too big or misaligned") +ASSERT(__hyp_idmap_text_end - __hyp_idmap_text_start <= PAGE_SIZE, + "HYP init code too big") ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, "ID map text too big or misaligned") #ifdef CONFIG_HIBERNATION -ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) - <= SZ_4K, "Hibernate exit text too big or misaligned") +ASSERT(__hibernate_exit_text_end - __hibernate_exit_text_start <= SZ_4K, + "Hibernate exit text is bigger than 4 KiB") +ASSERT(__hibernate_exit_text_start == swsusp_arch_suspend_exit, + "Hibernate exit text does not start with swsusp_arch_suspend_exit") #endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) <= 3*PAGE_SIZE, "Entry trampoline text too big") #endif +#ifdef CONFIG_KVM +ASSERT(__hyp_bss_start == __bss_start, "HYP and Host BSS are misaligned") +#endif /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ -ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned") +ASSERT(_text == KIMAGE_VADDR, "HEAD is misaligned") + +ASSERT(swapper_pg_dir - reserved_pg_dir == RESERVED_SWAPPER_OFFSET, + "RESERVED_SWAPPER_OFFSET is wrong!") + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +ASSERT(swapper_pg_dir - tramp_pg_dir == TRAMP_SWAPPER_OFFSET, + "TRAMP_SWAPPER_OFFSET is wrong!") +#endif + +#ifdef CONFIG_KEXEC_CORE +/* kexec relocation code should fit into one KEXEC_CONTROL_PAGE_SIZE */ +ASSERT(__relocate_new_kernel_end - __relocate_new_kernel_start <= SZ_4K, + "kexec relocation code is bigger than 4 KiB") +ASSERT(KEXEC_CONTROL_PAGE_SIZE >= SZ_4K, "KEXEC_CONTROL_PAGE_SIZE is broken") +ASSERT(__relocate_new_kernel_start == arm64_relocate_new_kernel, + "kexec control page does not start with arm64_relocate_new_kernel") +#endif diff --git a/arch/arm64/kernel/watchdog_hld.c b/arch/arm64/kernel/watchdog_hld.c new file mode 100644 index 000000000000..dcd25322127c --- /dev/null +++ b/arch/arm64/kernel/watchdog_hld.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/nmi.h> +#include <linux/cpufreq.h> +#include <linux/perf/arm_pmu.h> + +/* + * Safe maximum CPU frequency in case a particular platform doesn't implement + * cpufreq driver. Although, architecture doesn't put any restrictions on + * maximum frequency but 5 GHz seems to be safe maximum given the available + * Arm CPUs in the market which are clocked much less than 5 GHz. On the other + * hand, we can't make it much higher as it would lead to a large hard-lockup + * detection timeout on parts which are running slower (eg. 1GHz on + * Developerbox) and doesn't possess a cpufreq driver. + */ +#define SAFE_MAX_CPU_FREQ 5000000000UL // 5 GHz +u64 hw_nmi_get_sample_period(int watchdog_thresh) +{ + unsigned int cpu = smp_processor_id(); + unsigned long max_cpu_freq; + + max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL; + if (!max_cpu_freq) + max_cpu_freq = SAFE_MAX_CPU_FREQ; + + return (u64)max_cpu_freq * watchdog_thresh; +} + +bool __init arch_perf_nmi_is_available(void) +{ + /* + * hardlockup_detector_perf_init() will success even if Pseudo-NMI turns off, + * however, the pmu interrupts will act like a normal interrupt instead of + * NMI and the hardlockup detector would be broken. + */ + return arm_pmu_irq_is_nmi(); +} |