diff options
Diffstat (limited to 'arch/arm64/kernel')
95 files changed, 7209 insertions, 4770 deletions
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index d95b3d6b471a..76f32e424065 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -33,8 +33,8 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o idreg-override.o idle.o \ - patching.o + syscall.o proton-pack.o idle.o patching.o pi/ \ + rsi.o jump_label.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o @@ -47,8 +47,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_CPU_PM) += sleep.o suspend.o -obj-$(CONFIG_CPU_IDLE) += cpuidle.o -obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_EFI) += efi.o efi-rt-wrapper.o obj-$(CONFIG_PCI) += pci.o @@ -57,7 +55,7 @@ obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-$(CONFIG_PARAVIRT) += paravirt.o -obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o pi/ +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_ELF_CORE) += elfcore.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ @@ -66,14 +64,12 @@ obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o -obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.o -CFLAGS_patch-scs.o += -mbranch-protection=none # Force dependency (vdso*-wrap.S includes vdso.so through incbin) $(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so @@ -81,10 +77,10 @@ $(obj)/vdso32-wrap.o: $(obj)/vdso32/vdso.so obj-y += probes/ obj-y += head.o -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds ifeq ($(CONFIG_DEBUG_EFI),y) -AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\"" +AFLAGS_head.o += -DVMLINUX_PATH="\"$(abspath vmlinux)\"" endif # for cleaning diff --git a/arch/arm64/kernel/Makefile.syscalls b/arch/arm64/kernel/Makefile.syscalls new file mode 100644 index 000000000000..0542a718871a --- /dev/null +++ b/arch/arm64/kernel/Makefile.syscalls @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 + +syscall_abis_32 += +syscall_abis_64 += renameat rlimit memfd_secret + +syscalltbl = arch/arm64/tools/syscall_%.tbl diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index dba8fcec7f33..af90128cfed5 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -26,9 +26,11 @@ #include <linux/libfdt.h> #include <linux/smp.h> #include <linux/serial_core.h> +#include <linux/suspend.h> #include <linux/pgtable.h> #include <acpi/ghes.h> +#include <acpi/processor.h> #include <asm/cputype.h> #include <asm/cpu_ops.h> #include <asm/daifflags.h> @@ -44,6 +46,7 @@ EXPORT_SYMBOL(acpi_pci_disabled); static bool param_acpi_off __initdata; static bool param_acpi_on __initdata; static bool param_acpi_force __initdata; +static bool param_acpi_nospcr __initdata; static int __init parse_acpi(char *arg) { @@ -57,6 +60,8 @@ static int __init parse_acpi(char *arg) param_acpi_on = true; else if (strcmp(arg, "force") == 0) /* force ACPI to be enabled */ param_acpi_force = true; + else if (strcmp(arg, "nospcr") == 0) /* disable SPCR as default console */ + param_acpi_nospcr = true; else return -EINVAL; /* Core will print when we return error */ @@ -128,7 +133,7 @@ static int __init acpi_fadt_sanity_check(void) /* * FADT is required on arm64; retrieve it to check its presence - * and carry out revision and ACPI HW reduced compliancy tests + * and carry out revision and ACPI HW reduced compliance tests */ status = acpi_get_table(ACPI_SIG_FADT, 0, &table); if (ACPI_FAILURE(status)) { @@ -227,7 +232,27 @@ done: if (earlycon_acpi_spcr_enable) early_init_dt_scan_chosen_stdout(); } else { - acpi_parse_spcr(earlycon_acpi_spcr_enable, true); +#ifdef CONFIG_HIBERNATION + struct acpi_table_header *facs = NULL; + acpi_get_table(ACPI_SIG_FACS, 1, &facs); + if (facs) { + swsusp_hardware_signature = + ((struct acpi_table_facs *)facs)->hardware_signature; + acpi_put_table(facs); + } +#endif + + /* + * For varying privacy and security reasons, sometimes need + * to completely silence the serial console output, and only + * enable it when needed. + * But there are many existing systems that depend on this + * behaviour, use acpi=nospcr to disable console in ACPI SPCR + * table as default serial console. + */ + acpi_parse_spcr(earlycon_acpi_spcr_enable, + !param_acpi_nospcr); + if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); } @@ -352,7 +377,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) prot = __acpi_get_writethrough_mem_attribute(); } } - return ioremap_prot(phys, size, pgprot_val(prot)); + return ioremap_prot(phys, size, prot); } /* @@ -376,7 +401,7 @@ int apei_claim_sea(struct pt_regs *regs) return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags()); if (regs) - return_to_irqs_enabled = interrupts_enabled(regs); + return_to_irqs_enabled = !regs_irqs_disabled(regs); /* * SEA can interrupt SError, mask it and describe this as an NMI so @@ -398,7 +423,7 @@ int apei_claim_sea(struct pt_regs *regs) irq_work_run(); __irq_exit(); } else { - pr_warn_ratelimited("APEI work queued but not completed"); + pr_warn_ratelimited("APEI work queued but not completed\n"); err = -EINPROGRESS; } } @@ -413,107 +438,23 @@ void arch_reserve_mem_area(acpi_physical_address addr, size_t size) memblock_mark_nomap(addr, size); } -#ifdef CONFIG_ACPI_FFH -/* - * Implements ARM64 specific callbacks to support ACPI FFH Operation Region as - * specified in https://developer.arm.com/docs/den0048/latest - */ -struct acpi_ffh_data { - struct acpi_ffh_info info; - void (*invoke_ffh_fn)(unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, - unsigned long a4, unsigned long a5, - unsigned long a6, unsigned long a7, - struct arm_smccc_res *args, - struct arm_smccc_quirk *res); - void (*invoke_ffh64_fn)(const struct arm_smccc_1_2_regs *args, - struct arm_smccc_1_2_regs *res); -}; - -int acpi_ffh_address_space_arch_setup(void *handler_ctxt, void **region_ctxt) +#ifdef CONFIG_ACPI_HOTPLUG_CPU +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 apci_id, + int *pcpu) { - enum arm_smccc_conduit conduit; - struct acpi_ffh_data *ffh_ctxt; - - if (arm_smccc_get_version() < ARM_SMCCC_VERSION_1_2) - return -EOPNOTSUPP; - - conduit = arm_smccc_1_1_get_conduit(); - if (conduit == SMCCC_CONDUIT_NONE) { - pr_err("%s: invalid SMCCC conduit\n", __func__); - return -EOPNOTSUPP; + /* If an error code is passed in this stub can't fix it */ + if (*pcpu < 0) { + pr_warn_once("Unable to map CPU to valid ID\n"); + return *pcpu; } - ffh_ctxt = kzalloc(sizeof(*ffh_ctxt), GFP_KERNEL); - if (!ffh_ctxt) - return -ENOMEM; - - if (conduit == SMCCC_CONDUIT_SMC) { - ffh_ctxt->invoke_ffh_fn = __arm_smccc_smc; - ffh_ctxt->invoke_ffh64_fn = arm_smccc_1_2_smc; - } else { - ffh_ctxt->invoke_ffh_fn = __arm_smccc_hvc; - ffh_ctxt->invoke_ffh64_fn = arm_smccc_1_2_hvc; - } - - memcpy(ffh_ctxt, handler_ctxt, sizeof(ffh_ctxt->info)); - - *region_ctxt = ffh_ctxt; - return AE_OK; -} - -static bool acpi_ffh_smccc_owner_allowed(u32 fid) -{ - int owner = ARM_SMCCC_OWNER_NUM(fid); - - if (owner == ARM_SMCCC_OWNER_STANDARD || - owner == ARM_SMCCC_OWNER_SIP || owner == ARM_SMCCC_OWNER_OEM) - return true; - - return false; + return 0; } +EXPORT_SYMBOL(acpi_map_cpu); -int acpi_ffh_address_space_arch_handler(acpi_integer *value, void *region_context) +int acpi_unmap_cpu(int cpu) { - int ret = 0; - struct acpi_ffh_data *ffh_ctxt = region_context; - - if (ffh_ctxt->info.offset == 0) { - /* SMC/HVC 32bit call */ - struct arm_smccc_res res; - u32 a[8] = { 0 }, *ptr = (u32 *)value; - - if (!ARM_SMCCC_IS_FAST_CALL(*ptr) || ARM_SMCCC_IS_64(*ptr) || - !acpi_ffh_smccc_owner_allowed(*ptr) || - ffh_ctxt->info.length > 32) { - ret = AE_ERROR; - } else { - int idx, len = ffh_ctxt->info.length >> 2; - - for (idx = 0; idx < len; idx++) - a[idx] = *(ptr + idx); - - ffh_ctxt->invoke_ffh_fn(a[0], a[1], a[2], a[3], a[4], - a[5], a[6], a[7], &res, NULL); - memcpy(value, &res, sizeof(res)); - } - - } else if (ffh_ctxt->info.offset == 1) { - /* SMC/HVC 64bit call */ - struct arm_smccc_1_2_regs *r = (struct arm_smccc_1_2_regs *)value; - - if (!ARM_SMCCC_IS_FAST_CALL(r->a0) || !ARM_SMCCC_IS_64(r->a0) || - !acpi_ffh_smccc_owner_allowed(r->a0) || - ffh_ctxt->info.length > sizeof(*r)) { - ret = AE_ERROR; - } else { - ffh_ctxt->invoke_ffh64_fn(r, r); - memcpy(value, r, ffh_ctxt->info.length); - } - } else { - ret = AE_ERROR; - } - - return ret; + return 0; } -#endif /* CONFIG_ACPI_FFH */ +EXPORT_SYMBOL(acpi_unmap_cpu); +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index e51535a5f939..2465f291c7e1 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -27,24 +27,13 @@ #include <asm/numa.h> -static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE }; +static int acpi_early_node_map[NR_CPUS] __initdata = { [0 ... NR_CPUS - 1] = NUMA_NO_NODE }; int __init acpi_numa_get_nid(unsigned int cpu) { return acpi_early_node_map[cpu]; } -static inline int get_cpu_for_acpi_id(u32 uid) -{ - int cpu; - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) - if (uid == get_acpi_id_for_cpu(cpu)) - return cpu; - - return -EINVAL; -} - static int __init acpi_parse_gicc_pxm(union acpi_subtable_headers *header, const unsigned long end) { diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c index b1990e38aed0..e1be29e608b7 100644 --- a/arch/arm64/kernel/acpi_parking_protocol.c +++ b/arch/arm64/kernel/acpi_parking_protocol.c @@ -103,7 +103,7 @@ static int acpi_parking_protocol_cpu_boot(unsigned int cpu) &mailbox->entry_point); writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id); - arch_send_wakeup_ipi_mask(cpumask_of(cpu)); + arch_send_wakeup_ipi(cpu); return 0; } diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 8ff6610af496..f5ec7e7c1d3f 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -139,9 +139,9 @@ static noinstr void clean_dcache_range_nopatch(u64 start, u64 end) } while (cur += d_size, cur < end); } -static void __apply_alternatives(const struct alt_region *region, - bool is_module, - unsigned long *cpucap_mask) +static int __apply_alternatives(const struct alt_region *region, + bool is_module, + unsigned long *cpucap_mask) { struct alt_instr *alt; __le32 *origptr, *updptr; @@ -166,10 +166,13 @@ static void __apply_alternatives(const struct alt_region *region, updptr = is_module ? origptr : lm_alias(origptr); nr_inst = alt->orig_len / AARCH64_INSN_SIZE; - if (ALT_HAS_CB(alt)) + if (ALT_HAS_CB(alt)) { alt_cb = ALT_REPL_PTR(alt); - else + if (is_module && !core_kernel_text((unsigned long)alt_cb)) + return -ENOEXEC; + } else { alt_cb = patch_alternative; + } alt_cb(alt, origptr, updptr, nr_inst); @@ -193,6 +196,8 @@ static void __apply_alternatives(const struct alt_region *region, bitmap_and(applied_alternatives, applied_alternatives, system_cpucaps, ARM64_NCAPS); } + + return 0; } static void __init apply_alternatives_vdso(void) @@ -277,7 +282,7 @@ void __init apply_boot_alternatives(void) } #ifdef CONFIG_MODULES -void apply_alternatives_module(void *start, size_t length) +int apply_alternatives_module(void *start, size_t length) { struct alt_region region = { .begin = start, @@ -287,7 +292,7 @@ void apply_alternatives_module(void *start, size_t length) bitmap_fill(all_capabilities, ARM64_NCAPS); - __apply_alternatives(®ion, true, &all_capabilities[0]); + return __apply_alternatives(®ion, true, &all_capabilities[0]); } #endif diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index e459cfd33711..e737c6295ec7 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -52,10 +52,8 @@ struct insn_emulation { int min; int max; - /* - * sysctl for this emulation + a sentinal entry. - */ - struct ctl_table sysctl[2]; + /* sysctl for this emulation */ + struct ctl_table sysctl; }; #define ARM_OPCODE_CONDTEST_FAIL 0 @@ -464,6 +462,9 @@ static int run_all_insn_set_hw_mode(unsigned int cpu) for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { struct insn_emulation *insn = insn_emulations[i]; bool enable = READ_ONCE(insn->current_mode) == INSN_HW; + if (insn->status == INSN_UNAVAILABLE) + continue; + if (insn->set_hw_mode && insn->set_hw_mode(enable)) { pr_warn("CPU[%u] cannot support the emulation of %s", cpu, insn->name); @@ -506,7 +507,7 @@ static int update_insn_emulation_mode(struct insn_emulation *insn, return ret; } -static int emulation_proc_handler(struct ctl_table *table, int write, +static int emulation_proc_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -558,7 +559,7 @@ static void __init register_insn_emulation(struct insn_emulation *insn) update_insn_emulation_mode(insn, INSN_UNDEF); if (insn->status != INSN_UNAVAILABLE) { - sysctl = &insn->sysctl[0]; + sysctl = &insn->sysctl; sysctl->mode = 0644; sysctl->maxlen = sizeof(int); diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 5ff1942b04fc..b6367ff3a49c 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -6,21 +6,19 @@ * 2001-2002 Keith Owens * Copyright (C) 2012 ARM Ltd. */ +#define COMPILE_OFFSETS #include <linux/arm_sdei.h> #include <linux/sched.h> #include <linux/ftrace.h> #include <linux/kexec.h> #include <linux/mm.h> -#include <linux/dma-mapping.h> #include <linux/kvm_host.h> -#include <linux/preempt.h> #include <linux/suspend.h> #include <asm/cpufeature.h> #include <asm/fixmap.h> #include <asm/thread_info.h> #include <asm/memory.h> -#include <asm/signal32.h> #include <asm/smp_plat.h> #include <asm/suspend.h> #include <linux/kbuild.h> @@ -28,8 +26,6 @@ int main(void) { - DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); - BLANK(); DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); @@ -75,51 +71,31 @@ int main(void) DEFINE(S_FP, offsetof(struct pt_regs, regs[29])); DEFINE(S_LR, offsetof(struct pt_regs, regs[30])); DEFINE(S_SP, offsetof(struct pt_regs, sp)); - DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_PC, offsetof(struct pt_regs, pc)); + DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); - DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); + DEFINE(S_PMR, offsetof(struct pt_regs, pmr)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); + DEFINE(S_STACKFRAME_TYPE, offsetof(struct pt_regs, stackframe.type)); DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS - DEFINE(FREGS_X0, offsetof(struct ftrace_regs, regs[0])); - DEFINE(FREGS_X2, offsetof(struct ftrace_regs, regs[2])); - DEFINE(FREGS_X4, offsetof(struct ftrace_regs, regs[4])); - DEFINE(FREGS_X6, offsetof(struct ftrace_regs, regs[6])); - DEFINE(FREGS_X8, offsetof(struct ftrace_regs, regs[8])); - DEFINE(FREGS_FP, offsetof(struct ftrace_regs, fp)); - DEFINE(FREGS_LR, offsetof(struct ftrace_regs, lr)); - DEFINE(FREGS_SP, offsetof(struct ftrace_regs, sp)); - DEFINE(FREGS_PC, offsetof(struct ftrace_regs, pc)); + DEFINE(FREGS_X0, offsetof(struct __arch_ftrace_regs, regs[0])); + DEFINE(FREGS_X2, offsetof(struct __arch_ftrace_regs, regs[2])); + DEFINE(FREGS_X4, offsetof(struct __arch_ftrace_regs, regs[4])); + DEFINE(FREGS_X6, offsetof(struct __arch_ftrace_regs, regs[6])); + DEFINE(FREGS_X8, offsetof(struct __arch_ftrace_regs, regs[8])); + DEFINE(FREGS_FP, offsetof(struct __arch_ftrace_regs, fp)); + DEFINE(FREGS_LR, offsetof(struct __arch_ftrace_regs, lr)); + DEFINE(FREGS_SP, offsetof(struct __arch_ftrace_regs, sp)); + DEFINE(FREGS_PC, offsetof(struct __arch_ftrace_regs, pc)); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - DEFINE(FREGS_DIRECT_TRAMP, offsetof(struct ftrace_regs, direct_tramp)); + DEFINE(FREGS_DIRECT_TRAMP, offsetof(struct __arch_ftrace_regs, direct_tramp)); #endif - DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); + DEFINE(FREGS_SIZE, sizeof(struct __arch_ftrace_regs)); BLANK(); #endif -#ifdef CONFIG_COMPAT - DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); - DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0)); - BLANK(); -#endif - DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); - BLANK(); - DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); - DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags)); - BLANK(); - DEFINE(VM_EXEC, VM_EXEC); - BLANK(); - DEFINE(PAGE_SZ, PAGE_SIZE); - BLANK(); - DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); - DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); - BLANK(); - DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); - DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT); - DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending)); - BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val)); @@ -130,6 +106,7 @@ int main(void) DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2)); DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_cpu_context, regs)); + DEFINE(CPU_ELR_EL2, offsetof(struct kvm_cpu_context, sys_regs[ELR_EL2])); DEFINE(CPU_RGSR_EL1, offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1])); DEFINE(CPU_GCR_EL1, offsetof(struct kvm_cpu_context, sys_regs[GCR_EL1])); DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1])); @@ -147,6 +124,7 @@ int main(void) DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2)); DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr)); DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr)); + DEFINE(NVHE_INIT_TMP, offsetof(struct kvm_nvhe_init_params, tmp)); #endif #ifdef CONFIG_CPU_PM DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); @@ -202,20 +180,10 @@ int main(void) DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func)); #endif BLANK(); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - DEFINE(FGRET_REGS_X0, offsetof(struct fgraph_ret_regs, regs[0])); - DEFINE(FGRET_REGS_X1, offsetof(struct fgraph_ret_regs, regs[1])); - DEFINE(FGRET_REGS_X2, offsetof(struct fgraph_ret_regs, regs[2])); - DEFINE(FGRET_REGS_X3, offsetof(struct fgraph_ret_regs, regs[3])); - DEFINE(FGRET_REGS_X4, offsetof(struct fgraph_ret_regs, regs[4])); - DEFINE(FGRET_REGS_X5, offsetof(struct fgraph_ret_regs, regs[5])); - DEFINE(FGRET_REGS_X6, offsetof(struct fgraph_ret_regs, regs[6])); - DEFINE(FGRET_REGS_X7, offsetof(struct fgraph_ret_regs, regs[7])); - DEFINE(FGRET_REGS_FP, offsetof(struct fgraph_ret_regs, fp)); - DEFINE(FGRET_REGS_SIZE, sizeof(struct fgraph_ret_regs)); -#endif #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); #endif + DEFINE(PIE_E0_ASM, PIE_E0); + DEFINE(PIE_E1_ASM, PIE_E1); return 0; } diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index d9c9218fa1fd..309942b06c5b 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -101,16 +101,18 @@ int populate_cache_leaves(unsigned int cpu) unsigned int level, idx; enum cache_type type; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct cacheinfo *infos = this_cpu_ci->info_list; for (idx = 0, level = 1; level <= this_cpu_ci->num_levels && - idx < this_cpu_ci->num_leaves; idx++, level++) { + idx < this_cpu_ci->num_leaves; level++) { type = get_cache_type(level); if (type == CACHE_TYPE_SEPARATE) { - ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); - ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + if (idx + 1 >= this_cpu_ci->num_leaves) + break; + ci_leaf_init(&infos[idx++], CACHE_TYPE_DATA, level); + ci_leaf_init(&infos[idx++], CACHE_TYPE_INST, level); } else { - ci_leaf_init(this_leaf++, type, level); + ci_leaf_init(&infos[idx++], type, level); } } return 0; diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c index deff21bfa680..b68e1d328d4c 100644 --- a/arch/arm64/kernel/compat_alignment.c +++ b/arch/arm64/kernel/compat_alignment.c @@ -368,6 +368,8 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) return 1; } + if (!handler) + return 1; type = handler(addr, instr, regs); if (type == TYPE_ERROR || type == TYPE_FAULT) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index be66e94a21bd..8cb3b575a031 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -14,31 +14,85 @@ #include <asm/kvm_asm.h> #include <asm/smp_plat.h> +static u64 target_impl_cpu_num; +static struct target_impl_cpu *target_impl_cpus; + +bool cpu_errata_set_target_impl(u64 num, void *impl_cpus) +{ + if (target_impl_cpu_num || !num || !impl_cpus) + return false; + + target_impl_cpu_num = num; + target_impl_cpus = impl_cpus; + return true; +} + +static inline bool is_midr_in_range(struct midr_range const *range) +{ + int i; + + if (!target_impl_cpu_num) + return midr_is_cpu_model_range(read_cpuid_id(), range->model, + range->rv_min, range->rv_max); + + for (i = 0; i < target_impl_cpu_num; i++) { + if (midr_is_cpu_model_range(target_impl_cpus[i].midr, + range->model, + range->rv_min, range->rv_max)) + return true; + } + return false; +} + +bool is_midr_in_range_list(struct midr_range const *ranges) +{ + while (ranges->model) + if (is_midr_in_range(ranges++)) + return true; + return false; +} +EXPORT_SYMBOL_GPL(is_midr_in_range_list); + static bool __maybe_unused -is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) +__is_affected_midr_range(const struct arm64_cpu_capabilities *entry, + u32 midr, u32 revidr) { const struct arm64_midr_revidr *fix; - u32 midr = read_cpuid_id(), revidr; - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - if (!is_midr_in_range(midr, &entry->midr_range)) + if (!is_midr_in_range(&entry->midr_range)) return false; midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK; - revidr = read_cpuid(REVIDR_EL1); for (fix = entry->fixed_revs; fix && fix->revidr_mask; fix++) if (midr == fix->midr_rv && (revidr & fix->revidr_mask)) return false; - return true; } static bool __maybe_unused +is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) +{ + int i; + + if (!target_impl_cpu_num) { + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return __is_affected_midr_range(entry, read_cpuid_id(), + read_cpuid(REVIDR_EL1)); + } + + for (i = 0; i < target_impl_cpu_num; i++) { + if (__is_affected_midr_range(entry, target_impl_cpus[i].midr, + target_impl_cpus[i].midr)) + return true; + } + return false; +} + +static bool __maybe_unused is_affected_midr_range_list(const struct arm64_cpu_capabilities *entry, int scope) { WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list); + return is_midr_in_range_list(entry->midr_range_list); } static bool __maybe_unused @@ -121,22 +175,6 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCI, 0); } -static DEFINE_RAW_SPINLOCK(reg_user_mask_modification); -static void __maybe_unused -cpu_clear_bf16_from_user_emulation(const struct arm64_cpu_capabilities *__unused) -{ - struct arm64_ftr_reg *regp; - - regp = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1); - if (!regp) - return; - - raw_spin_lock(®_user_mask_modification); - if (regp->user_mask & ID_AA64ISAR1_EL1_BF16_MASK) - regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK; - raw_spin_unlock(®_user_mask_modification); -} - #define CAP_MIDR_RANGE(model, v_min, r_min, v_max, r_max) \ .matches = is_affected_midr_range, \ .midr_range = MIDR_RANGE(model, v_min, r_min, v_max, r_max) @@ -202,12 +240,48 @@ static bool __maybe_unused has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, int scope) { - u32 midr = read_cpuid_id(); bool has_dic = read_cpuid_cachetype() & BIT(CTR_EL0_DIC_SHIFT); const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1); WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range(midr, &range) && has_dic; + return is_midr_in_range(&range) && has_dic; +} + +static const struct midr_range impdef_pmuv3_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, +}; + +static bool has_impdef_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + unsigned int pmuver; + + if (!is_kernel_in_hyp_mode()) + return false; + + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return false; + + return is_midr_in_range_list(impdef_pmuv3_cpus); +} + +static void cpu_enable_impdef_pmuv3_traps(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set_s(SYS_HACR_EL2, 0, BIT(56)); } #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI @@ -261,7 +335,7 @@ static const struct midr_range cavium_erratum_23154_cpus[] = { #endif #ifdef CONFIG_CAVIUM_ERRATUM_27456 -const struct midr_range cavium_erratum_27456_cpus[] = { +static const struct midr_range cavium_erratum_27456_cpus[] = { /* Cavium ThunderX, T88 pass 1.x - 2.1 */ MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), /* Cavium ThunderX, T81 pass 1.0 */ @@ -390,6 +464,7 @@ static const struct midr_range erratum_1463225[] = { static const struct midr_range trbe_overwrite_fill_mode_cpus[] = { #ifdef CONFIG_ARM64_ERRATUM_2139208 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100), #endif #ifdef CONFIG_ARM64_ERRATUM_2119858 MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), @@ -403,6 +478,7 @@ static const struct midr_range trbe_overwrite_fill_mode_cpus[] = { static const struct midr_range tsb_flush_fail_cpus[] = { #ifdef CONFIG_ARM64_ERRATUM_2067961 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100), #endif #ifdef CONFIG_ARM64_ERRATUM_2054223 MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), @@ -415,6 +491,7 @@ static const struct midr_range tsb_flush_fail_cpus[] = { static struct midr_range trbe_write_out_of_range_cpus[] = { #ifdef CONFIG_ARM64_ERRATUM_2253138 MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100), #endif #ifdef CONFIG_ARM64_ERRATUM_2224489 MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), @@ -432,6 +509,63 @@ static struct midr_range broken_aarch32_aes[] = { }; #endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */ +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD +static const struct midr_range erratum_spec_unpriv_load_list[] = { +#ifdef CONFIG_ARM64_ERRATUM_3117295 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2966298 + /* Cortex-A520 r0p0 to r0p1 */ + MIDR_REV_RANGE(MIDR_CORTEX_A520, 0, 0, 1), +#endif + {}, +}; +#endif + +#ifdef CONFIG_ARM64_ERRATUM_3194386 +static const struct midr_range erratum_spec_ssbs_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X4), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X925), + MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3AE), + {} +}; +#endif + +#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 +static const struct midr_range erratum_ac03_cpu_38_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + +#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23 +static const struct midr_range erratum_ac04_cpu_23_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -727,16 +861,52 @@ const struct arm64_cpu_capabilities arm64_errata[] = { /* Cortex-A510 r0p0 - r1p1 */ ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1), MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)), - .cpu_enable = cpu_clear_bf16_from_user_emulation, + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_3194386 + { + .desc = "SSBS not fully self-synchronizing", + .capability = ARM64_WORKAROUND_SPECULATIVE_SSBS, + ERRATA_MIDR_RANGE_LIST(erratum_spec_ssbs_list), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD + { + .desc = "ARM errata 2966298, 3117295", + .capability = ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD, + /* Cortex-A520 r0p0 - r0p1 */ + ERRATA_MIDR_RANGE_LIST(erratum_spec_unpriv_load_list), }, #endif #ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 { .desc = "AmpereOne erratum AC03_CPU_38", .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38, - ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1), + ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), + }, +#endif +#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23 + { + .desc = "AmpereOne erratum AC04_CPU_23", + .capability = ARM64_WORKAROUND_AMPERE_AC04_CPU_23, + ERRATA_MIDR_RANGE_LIST(erratum_ac04_cpu_23_list), }, #endif { + .desc = "Broken CNTVOFF_EL2", + .capability = ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF, + ERRATA_MIDR_RANGE_LIST(((const struct midr_range[]) { + MIDR_ALL_VERSIONS(MIDR_QCOM_ORYON_X1), + {} + })), + }, + { + .desc = "Apple IMPDEF PMUv3 Traps", + .capability = ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_impdef_pmuv3, + .cpu_enable = cpu_enable_impdef_pmuv3_traps, + }, + { } }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a5f533f63b60..c840a93b9ef9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -75,6 +75,7 @@ #include <linux/cpu.h> #include <linux/kasan.h> #include <linux/percpu.h> +#include <linux/sched/isolation.h> #include <asm/cpu.h> #include <asm/cpufeature.h> @@ -83,8 +84,10 @@ #include <asm/hwcap.h> #include <asm/insn.h> #include <asm/kvm_host.h> +#include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/mte.h> +#include <asm/hypervisor.h> #include <asm/processor.h> #include <asm/smp.h> #include <asm/sysreg.h> @@ -92,6 +95,7 @@ #include <asm/vectors.h> #include <asm/virt.h> +#include <asm/spectre.h> /* Kernel representation of AT_HWCAP and AT_HWCAP2 */ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; @@ -103,6 +107,7 @@ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; COMPAT_HWCAP_LPAE) unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; unsigned int compat_elf_hwcap2 __read_mostly; +unsigned int compat_elf_hwcap3 __read_mostly; #endif DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); @@ -111,7 +116,14 @@ static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NC DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS); -bool arm64_use_ng_mappings = false; +/* + * arm64_use_ng_mappings must be placed in the .data section, otherwise it + * ends up in the .bss section where it is initialized in early_map_kernel() + * after the MMU (with the idmap) was enabled. create_init_idmap() - which + * runs before early_map_kernel() and reads the variable via PTE_MAYBE_NG - + * may end up generating an incorrect idmap page table attributes. + */ +bool arm64_use_ng_mappings __read_mostly = false; EXPORT_SYMBOL(arm64_use_ng_mappings); DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; @@ -140,12 +152,42 @@ void dump_cpu_features(void) pr_emerg("0x%*pb\n", ARM64_NCAPS, &system_cpucaps); } +#define __ARM64_MAX_POSITIVE(reg, field) \ + ((reg##_##field##_SIGNED ? \ + BIT(reg##_##field##_WIDTH - 1) : \ + BIT(reg##_##field##_WIDTH)) - 1) + +#define __ARM64_MIN_NEGATIVE(reg, field) BIT(reg##_##field##_WIDTH - 1) + +#define __ARM64_CPUID_FIELDS(reg, field, min_value, max_value) \ + .sys_reg = SYS_##reg, \ + .field_pos = reg##_##field##_SHIFT, \ + .field_width = reg##_##field##_WIDTH, \ + .sign = reg##_##field##_SIGNED, \ + .min_field_value = min_value, \ + .max_field_value = max_value, + +/* + * ARM64_CPUID_FIELDS() encodes a field with a range from min_value to + * an implicit maximum that depends on the sign-ess of the field. + * + * An unsigned field will be capped at all ones, while a signed field + * will be limited to the positive half only. + */ #define ARM64_CPUID_FIELDS(reg, field, min_value) \ - .sys_reg = SYS_##reg, \ - .field_pos = reg##_##field##_SHIFT, \ - .field_width = reg##_##field##_WIDTH, \ - .sign = reg##_##field##_SIGNED, \ - .min_field_value = reg##_##field##_##min_value, + __ARM64_CPUID_FIELDS(reg, field, \ + SYS_FIELD_VALUE(reg, field, min_value), \ + __ARM64_MAX_POSITIVE(reg, field)) + +/* + * ARM64_CPUID_FIELDS_NEG() encodes a field with a range from an + * implicit minimal value to max_value. This should be used when + * matching a non-implemented property. + */ +#define ARM64_CPUID_FIELDS_NEG(reg, field, max_value) \ + __ARM64_CPUID_FIELDS(reg, field, \ + __ARM64_MIN_NEGATIVE(reg, field), \ + SYS_FIELD_VALUE(reg, field, max_value)) #define __ARM64_FTR_BITS(SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ @@ -198,6 +240,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), @@ -220,9 +263,11 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_LUT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CLRBHB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_MOPS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0), @@ -233,6 +278,13 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0), @@ -248,12 +300,16 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_FP_SHIFT, 4, ID_AA64PFR0_EL1_FP_NI), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL1_SHIFT, 4, ID_AA64PFR0_EL1_ELx_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL0_SHIFT, 4, ID_AA64PFR0_EL1_ELx_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL1_SHIFT, 4, ID_AA64PFR0_EL1_EL1_IMP), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL0_SHIFT, 4, ID_AA64PFR0_EL1_EL0_IMP), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_DF2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), @@ -266,22 +322,35 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTEFAR_SHIFT, 4, ID_AA64PFR2_EL1_MTEFAR_NI), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTESTOREONLY_SHIFT, 4, ID_AA64PFR2_EL1_MTESTOREONLY_NI), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F16MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_B16B16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_EltPerm_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, 0), @@ -292,6 +361,8 @@ static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_LUTv2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMEver_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, 0), @@ -304,6 +375,10 @@ static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F16_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F16_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, 0), @@ -313,6 +388,34 @@ static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_BI32I32_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8FMA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SBitPerm_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_AES_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SFEXPA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_STMOP_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMOP4_SHIFT, 1, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64fpfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8CVT_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8FMA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM8_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E4M3_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E5M2_SHIFT, 1, 0), ARM64_FTR_END, }; @@ -363,6 +466,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ECBHB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TIDCP1_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HCX_SHIFT, 4, 0), @@ -399,11 +503,20 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE), + FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_SCTLRX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0), ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = { + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_NV_frac_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_ctr[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1), @@ -610,15 +723,11 @@ static const struct arm64_ftr_bits ftr_id_dfr1[] = { ARM64_FTR_END, }; -static const struct arm64_ftr_bits ftr_zcr[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, - ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_WIDTH, 0), /* LEN */ - ARM64_FTR_END, -}; - -static const struct arm64_ftr_bits ftr_smcr[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, - SMCR_ELx_LEN_SHIFT, SMCR_ELx_LEN_WIDTH, 0), /* LEN */ +static const struct arm64_ftr_bits ftr_mpamidr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PMG_MAX_SHIFT, MPAMIDR_EL1_PMG_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_VPMR_MAX_SHIFT, MPAMIDR_EL1_VPMR_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_HAS_HCR_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PARTID_MAX_SHIFT, MPAMIDR_EL1_PARTID_MAX_WIDTH, 0), ARM64_FTR_END, }; @@ -664,15 +773,17 @@ static const struct arm64_ftr_bits ftr_raz[] = { #define ARM64_FTR_REG(id, table) \ __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) -struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; -struct arm64_ftr_override __ro_after_init id_aa64pfr0_override; -struct arm64_ftr_override __ro_after_init id_aa64pfr1_override; -struct arm64_ftr_override __ro_after_init id_aa64zfr0_override; -struct arm64_ftr_override __ro_after_init id_aa64smfr0_override; -struct arm64_ftr_override __ro_after_init id_aa64isar1_override; -struct arm64_ftr_override __ro_after_init id_aa64isar2_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr0_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr1_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr2_override; +struct arm64_ftr_override __read_mostly id_aa64pfr0_override; +struct arm64_ftr_override __read_mostly id_aa64pfr1_override; +struct arm64_ftr_override __read_mostly id_aa64zfr0_override; +struct arm64_ftr_override __read_mostly id_aa64smfr0_override; +struct arm64_ftr_override __read_mostly id_aa64isar1_override; +struct arm64_ftr_override __read_mostly id_aa64isar2_override; -struct arm64_ftr_override arm64_sw_feature_override; +struct arm64_ftr_override __read_mostly arm64_sw_feature_override; static const struct __ftr_reg_entry { u32 sys_id; @@ -711,10 +822,12 @@ static const struct __ftr_reg_entry { &id_aa64pfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, &id_aa64pfr1_override), + ARM64_FTR_REG(SYS_ID_AA64PFR2_EL1, ftr_id_aa64pfr2), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0, &id_aa64zfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0, &id_aa64smfr0_override), + ARM64_FTR_REG(SYS_ID_AA64FPFR0_EL1, ftr_id_aa64fpfr0), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), @@ -726,17 +839,20 @@ static const struct __ftr_reg_entry { &id_aa64isar1_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2, &id_aa64isar2_override), + ARM64_FTR_REG(SYS_ID_AA64ISAR3_EL1, ftr_id_aa64isar3), /* Op1 = 0, CRn = 0, CRm = 7 */ - ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0, + &id_aa64mmfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1, &id_aa64mmfr1_override), - ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2, + &id_aa64mmfr2_override), ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), + ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4), - /* Op1 = 0, CRn = 1, CRm = 2 */ - ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr), - ARM64_FTR_REG(SYS_SMCR_EL1, ftr_smcr), + /* Op1 = 0, CRn = 10, CRm = 4 */ + ARM64_FTR_REG(SYS_MPAMIDR_EL1, ftr_mpamidr), /* Op1 = 1, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), @@ -887,7 +1003,7 @@ static void __init sort_ftr_regs(void) /* * Initialise the CPU feature register from Boot CPU values. - * Also initiliases the strict_mask for the register. + * Also initialises the strict_mask for the register. * Any bits that are not covered by an arm64_ftr_bits entry are considered * RES0 for the system-wide value, and must strictly match. */ @@ -923,16 +1039,16 @@ static void init_cpu_ftr_reg(u32 sys_reg, u64 new) /* Override was valid */ ftr_new = tmp; str = "forced"; - } else if (ftr_ovr == tmp) { + } else { /* Override was the safe value */ str = "already set"; } - if (str) - pr_warn("%s[%d:%d]: %s to %llx\n", - reg->name, - ftrp->shift + ftrp->width - 1, - ftrp->shift, str, tmp); + pr_warn("%s[%d:%d]: %s to %llx\n", + reg->name, + ftrp->shift + ftrp->width - 1, + ftrp->shift, str, + tmp & (BIT(ftrp->width) - 1)); } else if ((ftr_mask & reg->override->val) == ftr_mask) { reg->override->val &= ~ftr_mask; pr_warn("%s[%d:%d]: impossible override, ignored\n", @@ -1012,6 +1128,37 @@ static void init_32bit_cpu_features(struct cpuinfo_32bit *info) init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); } +#ifdef CONFIG_ARM64_PSEUDO_NMI +static bool enable_pseudo_nmi; + +static int __init early_enable_pseudo_nmi(char *p) +{ + return kstrtobool(p, &enable_pseudo_nmi); +} +early_param("irqchip.gicv3_pseudo_nmi", early_enable_pseudo_nmi); + +static __init void detect_system_supports_pseudo_nmi(void) +{ + struct device_node *np; + + if (!enable_pseudo_nmi) + return; + + /* + * Detect broken MediaTek firmware that doesn't properly save and + * restore GIC priorities. + */ + np = of_find_compatible_node(NULL, NULL, "arm,gic-v3"); + if (np && of_property_read_bool(np, "mediatek,broken-save-restore-fw")) { + pr_info("Pseudo-NMI disabled due to MediaTek Chromebook GICR save problem\n"); + enable_pseudo_nmi = false; + } + of_node_put(np); +} +#else /* CONFIG_ARM64_PSEUDO_NMI */ +static inline void detect_system_supports_pseudo_nmi(void) { } +#endif + void __init init_cpu_features(struct cpuinfo_arm64 *info) { /* Before we start using the tables, make sure it is sorted */ @@ -1025,52 +1172,47 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0); init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2); + init_cpu_ftr_reg(SYS_ID_AA64ISAR3_EL1, info->reg_id_aa64isar3); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64MMFR3_EL1, info->reg_id_aa64mmfr3); + init_cpu_ftr_reg(SYS_ID_AA64MMFR4_EL1, info->reg_id_aa64mmfr4); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); + init_cpu_ftr_reg(SYS_ID_AA64PFR2_EL1, info->reg_id_aa64pfr2); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0); + init_cpu_ftr_reg(SYS_ID_AA64FPFR0_EL1, info->reg_id_aa64fpfr0); if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) init_32bit_cpu_features(&info->aarch32); if (IS_ENABLED(CONFIG_ARM64_SVE) && id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { - info->reg_zcr = read_zcr_features(); - init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr); + unsigned long cpacr = cpacr_save_enable_kernel_sve(); + vec_init_vq_map(ARM64_VEC_SVE); + + cpacr_restore(cpacr); } if (IS_ENABLED(CONFIG_ARM64_SME) && id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { - info->reg_smcr = read_smcr_features(); - /* - * We mask out SMPS since even if the hardware - * supports priorities the kernel does not at present - * and we block access to them. - */ - info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; - init_cpu_ftr_reg(SYS_SMCR_EL1, info->reg_smcr); + unsigned long cpacr = cpacr_save_enable_kernel_sme(); + vec_init_vq_map(ARM64_VEC_SME); + + cpacr_restore(cpacr); + } + + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); } if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); - - /* - * Initialize the indirect array of CPU capabilities pointers before we - * handle the boot CPU below. - */ - init_cpucap_indirect_list(); - - /* - * Detect and enable early CPU capabilities based on the boot CPU, - * after we have initialised the CPU feature infrastructure. - */ - setup_boot_cpu_capabilities(); } static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new) @@ -1262,6 +1404,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64isar1, boot->reg_id_aa64isar1); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu, info->reg_id_aa64isar2, boot->reg_id_aa64isar2); + taint |= check_update_ftr_reg(SYS_ID_AA64ISAR3_EL1, cpu, + info->reg_id_aa64isar3, boot->reg_id_aa64isar3); /* * Differing PARange support is fine as long as all peripherals and @@ -1276,11 +1420,15 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR3_EL1, cpu, info->reg_id_aa64mmfr3, boot->reg_id_aa64mmfr3); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR4_EL1, cpu, + info->reg_id_aa64mmfr4, boot->reg_id_aa64mmfr4); taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu, info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1); + taint |= check_update_ftr_reg(SYS_ID_AA64PFR2_EL1, cpu, + info->reg_id_aa64pfr2, boot->reg_id_aa64pfr2); taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); @@ -1288,32 +1436,36 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu, info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0); + taint |= check_update_ftr_reg(SYS_ID_AA64FPFR0_EL1, cpu, + info->reg_id_aa64fpfr0, boot->reg_id_aa64fpfr0); + + /* Probe vector lengths */ if (IS_ENABLED(CONFIG_ARM64_SVE) && id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { - info->reg_zcr = read_zcr_features(); - taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu, - info->reg_zcr, boot->reg_zcr); + if (!system_capabilities_finalized()) { + unsigned long cpacr = cpacr_save_enable_kernel_sve(); - /* Probe vector lengths */ - if (!system_capabilities_finalized()) vec_update_vq_map(ARM64_VEC_SVE); + + cpacr_restore(cpacr); + } } if (IS_ENABLED(CONFIG_ARM64_SME) && id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { - info->reg_smcr = read_smcr_features(); - /* - * We mask out SMPS since even if the hardware - * supports priorities the kernel does not at present - * and we block access to them. - */ - info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; - taint |= check_update_ftr_reg(SYS_SMCR_EL1, cpu, - info->reg_smcr, boot->reg_smcr); + unsigned long cpacr = cpacr_save_enable_kernel_sme(); /* Probe vector lengths */ if (!system_capabilities_finalized()) vec_update_vq_map(ARM64_VEC_SME); + + cpacr_restore(cpacr); + } + + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, + info->reg_mpamidr, boot->reg_mpamidr); } /* @@ -1398,17 +1550,21 @@ u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64PFR0_EL1); read_sysreg_case(SYS_ID_AA64PFR1_EL1); + read_sysreg_case(SYS_ID_AA64PFR2_EL1); read_sysreg_case(SYS_ID_AA64ZFR0_EL1); read_sysreg_case(SYS_ID_AA64SMFR0_EL1); + read_sysreg_case(SYS_ID_AA64FPFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR0_EL1); read_sysreg_case(SYS_ID_AA64MMFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR2_EL1); read_sysreg_case(SYS_ID_AA64MMFR3_EL1); + read_sysreg_case(SYS_ID_AA64MMFR4_EL1); read_sysreg_case(SYS_ID_AA64ISAR0_EL1); read_sysreg_case(SYS_ID_AA64ISAR1_EL1); read_sysreg_case(SYS_ID_AA64ISAR2_EL1); + read_sysreg_case(SYS_ID_AA64ISAR3_EL1); read_sysreg_case(SYS_CNTFRQ_EL0); read_sysreg_case(SYS_CTR_EL0); @@ -1439,11 +1595,28 @@ has_always(const struct arm64_cpu_capabilities *entry, int scope) static bool feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) { - int val = cpuid_feature_extract_field_width(reg, entry->field_pos, - entry->field_width, - entry->sign); + int val, min, max; + u64 tmp; - return val >= entry->min_field_value; + val = cpuid_feature_extract_field_width(reg, entry->field_pos, + entry->field_width, + entry->sign); + + tmp = entry->min_field_value; + tmp <<= entry->field_pos; + + min = cpuid_feature_extract_field_width(tmp, entry->field_pos, + entry->field_width, + entry->sign); + + tmp = entry->max_field_value; + tmp <<= entry->field_pos; + + max = cpuid_feature_extract_field_width(tmp, entry->field_pos, + entry->field_width, + entry->sign); + + return val >= min && val <= max; } static u64 @@ -1494,6 +1667,11 @@ const struct cpumask *system_32bit_el0_cpumask(void) return cpu_possible_mask; } +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p) +{ + return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_TICK)); +} + static int __init parse_32bit_el0_param(char *str) { allow_mismatched_32bit_el0 = true; @@ -1553,24 +1731,6 @@ static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, return has_sre; } -static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry, int __unused) -{ - u32 midr = read_cpuid_id(); - - /* Cavium ThunderX pass 1.x and 2.x */ - return midr_is_cpu_model_range(midr, MIDR_THUNDERX, - MIDR_CPU_VAR_REV(0, 0), - MIDR_CPU_VAR_REV(1, MIDR_REVISION_MASK)); -} - -static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused) -{ - u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - - return cpuid_feature_extract_signed_field(pfr0, - ID_AA64PFR0_EL1_FP_SHIFT) < 0; -} - static bool has_cache_idc(const struct arm64_cpu_capabilities *entry, int scope) { @@ -1620,52 +1780,12 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) if (is_kdump_kernel()) return false; - if (cpus_have_const_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP)) + if (cpus_have_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP)) return false; return has_cpuid_feature(entry, scope); } -/* - * This check is triggered during the early boot before the cpufeature - * is initialised. Checking the status on the local CPU allows the boot - * CPU to detect the need for non-global mappings and thus avoiding a - * pagetable re-write after all the CPUs are booted. This check will be - * anyway run on individual CPUs, allowing us to get the consistent - * state once the SMP CPUs are up and thus make the switch to non-global - * mappings if required. - */ -bool kaslr_requires_kpti(void) -{ - if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - return false; - - /* - * E0PD does a similar job to KPTI so can be used instead - * where available. - */ - if (IS_ENABLED(CONFIG_ARM64_E0PD)) { - u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1); - if (cpuid_feature_extract_unsigned_field(mmfr2, - ID_AA64MMFR2_EL1_E0PD_SHIFT)) - return false; - } - - /* - * Systems affected by Cavium erratum 24756 are incompatible - * with KPTI. - */ - if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) { - extern const struct midr_range cavium_erratum_27456_cpus[]; - - if (is_midr_in_range_list(read_cpuid_id(), - cavium_erratum_27456_cpus)) - return false; - } - - return kaslr_enabled(); -} - static bool __meltdown_safe = true; static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ @@ -1694,7 +1814,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, char const *str = "kpti command line option"; bool meltdown_safe; - meltdown_safe = is_midr_in_range_list(read_cpuid_id(), kpti_safe_list); + meltdown_safe = is_midr_in_range_list(kpti_safe_list); /* Defer to CPU feature registers */ if (has_cpuid_feature(entry, scope)) @@ -1718,7 +1838,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, } /* Useful for KASLR robustness */ - if (kaslr_requires_kpti()) { + if (kaslr_enabled() && kaslr_requires_kpti()) { if (!__kpti_forced) { str = "KASLR"; __kpti_forced = 1; @@ -1745,93 +1865,92 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, return !meltdown_safe; } -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -#define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) - -extern -void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, - phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags); +static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope) +{ + /* + * Although the Apple M2 family appears to support NV1, the + * PTW barfs on the nVHE EL2 S1 page table format. Pretend + * that it doesn't support NV1 at all. + */ + static const struct midr_range nv1_ni_list[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {} + }; -static phys_addr_t kpti_ng_temp_alloc; + return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) && + !(has_cpuid_feature(entry, scope) || + is_midr_in_range_list(nv1_ni_list))); +} -static phys_addr_t kpti_ng_pgd_alloc(int shift) +#if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) +static bool has_lpa2_at_stage1(u64 mmfr0) { - kpti_ng_temp_alloc -= PAGE_SIZE; - return kpti_ng_temp_alloc; + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_LPA2; } -static void -kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) +static bool has_lpa2_at_stage2(u64 mmfr0) { - typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long); - extern kpti_remap_fn idmap_kpti_install_ng_mappings; - kpti_remap_fn *remap_fn; + unsigned int tgran; - int cpu = smp_processor_id(); - int levels = CONFIG_PGTABLE_LEVELS; - int order = order_base_2(levels); - u64 kpti_ng_temp_pgd_pa = 0; - pgd_t *kpti_ng_temp_pgd; - u64 alloc = 0; + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_2_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2; +} - if (__this_cpu_read(this_cpu_vector) == vectors) { - const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI); +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 mmfr0; - __this_cpu_write(this_cpu_vector, v); - } + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + return has_lpa2_at_stage1(mmfr0) && has_lpa2_at_stage2(mmfr0); +} +#else +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + return false; +} +#endif + +#ifdef CONFIG_HW_PERF_EVENTS +static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + unsigned int pmuver; /* - * We don't need to rewrite the page-tables if either we've done - * it already or we have KASLR enabled and therefore have not - * created any global mappings at all. + * PMUVer follows the standard ID scheme for an unsigned field with the + * exception of 0xF (IMP_DEF) which is treated specially and implies + * FEAT_PMUv3 is not implemented. + * + * See DDI0487L.a D24.1.3.2 for more details. */ - if (arm64_use_ng_mappings) - return; + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return false; - remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); - - if (!cpu) { - alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); - kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE); - kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd); - - // - // Create a minimal page table hierarchy that permits us to map - // the swapper page tables temporarily as we traverse them. - // - // The physical pages are laid out as follows: - // - // +--------+-/-------+-/------ +-\\--------+ - // : PTE[] : | PMD[] : | PUD[] : || PGD[] : - // +--------+-\-------+-\------ +-//--------+ - // ^ - // The first page is mapped into this hierarchy at a PMD_SHIFT - // aligned virtual address, so that we can manipulate the PTE - // level entries while the mapping is active. The first entry - // covers the PTE[] page itself, the remaining entries are free - // to be used as a ad-hoc fixmap. - // - create_kpti_ng_temp_pgd(kpti_ng_temp_pgd, __pa(alloc), - KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL, - kpti_ng_pgd_alloc, 0); - } + return pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP; +} +#endif - cpu_install_idmap(); - remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA); - cpu_uninstall_idmap(); +static void cpu_enable_kpti(struct arm64_cpu_capabilities const *cap) +{ + if (__this_cpu_read(this_cpu_vector) == vectors) { + const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI); - if (!cpu) { - free_pages(alloc, order); - arm64_use_ng_mappings = true; + __this_cpu_write(this_cpu_vector, v); } + } -#else -static void -kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) -{ -} -#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ static int __init parse_kpti(char *str) { @@ -1847,9 +1966,11 @@ static int __init parse_kpti(char *str) early_param("kpti", parse_kpti); #ifdef CONFIG_ARM64_HW_AFDBM +static struct cpumask dbm_cpus __read_mostly; + static inline void __cpu_enable_hw_dbm(void) { - u64 tcr = read_sysreg(tcr_el1) | TCR_HD; + u64 tcr = read_sysreg(tcr_el1) | TCR_EL1_HD; write_sysreg(tcr, tcr_el1); isb(); @@ -1871,7 +1992,7 @@ static bool cpu_has_broken_dbm(void) {}, }; - return is_midr_in_range_list(read_cpuid_id(), cpus); + return is_midr_in_range_list(cpus); } static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap) @@ -1882,35 +2003,22 @@ static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap) static void cpu_enable_hw_dbm(struct arm64_cpu_capabilities const *cap) { - if (cpu_can_use_dbm(cap)) + if (cpu_can_use_dbm(cap)) { __cpu_enable_hw_dbm(); + cpumask_set_cpu(smp_processor_id(), &dbm_cpus); + } } static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap, int __unused) { - static bool detected = false; /* * DBM is a non-conflicting feature. i.e, the kernel can safely * run a mix of CPUs with and without the feature. So, we * unconditionally enable the capability to allow any late CPU * to use the feature. We only enable the control bits on the - * CPU, if it actually supports. - * - * We have to make sure we print the "feature" detection only - * when at least one CPU actually uses it. So check if this CPU - * can actually use it and print the message exactly once. - * - * This is safe as all CPUs (including secondary CPUs - due to the - * LOCAL_CPU scope - and the hotplugged CPUs - via verification) - * goes through the "matches" check exactly once. Also if a CPU - * matches the criteria, it is guaranteed that the CPU will turn - * the DBM on, as the capability is unconditionally enabled. + * CPU, if it is supported. */ - if (!detected && cpu_can_use_dbm(cap)) { - detected = true; - pr_info("detected: Hardware dirty bit management\n"); - } return true; } @@ -1943,8 +2051,6 @@ int get_cpu_with_amu_feat(void) static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap) { if (has_cpuid_feature(cap, SCOPE_LOCAL_CPU)) { - pr_info("detected CPU%d: Activity Monitors Unit (AMU)\n", - smp_processor_id()); cpumask_set_cpu(smp_processor_id(), &amu_cpus); /* 0 reference values signal broken/disabled counters */ @@ -2003,7 +2109,7 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, if (kvm_get_mode() != KVM_MODE_NV) return false; - if (!has_cpuid_feature(cap, scope)) { + if (!cpucap_multi_entry_cap_matches(cap, scope)) { pr_warn("unavailable: %s\n", cap->desc); return false; } @@ -2014,14 +2120,48 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, int __unused) { - u64 val; + return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE); +} + +bool cpu_supports_bbml2_noabort(void) +{ + /* + * We want to allow usage of BBML2 in as wide a range of kernel contexts + * as possible. This list is therefore an allow-list of known-good + * implementations that both support BBML2 and additionally, fulfill the + * extra constraint of never generating TLB conflict aborts when using + * the relaxed BBML2 semantics (such aborts make use of BBML2 in certain + * kernel contexts difficult to prove safe against recursive aborts). + * + * Note that implementations can only be considered "known-good" if their + * implementors attest to the fact that the implementation never raises + * TLB conflict aborts for BBML2 mapping granularity changes. + */ + static const struct midr_range supports_bbml2_noabort_list[] = { + MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf), + MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf), + MIDR_REV_RANGE(MIDR_NEOVERSE_V3AE, 0, 2, 0xf), + MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {} + }; - val = read_sysreg(id_aa64mmfr1_el1); - if (!cpuid_feature_extract_unsigned_field(val, ID_AA64MMFR1_EL1_VH_SHIFT)) + /* Does our cpu guarantee to never raise TLB conflict aborts? */ + if (!is_midr_in_range_list(supports_bbml2_noabort_list)) return false; - val = arm64_sw_feature_override.val & arm64_sw_feature_override.mask; - return cpuid_feature_extract_unsigned_field(val, ARM64_SW_FEATURE_OVERRIDE_HVHE); + /* + * We currently ignore the ID_AA64MMFR2_EL1 register, and only care + * about whether the MIDR check passes. + */ + + return true; +} + +static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) +{ + return cpu_supports_bbml2_noabort(); } #ifdef CONFIG_ARM64_PAN @@ -2044,6 +2184,24 @@ static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused) /* Firmware may have left a deferred SError in this register. */ write_sysreg_s(0, SYS_DISR_EL1); } +static bool has_rasv1p1(const struct arm64_cpu_capabilities *__unused, int scope) +{ + const struct arm64_cpu_capabilities rasv1p1_caps[] = { + { + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, V1P1) + }, + { + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP) + }, + { + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, RAS_frac, RASv1p1) + }, + }; + + return (has_cpuid_feature(&rasv1p1_caps[0], scope) || + (has_cpuid_feature(&rasv1p1_caps[1], scope) && + has_cpuid_feature(&rasv1p1_caps[2], scope))); +} #endif /* CONFIG_ARM64_RAS_EXTN */ #ifdef CONFIG_ARM64_PTR_AUTH @@ -2098,28 +2256,20 @@ static bool has_generic_auth(const struct arm64_cpu_capabilities *entry, static void cpu_enable_e0pd(struct arm64_cpu_capabilities const *cap) { if (this_cpu_has_cap(ARM64_HAS_E0PD)) - sysreg_clear_set(tcr_el1, 0, TCR_E0PD1); + sysreg_clear_set(tcr_el1, 0, TCR_EL1_E0PD1); } #endif /* CONFIG_ARM64_E0PD */ #ifdef CONFIG_ARM64_PSEUDO_NMI -static bool enable_pseudo_nmi; - -static int __init early_enable_pseudo_nmi(char *p) -{ - return kstrtobool(p, &enable_pseudo_nmi); -} -early_param("irqchip.gicv3_pseudo_nmi", early_enable_pseudo_nmi); - static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, int scope) { /* - * ARM64_HAS_GIC_CPUIF_SYSREGS has a lower index, and is a boot CPU + * ARM64_HAS_GICV3_CPUIF has a lower index, and is a boot CPU * feature, so will be detected earlier. */ - BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GIC_CPUIF_SYSREGS); - if (!cpus_have_cap(ARM64_HAS_GIC_CPUIF_SYSREGS)) + BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GICV3_CPUIF); + if (!cpus_have_cap(ARM64_HAS_GICV3_CPUIF)) return false; return enable_pseudo_nmi; @@ -2154,6 +2304,49 @@ static bool has_gic_prio_relaxed_sync(const struct arm64_cpu_capabilities *entry } #endif +static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry, + int scope) +{ + static const struct midr_range has_vgic_v3[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, + }; + struct arm_smccc_res res = {}; + + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV3_CPUIF); + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV5_LEGACY); + if (!this_cpu_has_cap(ARM64_HAS_GICV3_CPUIF) && + !is_midr_in_range_list(has_vgic_v3)) + return false; + + if (!is_hyp_mode_available()) + return false; + + if (this_cpu_has_cap(ARM64_HAS_GICV5_LEGACY)) + return true; + + if (is_kernel_in_hyp_mode()) + res.a1 = read_sysreg_s(SYS_ICH_VTR_EL2); + else + arm_smccc_1_1_hvc(HVC_GET_ICH_VTR_EL2, &res); + + if (res.a0 == HVC_STUB_ERR) + return false; + + return res.a1 & ICH_VTR_EL2_TDS; +} + #ifdef CONFIG_ARM64_BTI static void bti_enable(const struct arm64_cpu_capabilities *__unused) { @@ -2172,29 +2365,52 @@ static void bti_enable(const struct arm64_cpu_capabilities *__unused) #ifdef CONFIG_ARM64_MTE static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) { + static bool cleared_zero_page = false; + sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0); mte_cpu_setup(); /* * Clear the tags in the zero page. This needs to be done via the - * linear map which has the Tagged attribute. + * linear map which has the Tagged attribute. Since this page is + * always mapped as pte_special(), set_pte_at() will not attempt to + * clear the tags or set PG_mte_tagged. */ - if (try_page_mte_tagging(ZERO_PAGE(0))) { + if (!cleared_zero_page) { + cleared_zero_page = true; mte_clear_page_tags(lm_alias(empty_zero_page)); - set_page_mte_tagged(ZERO_PAGE(0)); } kasan_init_hw_tags_cpu(); } #endif /* CONFIG_ARM64_MTE */ +static void user_feature_fixup(void) +{ + if (cpus_have_cap(ARM64_WORKAROUND_2658417)) { + struct arm64_ftr_reg *regp; + + regp = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1); + if (regp) + regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK; + } + + if (cpus_have_cap(ARM64_WORKAROUND_SPECULATIVE_SSBS)) { + struct arm64_ftr_reg *regp; + + regp = get_arm64_ftr_reg(SYS_ID_AA64PFR1_EL1); + if (regp) + regp->user_mask &= ~ID_AA64PFR1_EL1_SSBS_MASK; + } +} + static void elf_hwcap_fixup(void) { -#ifdef CONFIG_ARM64_ERRATUM_1742098 - if (cpus_have_const_cap(ARM64_WORKAROUND_1742098)) +#ifdef CONFIG_COMPAT + if (cpus_have_cap(ARM64_WORKAROUND_1742098)) compat_elf_hwcap2 &= ~COMPAT_HWCAP2_AES; -#endif /* ARM64_ERRATUM_1742098 */ +#endif /* CONFIG_COMPAT */ } #ifdef CONFIG_KVM @@ -2219,6 +2435,22 @@ static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn); } +#ifdef CONFIG_ARM64_POE +static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1_E0POE); + sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_E0POE); +} +#endif + +#ifdef CONFIG_ARM64_GCS +static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused) +{ + /* GCSPR_EL0 is always readable */ + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2238,6 +2470,45 @@ cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap) return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT); } +static bool +test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return false; + + /* Check firmware actually enabled MPAM on this cpu. */ + return (read_sysreg_s(SYS_MPAM1_EL1) & MPAM1_EL1_MPAMEN); +} + +static void +cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) +{ + /* + * Access by the kernel (at EL1) should use the reserved PARTID + * which is configured unrestricted. This avoids priority-inversion + * where latency sensitive tasks have to wait for a task that has + * been throttled to release the lock. + */ + write_sysreg_s(0, SYS_MPAM1_EL1); +} + +static bool +test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + + return idr & MPAMIDR_EL1_HAS_HCR; +} + +static bool +test_has_gicv5_legacy(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!this_cpu_has_cap(ARM64_HAS_GICV5_CPUIF)) + return false; + + return !!(read_sysreg_s(SYS_ICC_IDR0_EL1) & ICC_IDR0_EL1_GCIE_LEGACY); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .capability = ARM64_ALWAYS_BOOT, @@ -2250,8 +2521,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_always, }, { - .desc = "GIC system register CPU interface", - .capability = ARM64_HAS_GIC_CPUIF_SYSREGS, + .desc = "GICv3 CPU interface", + .capability = ARM64_HAS_GICV3_CPUIF, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_useable_gicv3_cpuif, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, GIC, IMP) @@ -2299,12 +2570,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { }, #endif /* CONFIG_ARM64_LSE_ATOMICS */ { - .desc = "Software prefetching using PRFM", - .capability = ARM64_HAS_NO_HW_PREFETCH, - .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, - .matches = has_no_hw_prefetch, - }, - { .desc = "Virtualization Host Extensions", .capability = ARM64_HAS_VIRT_HOST_EXTN, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, @@ -2316,7 +2581,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_NESTED_VIRT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_nested_virt_support, - ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, IMP) + .match_list = (const struct arm64_cpu_capabilities []){ + { + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) + }, + { + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) + }, + { /* Sentinel */ } + }, }, { .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, @@ -2350,7 +2625,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .desc = "Kernel page table isolation (KPTI)", .capability = ARM64_UNMAP_KERNEL_AT_EL0, .type = ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE, - .cpu_enable = kpti_install_ng_mappings, + .cpu_enable = cpu_enable_kpti, .matches = unmap_kernel_at_el0, /* * The ID feature fields below are used to indicate that @@ -2360,11 +2635,11 @@ static const struct arm64_cpu_capabilities arm64_features[] = { ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, CSV3, IMP) }, { - /* FP/SIMD is not implemented */ - .capability = ARM64_HAS_NO_FPSIMD, - .type = ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE, - .min_field_value = 0, - .matches = has_no_fpsimd, + .capability = ARM64_HAS_FPSIMD, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_fpsimd, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, FP, IMP) }, #ifdef CONFIG_ARM64_PMEM { @@ -2387,7 +2662,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .desc = "Scalable Vector Extension", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_SVE, - .cpu_enable = sve_kernel_enable, + .cpu_enable = cpu_enable_sve, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, SVE, IMP) }, @@ -2401,19 +2676,21 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_clear_disr, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP) }, + { + .desc = "RASv1p1 Extension Support", + .capability = ARM64_HAS_RASV1P1_EXTN, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_rasv1p1, + }, #endif /* CONFIG_ARM64_RAS_EXTN */ #ifdef CONFIG_ARM64_AMU_EXTN { - /* - * The feature is enabled by default if CONFIG_ARM64_AMU_EXTN=y. - * Therefore, don't provide .desc as we don't want the detection - * message to be shown until at least one CPU is detected to - * support the feature. - */ + .desc = "Activity Monitors Unit (AMU)", .capability = ARM64_HAS_AMU_EXTN, .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, .matches = has_amu, .cpu_enable = cpu_amu_enable, + .cpus = &amu_cpus, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, AMU, IMP) }, #endif /* CONFIG_ARM64_AMU_EXTN */ @@ -2453,21 +2730,30 @@ static const struct arm64_cpu_capabilities arm64_features[] = { }, #ifdef CONFIG_ARM64_HW_AFDBM { - /* - * Since we turn this on always, we don't want the user to - * think that the feature is available when it may not be. - * So hide the description. - * - * .desc = "Hardware pagetable Dirty Bit Management", - * - */ + .desc = "Hardware dirty bit management", .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, .capability = ARM64_HW_DBM, .matches = has_hw_dbm, .cpu_enable = cpu_enable_hw_dbm, + .cpus = &dbm_cpus, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM) }, #endif +#ifdef CONFIG_ARM64_HAFT + { + .desc = "Hardware managed Access Flag for Table Descriptors", + /* + * Contrary to the page/block access flag, the table access flag + * cannot be emulated in software (no access fault will occur). + * Therefore this should be used only if it's supported system + * wide. + */ + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAFT, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT) + }, +#endif { .desc = "CRC32 instructions", .capability = ARM64_HAS_CRC32, @@ -2572,6 +2858,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_gic_prio_relaxed_sync, }, #endif + { + /* + * Depends on having GICv3 + */ + .desc = "ICV_DIR_EL1 trapping", + .capability = ARM64_HAS_ICH_HCR_EL2_TDIR, + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, + .matches = can_trap_icv_dir_el1, + }, #ifdef CONFIG_ARM64_E0PD { .desc = "E0PD", @@ -2619,6 +2914,20 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE3) }, + { + .desc = "FAR on MTE Tag Check Fault", + .capability = ARM64_MTE_FAR, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, MTEFAR, IMP) + }, + { + .desc = "Store Only MTE Tag Check", + .capability = ARM64_MTE_STORE_ONLY, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, MTESTOREONLY, IMP) + }, #endif /* CONFIG_ARM64_MTE */ { .desc = "RCpc load-acquire (LDAPR)", @@ -2627,13 +2936,27 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LRCPC, IMP) }, + { + .desc = "Fine Grained Traps", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FGT, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, IMP) + }, + { + .desc = "Fine Grained Traps 2", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FGT2, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, FGT2) + }, #ifdef CONFIG_ARM64_SME { .desc = "Scalable Matrix Extension", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_SME, .matches = has_cpuid_feature, - .cpu_enable = sme_kernel_enable, + .cpu_enable = cpu_enable_sme, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, IMP) }, /* FA64 should be sorted after the base SME capability */ @@ -2642,7 +2965,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_SME_FA64, .matches = has_cpuid_feature, - .cpu_enable = fa64_kernel_enable, + .cpu_enable = cpu_enable_fa64, ARM64_CPUID_FIELDS(ID_AA64SMFR0_EL1, FA64, IMP) }, { @@ -2650,7 +2973,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_SME2, .matches = has_cpuid_feature, - .cpu_enable = sme2_kernel_enable, + .cpu_enable = cpu_enable_sme2, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, SME2) }, #endif /* CONFIG_ARM64_SME */ @@ -2711,6 +3034,120 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP) }, + { + .desc = "BBM Level 2 without TLB conflict abort", + .capability = ARM64_HAS_BBML2_NOABORT, + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, + .matches = has_bbml2_noabort, + }, + { + .desc = "52-bit Virtual Addressing for KVM (LPA2)", + .capability = ARM64_HAS_LPA2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_lpa2, + }, + { + .desc = "FPMR", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FPMR, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_fpmr, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, FPMR, IMP) + }, +#ifdef CONFIG_ARM64_VA_BITS_52 + { + .capability = ARM64_HAS_VA52, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, +#ifdef CONFIG_ARM64_64K_PAGES + .desc = "52-bit Virtual Addressing (LVA)", + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, VARange, 52) +#else + .desc = "52-bit Virtual Addressing (LPA2)", +#ifdef CONFIG_ARM64_4K_PAGES + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN4, 52_BIT) +#else + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN16, 52_BIT) +#endif +#endif + }, +#endif + { + .desc = "Memory Partitioning And Monitoring", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM, + .matches = test_has_mpam, + .cpu_enable = cpu_enable_mpam, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1) + }, + { + .desc = "Memory Partitioning And Monitoring Virtualisation", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM_HCR, + .matches = test_has_mpam_hcr, + }, + { + .desc = "NV1", + .capability = ARM64_HAS_HCR_NV1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_nv1, + ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1) + }, +#ifdef CONFIG_ARM64_POE + { + .desc = "Stage-1 Permission Overlay Extension (S1POE)", + .capability = ARM64_HAS_S1POE, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_poe, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) + }, +#endif +#ifdef CONFIG_ARM64_GCS + { + .desc = "Guarded Control Stack (GCS)", + .capability = ARM64_HAS_GCS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_gcs, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) + }, +#endif +#ifdef CONFIG_HW_PERF_EVENTS + { + .desc = "PMUv3", + .capability = ARM64_HAS_PMUV3, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_pmuv3, + }, +#endif + { + .desc = "SCTLR2", + .capability = ARM64_HAS_SCTLR2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, SCTLRX, IMP) + }, + { + .desc = "GICv5 CPU interface", + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .capability = ARM64_HAS_GICV5_CPUIF, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, GCIE, IMP) + }, + { + .desc = "GICv5 Legacy vCPU interface", + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, + .capability = ARM64_HAS_GICV5_LEGACY, + .matches = test_has_gicv5_legacy, + }, + { + .desc = "XNX", + .capability = ARM64_HAS_XNX, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, XNX, IMP) + }, {}, }; @@ -2743,6 +3180,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = match, \ } +#define HWCAP_CAP_MATCH_ID(match, reg, field, min_value, cap_type, cap) \ + { \ + __HWCAP_CAP(#cap, cap_type, cap) \ + HWCAP_CPUID_MATCH(reg, field, min_value) \ + .matches = match, \ + } + #ifdef CONFIG_ARM64_PTR_AUTH static const struct arm64_cpu_capabilities ptr_auth_hwcap_addr_matches[] = { { @@ -2771,6 +3215,20 @@ static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = { }; #endif +#ifdef CONFIG_ARM64_SVE +static bool has_sve_feature(const struct arm64_cpu_capabilities *cap, int scope) +{ + return system_supports_sve() && has_user_cpuid_feature(cap, scope); +} +#endif + +#ifdef CONFIG_ARM64_SME +static bool has_sme_feature(const struct arm64_cpu_capabilities *cap, int scope) +{ + return system_supports_sme() && has_user_cpuid_feature(cap, scope); +} +#endif + static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL), HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES), @@ -2779,6 +3237,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, SHA2, SHA512, CAP_HWCAP, KERNEL_HWCAP_SHA512), HWCAP_CAP(ID_AA64ISAR0_EL1, CRC32, IMP, CAP_HWCAP, KERNEL_HWCAP_CRC32), HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, IMP, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), + HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, FEAT_LSE128, CAP_HWCAP, KERNEL_HWCAP_LSE128), HWCAP_CAP(ID_AA64ISAR0_EL1, RDM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), HWCAP_CAP(ID_AA64ISAR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SHA3), HWCAP_CAP(ID_AA64ISAR0_EL1, SM3, IMP, CAP_HWCAP, KERNEL_HWCAP_SM3), @@ -2788,38 +3247,53 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM), HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(ID_AA64ISAR3_EL1, FPRCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_FPRCVT), HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, FP16, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), HWCAP_CAP(ID_AA64PFR0_EL1, DIT, IMP, CAP_HWCAP, KERNEL_HWCAP_DIT), + HWCAP_CAP(ID_AA64PFR2_EL1, FPMR, IMP, CAP_HWCAP, KERNEL_HWCAP_FPMR), HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, IMP, CAP_HWCAP, KERNEL_HWCAP_DCPOP), HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, DPB2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), HWCAP_CAP(ID_AA64ISAR1_EL1, JSCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_JSCVT), HWCAP_CAP(ID_AA64ISAR1_EL1, FCMA, IMP, CAP_HWCAP, KERNEL_HWCAP_FCMA), HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, IMP, CAP_HWCAP, KERNEL_HWCAP_LRCPC), HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), + HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC3, CAP_HWCAP, KERNEL_HWCAP_LRCPC3), HWCAP_CAP(ID_AA64ISAR1_EL1, FRINTTS, IMP, CAP_HWCAP, KERNEL_HWCAP_FRINT), HWCAP_CAP(ID_AA64ISAR1_EL1, SB, IMP, CAP_HWCAP, KERNEL_HWCAP_SB), HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_BF16), HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_EBF16), HWCAP_CAP(ID_AA64ISAR1_EL1, DGH, IMP, CAP_HWCAP, KERNEL_HWCAP_DGH), HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM), + HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT), + HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX), + HWCAP_CAP(ID_AA64ISAR3_EL1, LSFE, IMP, CAP_HWCAP, KERNEL_HWCAP_LSFE), HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), - HWCAP_CAP(ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1), - HWCAP_CAP(ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), - HWCAP_CAP(ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), - HWCAP_CAP(ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), - HWCAP_CAP(ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), - HWCAP_CAP(ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), - HWCAP_CAP(ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16), - HWCAP_CAP(ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), - HWCAP_CAP(ID_AA64ZFR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), - HWCAP_CAP(ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), - HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), - HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p2, CAP_HWCAP, KERNEL_HWCAP_SVE2P2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, AES2, CAP_HWCAP, KERNEL_HWCAP_SVE_AES2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_B16B16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, BFSCALE, CAP_HWCAP, KERNEL_HWCAP_SVE_BFSCALE), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F16MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_F16MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, EltPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_ELTPERM), +#endif +#ifdef CONFIG_ARM64_GCS + HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS), #endif HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_BTI @@ -2832,10 +3306,13 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { #ifdef CONFIG_ARM64_MTE HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE2, CAP_HWCAP, KERNEL_HWCAP_MTE), HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE3, CAP_HWCAP, KERNEL_HWCAP_MTE3), + HWCAP_CAP(ID_AA64PFR2_EL1, MTEFAR, IMP, CAP_HWCAP, KERNEL_HWCAP_MTE_FAR), + HWCAP_CAP(ID_AA64PFR2_EL1, MTESTOREONLY, IMP, CAP_HWCAP , KERNEL_HWCAP_MTE_STORE_ONLY), #endif /* CONFIG_ARM64_MTE */ HWCAP_CAP(ID_AA64MMFR0_EL1, ECV, IMP, CAP_HWCAP, KERNEL_HWCAP_ECV), HWCAP_CAP(ID_AA64MMFR1_EL1, AFP, IMP, CAP_HWCAP, KERNEL_HWCAP_AFP), HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, IMP, CAP_HWCAP, KERNEL_HWCAP_CSSC), + HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, CMPBR, CAP_HWCAP, KERNEL_HWCAP_CMPBR), HWCAP_CAP(ID_AA64ISAR2_EL1, RPRFM, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRFM), HWCAP_CAP(ID_AA64ISAR2_EL1, RPRES, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRES), HWCAP_CAP(ID_AA64ISAR2_EL1, WFxT, IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT), @@ -2843,20 +3320,43 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR2_EL1, BC, IMP, CAP_HWCAP, KERNEL_HWCAP_HBC), #ifdef CONFIG_ARM64_SME HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME), - HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), - HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), - HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), - HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), - HWCAP_CAP(ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), - HWCAP_CAP(ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), - HWCAP_CAP(ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), - HWCAP_CAP(ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), - HWCAP_CAP(ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), - HWCAP_CAP(ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), - HWCAP_CAP(ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), - HWCAP_CAP(ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), - HWCAP_CAP(ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4), #endif /* CONFIG_ARM64_SME */ + HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM8), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), +#ifdef CONFIG_ARM64_POE + HWCAP_CAP(ID_AA64MMFR3_EL1, S1POE, IMP, CAP_HWCAP, KERNEL_HWCAP_POE), +#endif {}, }; @@ -2967,18 +3467,49 @@ static void update_cpu_capabilities(u16 scope_mask) scope_mask &= ARM64_CPUCAP_SCOPE_MASK; for (i = 0; i < ARM64_NCAPS; i++) { + bool match_all = false; + bool caps_set = false; + bool boot_cpu = false; + caps = cpucap_ptrs[i]; - if (!caps || !(caps->type & scope_mask) || - cpus_have_cap(caps->capability) || - !caps->matches(caps, cpucap_default_scope(caps))) + if (!caps || !(caps->type & scope_mask)) continue; - if (caps->desc) + match_all = cpucap_match_all_early_cpus(caps); + caps_set = cpus_have_cap(caps->capability); + boot_cpu = scope_mask & SCOPE_BOOT_CPU; + + /* + * Unless it's a match-all CPUs feature, avoid probing if + * already detected. + */ + if (!match_all && caps_set) + continue; + + /* + * A match-all CPUs capability is only set when probing the + * boot CPU. It may be cleared subsequently if not detected on + * secondary ones. + */ + if (match_all && !caps_set && !boot_cpu) + continue; + + if (!caps->matches(caps, cpucap_default_scope(caps))) { + if (match_all) + __clear_bit(caps->capability, system_cpucaps); + continue; + } + + /* + * Match-all CPUs capabilities are logged later when the + * system capabilities are finalised. + */ + if (!match_all && caps->desc && !caps->cpus) pr_info("detected: %s\n", caps->desc); __set_bit(caps->capability, system_cpucaps); - if ((scope_mask & SCOPE_BOOT_CPU) && (caps->type & SCOPE_BOOT_CPU)) + if (boot_cpu && (caps->type & SCOPE_BOOT_CPU)) set_bit(caps->capability, boot_cpucaps); } } @@ -3021,13 +3552,9 @@ static void __init enable_cpu_capabilities(u16 scope_mask) boot_scope = !!(scope_mask & SCOPE_BOOT_CPU); for (i = 0; i < ARM64_NCAPS; i++) { - unsigned int num; - caps = cpucap_ptrs[i]; - if (!caps || !(caps->type & scope_mask)) - continue; - num = caps->capability; - if (!cpus_have_cap(num)) + if (!caps || !(caps->type & scope_mask) || + !cpus_have_cap(caps->capability)) continue; if (boot_scope && caps->cpu_enable) @@ -3145,36 +3672,28 @@ static void verify_local_elf_hwcaps(void) static void verify_sve_features(void) { - u64 safe_zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); - u64 zcr = read_zcr_features(); + unsigned long cpacr = cpacr_save_enable_kernel_sve(); - unsigned int safe_len = safe_zcr & ZCR_ELx_LEN_MASK; - unsigned int len = zcr & ZCR_ELx_LEN_MASK; - - if (len < safe_len || vec_verify_vq_map(ARM64_VEC_SVE)) { + if (vec_verify_vq_map(ARM64_VEC_SVE)) { pr_crit("CPU%d: SVE: vector length support mismatch\n", smp_processor_id()); cpu_die_early(); } - /* Add checks on other ZCR bits here if necessary */ + cpacr_restore(cpacr); } static void verify_sme_features(void) { - u64 safe_smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1); - u64 smcr = read_smcr_features(); - - unsigned int safe_len = safe_smcr & SMCR_ELx_LEN_MASK; - unsigned int len = smcr & SMCR_ELx_LEN_MASK; + unsigned long cpacr = cpacr_save_enable_kernel_sme(); - if (len < safe_len || vec_verify_vq_map(ARM64_VEC_SME)) { + if (vec_verify_vq_map(ARM64_VEC_SME)) { pr_crit("CPU%d: SME: vector length support mismatch\n", smp_processor_id()); cpu_die_early(); } - /* Add checks on other SMCR bits here if necessary */ + cpacr_restore(cpacr); } static void verify_hyp_capabilities(void) @@ -3187,7 +3706,7 @@ static void verify_hyp_capabilities(void) return; safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); /* Verify VMID bits */ @@ -3208,6 +3727,36 @@ static void verify_hyp_capabilities(void) } } +static void verify_mpam_capabilities(void) +{ + u64 cpu_idr = read_cpuid(ID_AA64PFR0_EL1); + u64 sys_idr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u16 cpu_partid_max, cpu_pmg_max, sys_partid_max, sys_pmg_max; + + if (FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, cpu_idr) != + FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, sys_idr)) { + pr_crit("CPU%d: MPAM version mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_idr = read_cpuid(MPAMIDR_EL1); + sys_idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + if (FIELD_GET(MPAMIDR_EL1_HAS_HCR, cpu_idr) != + FIELD_GET(MPAMIDR_EL1_HAS_HCR, sys_idr)) { + pr_crit("CPU%d: Missing MPAM HCR\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, cpu_idr); + cpu_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, cpu_idr); + sys_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, sys_idr); + sys_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, sys_idr); + if (cpu_partid_max < sys_partid_max || cpu_pmg_max < sys_pmg_max) { + pr_crit("CPU%d: MPAM PARTID/PMG max values are mismatched\n", smp_processor_id()); + cpu_die_early(); + } +} + /* * Run through the enabled system capabilities and enable() it on this CPU. * The capabilities were decided based on the available CPUs at the boot time. @@ -3234,6 +3783,9 @@ static void verify_local_cpu_capabilities(void) if (is_hyp_mode_available()) verify_hyp_capabilities(); + + if (system_supports_mpam()) + verify_mpam_capabilities(); } void check_local_cpu_capabilities(void) @@ -3256,14 +3808,6 @@ void check_local_cpu_capabilities(void) verify_local_cpu_capabilities(); } -static void __init setup_boot_cpu_capabilities(void) -{ - /* Detect capabilities with either SCOPE_BOOT_CPU or SCOPE_LOCAL_CPU */ - update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); - /* Enable the SCOPE_BOOT_CPU capabilities alone right away */ - enable_cpu_capabilities(SCOPE_BOOT_CPU); -} - bool this_cpu_has_cap(unsigned int n) { if (!WARN_ON(preemptible()) && n < ARM64_NCAPS) { @@ -3281,7 +3825,6 @@ EXPORT_SYMBOL_GPL(this_cpu_has_cap); * This helper function is used in a narrow window when, * - The system wide safe registers are set with all the SMP CPUs and, * - The SYSTEM_FEATURE system_cpucaps may not have been set. - * In all other cases cpus_have_{const_}cap() should be used. */ static bool __maybe_unused __system_matches_cap(unsigned int n) { @@ -3320,46 +3863,117 @@ unsigned long cpu_get_elf_hwcap2(void) return elf_hwcap[1]; } +unsigned long cpu_get_elf_hwcap3(void) +{ + return elf_hwcap[2]; +} + +static void __init setup_boot_cpu_capabilities(void) +{ + kvm_arm_target_impl_cpu_init(); + /* + * The boot CPU's feature register values have been recorded. Detect + * boot cpucaps and local cpucaps for the boot CPU, then enable and + * patch alternatives for the available boot cpucaps. + */ + update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); + enable_cpu_capabilities(SCOPE_BOOT_CPU); + apply_boot_alternatives(); +} + +void __init setup_boot_cpu_features(void) +{ + /* + * Initialize the indirect array of CPU capabilities pointers before we + * handle the boot CPU. + */ + init_cpucap_indirect_list(); + + /* + * Detect broken pseudo-NMI. Must be called _before_ the call to + * setup_boot_cpu_capabilities() since it interacts with + * can_use_gic_priorities(). + */ + detect_system_supports_pseudo_nmi(); + + setup_boot_cpu_capabilities(); +} + static void __init setup_system_capabilities(void) { /* - * We have finalised the system-wide safe feature - * registers, finalise the capabilities that depend - * on it. Also enable all the available capabilities, - * that are not enabled already. + * The system-wide safe feature register values have been finalized. + * Detect, enable, and patch alternatives for the available system + * cpucaps. */ update_cpu_capabilities(SCOPE_SYSTEM); enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU); -} + apply_alternatives_all(); -void __init setup_cpu_features(void) -{ - u32 cwg; + for (int i = 0; i < ARM64_NCAPS; i++) { + const struct arm64_cpu_capabilities *caps = cpucap_ptrs[i]; - setup_system_capabilities(); - setup_elf_hwcaps(arm64_elf_hwcaps); + if (!caps || !caps->desc) + continue; - if (system_supports_32bit_el0()) { - setup_elf_hwcaps(compat_elf_hwcaps); - elf_hwcap_fixup(); + /* + * Log any cpucaps with a cpumask as these aren't logged by + * update_cpu_capabilities(). + */ + if (caps->cpus && cpumask_any(caps->cpus) < nr_cpu_ids) + pr_info("detected: %s on CPU%*pbl\n", + caps->desc, cpumask_pr_args(caps->cpus)); + + /* Log match-all CPUs capabilities */ + if (cpucap_match_all_early_cpus(caps) && + cpus_have_cap(caps->capability)) + pr_info("detected: %s\n", caps->desc); } + /* + * TTBR0 PAN doesn't have its own cpucap, so log it manually. + */ if (system_uses_ttbr0_pan()) pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); + /* + * Report Spectre mitigations status. + */ + spectre_print_disabled_mitigations(); +} + +void __init setup_system_features(void) +{ + setup_system_capabilities(); + + linear_map_maybe_split_to_ptes(); + kpti_install_ng_mappings(); + sve_setup(); sme_setup(); - minsigstksz_setup(); /* * Check for sane CTR_EL0.CWG value. */ - cwg = cache_type_cwg(); - if (!cwg) + if (!cache_type_cwg()) pr_warn("No Cache Writeback Granule information, assuming %d\n", ARCH_DMA_MINALIGN); } +void __init setup_user_features(void) +{ + user_feature_fixup(); + + setup_elf_hwcaps(arm64_elf_hwcaps); + + if (system_supports_32bit_el0()) { + setup_elf_hwcaps(compat_elf_hwcaps); + elf_hwcap_fixup(); + } + + minsigstksz_setup(); +} + static int enable_mismatched_32bit_el0(unsigned int cpu) { /* @@ -3370,7 +3984,14 @@ static int enable_mismatched_32bit_el0(unsigned int cpu) static int lucky_winner = -1; struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); - bool cpu_32bit = id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0); + bool cpu_32bit = false; + + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + if (!housekeeping_cpu(cpu, HK_TYPE_TICK)) + pr_info("Treating adaptive-ticks CPU %u as 64-bit only\n", cpu); + else + cpu_32bit = true; + } if (cpu_32bit) { cpumask_set_cpu(cpu, cpu_32bit_el0_mask); @@ -3414,7 +4035,7 @@ subsys_initcall_sync(init_32bit_el0_mask); static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { - cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); + cpu_enable_swapper_cnp(); } /* diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c deleted file mode 100644 index f372295207fb..000000000000 --- a/arch/arm64/kernel/cpuidle.c +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * ARM64 CPU idle arch support - * - * Copyright (C) 2014 ARM Ltd. - * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> - */ - -#include <linux/acpi.h> -#include <linux/cpuidle.h> -#include <linux/cpu_pm.h> -#include <linux/psci.h> - -#ifdef CONFIG_ACPI_PROCESSOR_IDLE - -#include <acpi/processor.h> - -#define ARM64_LPI_IS_RETENTION_STATE(arch_flags) (!(arch_flags)) - -static int psci_acpi_cpu_init_idle(unsigned int cpu) -{ - int i, count; - struct acpi_lpi_state *lpi; - struct acpi_processor *pr = per_cpu(processors, cpu); - - if (unlikely(!pr || !pr->flags.has_lpi)) - return -EINVAL; - - /* - * If the PSCI cpu_suspend function hook has not been initialized - * idle states must not be enabled, so bail out - */ - if (!psci_ops.cpu_suspend) - return -EOPNOTSUPP; - - count = pr->power.count - 1; - if (count <= 0) - return -ENODEV; - - for (i = 0; i < count; i++) { - u32 state; - - lpi = &pr->power.lpi_states[i + 1]; - /* - * Only bits[31:0] represent a PSCI power_state while - * bits[63:32] must be 0x0 as per ARM ACPI FFH Specification - */ - state = lpi->address; - if (!psci_power_state_is_valid(state)) { - pr_warn("Invalid PSCI power state %#x\n", state); - return -EINVAL; - } - } - - return 0; -} - -int acpi_processor_ffh_lpi_probe(unsigned int cpu) -{ - return psci_acpi_cpu_init_idle(cpu); -} - -__cpuidle int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) -{ - u32 state = lpi->address; - - if (ARM64_LPI_IS_RETENTION_STATE(lpi->arch_flags)) - return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM_RCU(psci_cpu_suspend_enter, - lpi->index, state); - else - return CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(psci_cpu_suspend_enter, - lpi->index, state); -} -#endif diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 98fda8500535..c44e6d94f5de 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -36,8 +36,6 @@ static struct cpuinfo_arm64 boot_cpu_data; static inline const char *icache_policy_str(int l1ip) { switch (l1ip) { - case CTR_EL0_L1Ip_VPIPT: - return "VPIPT"; case CTR_EL0_L1Ip_VIPT: return "VIPT"; case CTR_EL0_L1Ip_PIPT: @@ -82,6 +80,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SB] = "sb", [KERNEL_HWCAP_PACA] = "paca", [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_GCS] = "gcs", [KERNEL_HWCAP_DCPODP] = "dcpodp", [KERNEL_HWCAP_SVE2] = "sve2", [KERNEL_HWCAP_SVEAES] = "sveaes", @@ -127,6 +126,43 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_F16F16] = "smef16f16", [KERNEL_HWCAP_MOPS] = "mops", [KERNEL_HWCAP_HBC] = "hbc", + [KERNEL_HWCAP_SVE_B16B16] = "sveb16b16", + [KERNEL_HWCAP_LRCPC3] = "lrcpc3", + [KERNEL_HWCAP_LSE128] = "lse128", + [KERNEL_HWCAP_FPMR] = "fpmr", + [KERNEL_HWCAP_LUT] = "lut", + [KERNEL_HWCAP_FAMINMAX] = "faminmax", + [KERNEL_HWCAP_F8CVT] = "f8cvt", + [KERNEL_HWCAP_F8FMA] = "f8fma", + [KERNEL_HWCAP_F8DP4] = "f8dp4", + [KERNEL_HWCAP_F8DP2] = "f8dp2", + [KERNEL_HWCAP_F8E4M3] = "f8e4m3", + [KERNEL_HWCAP_F8E5M2] = "f8e5m2", + [KERNEL_HWCAP_SME_LUTV2] = "smelutv2", + [KERNEL_HWCAP_SME_F8F16] = "smef8f16", + [KERNEL_HWCAP_SME_F8F32] = "smef8f32", + [KERNEL_HWCAP_SME_SF8FMA] = "smesf8fma", + [KERNEL_HWCAP_SME_SF8DP4] = "smesf8dp4", + [KERNEL_HWCAP_SME_SF8DP2] = "smesf8dp2", + [KERNEL_HWCAP_POE] = "poe", + [KERNEL_HWCAP_CMPBR] = "cmpbr", + [KERNEL_HWCAP_FPRCVT] = "fprcvt", + [KERNEL_HWCAP_F8MM8] = "f8mm8", + [KERNEL_HWCAP_F8MM4] = "f8mm4", + [KERNEL_HWCAP_SVE_F16MM] = "svef16mm", + [KERNEL_HWCAP_SVE_ELTPERM] = "sveeltperm", + [KERNEL_HWCAP_SVE_AES2] = "sveaes2", + [KERNEL_HWCAP_SVE_BFSCALE] = "svebfscale", + [KERNEL_HWCAP_SVE2P2] = "sve2p2", + [KERNEL_HWCAP_SME2P2] = "sme2p2", + [KERNEL_HWCAP_SME_SBITPERM] = "smesbitperm", + [KERNEL_HWCAP_SME_AES] = "smeaes", + [KERNEL_HWCAP_SME_SFEXPA] = "smesfexpa", + [KERNEL_HWCAP_SME_STMOP] = "smestmop", + [KERNEL_HWCAP_SME_SMOP4] = "smesmop4", + [KERNEL_HWCAP_MTE_FAR] = "mtefar", + [KERNEL_HWCAP_MTE_STORE_ONLY] = "mtestoreonly", + [KERNEL_HWCAP_LSFE] = "lsfe", }; #ifdef CONFIG_COMPAT @@ -176,80 +212,79 @@ static const char *const compat_hwcap2_str[] = { static int c_show(struct seq_file *m, void *v) { - int i, j; + int j; + int cpu = m->index; bool compat = personality(current->personality) == PER_LINUX32; + struct cpuinfo_arm64 *cpuinfo = v; + u32 midr = cpuinfo->reg_midr; - for_each_online_cpu(i) { - struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i); - u32 midr = cpuinfo->reg_midr; - - /* - * glibc reads /proc/cpuinfo to determine the number of - * online processors, looking for lines beginning with - * "processor". Give glibc what it expects. - */ - seq_printf(m, "processor\t: %d\n", i); - if (compat) - seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", - MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); + /* + * glibc reads /proc/cpuinfo to determine the number of + * online processors, looking for lines beginning with + * "processor". Give glibc what it expects. + */ + seq_printf(m, "processor\t: %d\n", cpu); + if (compat) + seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", + MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); - seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", - loops_per_jiffy / (500000UL/HZ), - loops_per_jiffy / (5000UL/HZ) % 100); + seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", + loops_per_jiffy / (500000UL/HZ), + loops_per_jiffy / (5000UL/HZ) % 100); - /* - * Dump out the common processor features in a single line. - * Userspace should read the hwcaps with getauxval(AT_HWCAP) - * rather than attempting to parse this, but there's a body of - * software which does already (at least for 32-bit). - */ - seq_puts(m, "Features\t:"); - if (compat) { + /* + * Dump out the common processor features in a single line. + * Userspace should read the hwcaps with getauxval(AT_HWCAP) + * rather than attempting to parse this, but there's a body of + * software which does already (at least for 32-bit). + */ + seq_puts(m, "Features\t:"); + if (compat) { #ifdef CONFIG_COMPAT - for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { - if (compat_elf_hwcap & (1 << j)) { - /* - * Warn once if any feature should not - * have been present on arm64 platform. - */ - if (WARN_ON_ONCE(!compat_hwcap_str[j])) - continue; - - seq_printf(m, " %s", compat_hwcap_str[j]); - } + for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { + if (compat_elf_hwcap & (1 << j)) { + /* + * Warn once if any feature should not + * have been present on arm64 platform. + */ + if (WARN_ON_ONCE(!compat_hwcap_str[j])) + continue; + + seq_printf(m, " %s", compat_hwcap_str[j]); } + } - for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) - if (compat_elf_hwcap2 & (1 << j)) - seq_printf(m, " %s", compat_hwcap2_str[j]); + for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) + if (compat_elf_hwcap2 & (1 << j)) + seq_printf(m, " %s", compat_hwcap2_str[j]); #endif /* CONFIG_COMPAT */ - } else { - for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) - if (cpu_have_feature(j)) - seq_printf(m, " %s", hwcap_str[j]); - } - seq_puts(m, "\n"); - - seq_printf(m, "CPU implementer\t: 0x%02x\n", - MIDR_IMPLEMENTOR(midr)); - seq_printf(m, "CPU architecture: 8\n"); - seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); - seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); - seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); + } else { + for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) + if (cpu_have_feature(j)) + seq_printf(m, " %s", hwcap_str[j]); } + seq_puts(m, "\n"); + + seq_printf(m, "CPU implementer\t: 0x%02x\n", + MIDR_IMPLEMENTOR(midr)); + seq_puts(m, "CPU architecture: 8\n"); + seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); + seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); + seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < 1 ? (void *)1 : NULL; + *pos = cpumask_next(*pos - 1, cpu_online_mask); + return *pos < nr_cpu_ids ? &per_cpu(cpu_data, *pos) : NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { ++*pos; - return NULL; + return c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) @@ -264,7 +299,7 @@ const struct seq_operations cpuinfo_op = { }; -static struct kobj_type cpuregs_kobj_type = { +static const struct kobj_type cpuregs_kobj_type = { .sysfs_ops = &kobj_sysfs_ops, }; @@ -295,11 +330,13 @@ static struct kobj_type cpuregs_kobj_type = { CPUREGS_ATTR_RO(midr_el1, midr); CPUREGS_ATTR_RO(revidr_el1, revidr); +CPUREGS_ATTR_RO(aidr_el1, aidr); CPUREGS_ATTR_RO(smidr_el1, smidr); static struct attribute *cpuregs_id_attrs[] = { &cpuregs_attr_midr_el1.attr, &cpuregs_attr_revidr_el1.attr, + &cpuregs_attr_aidr_el1.attr, NULL }; @@ -385,9 +422,6 @@ static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) switch (l1ip) { case CTR_EL0_L1Ip_PIPT: break; - case CTR_EL0_L1Ip_VPIPT: - set_bit(ICACHEF_VPIPT, &__icache_flags); - break; case CTR_EL0_L1Ip_VIPT: default: /* Assume aliasing */ @@ -439,20 +473,25 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_dczid = read_cpuid(DCZID_EL0); info->reg_midr = read_cpuid_id(); info->reg_revidr = read_cpuid(REVIDR_EL1); + info->reg_aidr = read_cpuid(AIDR_EL1); info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1); info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); info->reg_id_aa64isar2 = read_cpuid(ID_AA64ISAR2_EL1); + info->reg_id_aa64isar3 = read_cpuid(ID_AA64ISAR3_EL1); info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); info->reg_id_aa64mmfr3 = read_cpuid(ID_AA64MMFR3_EL1); + info->reg_id_aa64mmfr4 = read_cpuid(ID_AA64MMFR4_EL1); info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); + info->reg_id_aa64pfr2 = read_cpuid(ID_AA64PFR2_EL1); info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); info->reg_id_aa64smfr0 = read_cpuid(ID_AA64SMFR0_EL1); + info->reg_id_aa64fpfr0 = read_cpuid(ID_AA64FPFR0_EL1); if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) info->reg_gmid = read_cpuid(GMID_EL1); @@ -460,6 +499,22 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); + /* + * info->reg_mpamidr deferred to {init,update}_cpu_features because we + * don't want to read it (and trigger a trap on buggy firmware) if + * using an aa64pfr0_el1 override to unconditionally disable MPAM. + */ + + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { + /* + * We mask out SMPS since even if the hardware + * supports priorities the kernel does not at present + * and we block access to them. + */ + info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; + } + cpuinfo_detect_icache_policy(info); } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 64f2ecbdfe5c..29307642f4c9 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -21,8 +21,12 @@ #include <asm/cputype.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> +#include <asm/exception.h> +#include <asm/kgdb.h> +#include <asm/kprobes.h> #include <asm/system_misc.h> #include <asm/traps.h> +#include <asm/uprobes.h> /* Determine debug architecture. */ u8 debug_monitors_arch(void) @@ -34,7 +38,7 @@ u8 debug_monitors_arch(void) /* * MDSCR access routines. */ -static void mdscr_write(u32 mdscr) +static void mdscr_write(u64 mdscr) { unsigned long flags; flags = local_daif_save(); @@ -43,7 +47,7 @@ static void mdscr_write(u32 mdscr) } NOKPROBE_SYMBOL(mdscr_write); -static u32 mdscr_read(void) +static u64 mdscr_read(void) { return read_sysreg(mdscr_el1); } @@ -79,16 +83,16 @@ static DEFINE_PER_CPU(int, kde_ref_count); void enable_debug_monitors(enum dbg_active_el el) { - u32 mdscr, enable = 0; + u64 mdscr, enable = 0; WARN_ON(preemptible()); if (this_cpu_inc_return(mde_ref_count) == 1) - enable = DBG_MDSCR_MDE; + enable = MDSCR_EL1_MDE; if (el == DBG_ACTIVE_EL1 && this_cpu_inc_return(kde_ref_count) == 1) - enable |= DBG_MDSCR_KDE; + enable |= MDSCR_EL1_KDE; if (enable && debug_enabled) { mdscr = mdscr_read(); @@ -100,16 +104,16 @@ NOKPROBE_SYMBOL(enable_debug_monitors); void disable_debug_monitors(enum dbg_active_el el) { - u32 mdscr, disable = 0; + u64 mdscr, disable = 0; WARN_ON(preemptible()); if (this_cpu_dec_return(mde_ref_count) == 0) - disable = ~DBG_MDSCR_MDE; + disable = ~MDSCR_EL1_MDE; if (el == DBG_ACTIVE_EL1 && this_cpu_dec_return(kde_ref_count) == 0) - disable &= ~DBG_MDSCR_KDE; + disable &= ~MDSCR_EL1_KDE; if (disable) { mdscr = mdscr_read(); @@ -156,190 +160,124 @@ NOKPROBE_SYMBOL(clear_user_regs_spsr_ss); #define set_regs_spsr_ss(r) set_user_regs_spsr_ss(&(r)->user_regs) #define clear_regs_spsr_ss(r) clear_user_regs_spsr_ss(&(r)->user_regs) -static DEFINE_SPINLOCK(debug_hook_lock); -static LIST_HEAD(user_step_hook); -static LIST_HEAD(kernel_step_hook); - -static void register_debug_hook(struct list_head *node, struct list_head *list) +static void send_user_sigtrap(int si_code) { - spin_lock(&debug_hook_lock); - list_add_rcu(node, list); - spin_unlock(&debug_hook_lock); - -} + struct pt_regs *regs = current_pt_regs(); -static void unregister_debug_hook(struct list_head *node) -{ - spin_lock(&debug_hook_lock); - list_del_rcu(node); - spin_unlock(&debug_hook_lock); - synchronize_rcu(); -} + if (WARN_ON(!user_mode(regs))) + return; -void register_user_step_hook(struct step_hook *hook) -{ - register_debug_hook(&hook->node, &user_step_hook); -} + if (!regs_irqs_disabled(regs)) + local_irq_enable(); -void unregister_user_step_hook(struct step_hook *hook) -{ - unregister_debug_hook(&hook->node); + arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), + "User debug trap"); } -void register_kernel_step_hook(struct step_hook *hook) +/* + * We have already unmasked interrupts and enabled preemption + * when calling do_el0_softstep() from entry-common.c. + */ +void do_el0_softstep(unsigned long esr, struct pt_regs *regs) { - register_debug_hook(&hook->node, &kernel_step_hook); -} + if (uprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) + return; -void unregister_kernel_step_hook(struct step_hook *hook) -{ - unregister_debug_hook(&hook->node); + send_user_sigtrap(TRAP_TRACE); + /* + * ptrace will disable single step unless explicitly + * asked to re-enable it. For other clients, it makes + * sense to leave it enabled (i.e. rewind the controls + * to the active-not-pending state). + */ + user_rewind_single_step(current); } -/* - * Call registered single step handlers - * There is no Syndrome info to check for determining the handler. - * So we call all the registered handlers, until the right handler is - * found which returns zero. - */ -static int call_step_hook(struct pt_regs *regs, unsigned long esr) +void do_el1_softstep(unsigned long esr, struct pt_regs *regs) { - struct step_hook *hook; - struct list_head *list; - int retval = DBG_HOOK_ERROR; - - list = user_mode(regs) ? &user_step_hook : &kernel_step_hook; + if (kgdb_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) + return; + pr_warn("Unexpected kernel single-step exception at EL1\n"); /* - * Since single-step exception disables interrupt, this function is - * entirely not preemptible, and we can use rcu list safely here. + * Re-enable stepping since we know that we will be + * returning to regs. */ - list_for_each_entry_rcu(hook, list, node) { - retval = hook->fn(regs, esr); - if (retval == DBG_HOOK_HANDLED) - break; - } - - return retval; + set_regs_spsr_ss(regs); } -NOKPROBE_SYMBOL(call_step_hook); +NOKPROBE_SYMBOL(do_el1_softstep); -static void send_user_sigtrap(int si_code) +static int call_el1_break_hook(struct pt_regs *regs, unsigned long esr) { - struct pt_regs *regs = current_pt_regs(); + if (esr_brk_comment(esr) == BUG_BRK_IMM) + return bug_brk_handler(regs, esr); - if (WARN_ON(!user_mode(regs))) - return; + if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) + return cfi_brk_handler(regs, esr); - if (interrupts_enabled(regs)) - local_irq_enable(); + if (esr_brk_comment(esr) == FAULT_BRK_IMM) + return reserved_fault_brk_handler(regs, esr); - arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), - "User debug trap"); -} + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) && + (esr_brk_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) + return kasan_brk_handler(regs, esr); -static int single_step_handler(unsigned long unused, unsigned long esr, - struct pt_regs *regs) -{ - bool handler_found = false; + if (IS_ENABLED(CONFIG_UBSAN_TRAP) && esr_is_ubsan_brk(esr)) + return ubsan_brk_handler(regs, esr); - /* - * If we are stepping a pending breakpoint, call the hw_breakpoint - * handler first. - */ - if (!reinstall_suspended_bps(regs)) - return 0; - - if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; - - if (!handler_found && user_mode(regs)) { - send_user_sigtrap(TRAP_TRACE); - - /* - * ptrace will disable single step unless explicitly - * asked to re-enable it. For other clients, it makes - * sense to leave it enabled (i.e. rewind the controls - * to the active-not-pending state). - */ - user_rewind_single_step(current); - } else if (!handler_found) { - pr_warn("Unexpected kernel single-step exception at EL1\n"); - /* - * Re-enable stepping since we know that we will be - * returning to regs. - */ - set_regs_spsr_ss(regs); + if (IS_ENABLED(CONFIG_KGDB)) { + if (esr_brk_comment(esr) == KGDB_DYN_DBG_BRK_IMM) + return kgdb_brk_handler(regs, esr); + if (esr_brk_comment(esr) == KGDB_COMPILED_DBG_BRK_IMM) + return kgdb_compiled_brk_handler(regs, esr); } - return 0; -} -NOKPROBE_SYMBOL(single_step_handler); - -static LIST_HEAD(user_break_hook); -static LIST_HEAD(kernel_break_hook); + if (IS_ENABLED(CONFIG_KPROBES)) { + if (esr_brk_comment(esr) == KPROBES_BRK_IMM) + return kprobe_brk_handler(regs, esr); + if (esr_brk_comment(esr) == KPROBES_BRK_SS_IMM) + return kprobe_ss_brk_handler(regs, esr); + } -void register_user_break_hook(struct break_hook *hook) -{ - register_debug_hook(&hook->node, &user_break_hook); -} + if (IS_ENABLED(CONFIG_KRETPROBES) && + esr_brk_comment(esr) == KRETPROBES_BRK_IMM) + return kretprobe_brk_handler(regs, esr); -void unregister_user_break_hook(struct break_hook *hook) -{ - unregister_debug_hook(&hook->node); + return DBG_HOOK_ERROR; } +NOKPROBE_SYMBOL(call_el1_break_hook); -void register_kernel_break_hook(struct break_hook *hook) +/* + * We have already unmasked interrupts and enabled preemption + * when calling do_el0_brk64() from entry-common.c. + */ +void do_el0_brk64(unsigned long esr, struct pt_regs *regs) { - register_debug_hook(&hook->node, &kernel_break_hook); -} + if (IS_ENABLED(CONFIG_UPROBES) && + esr_brk_comment(esr) == UPROBES_BRK_IMM && + uprobe_brk_handler(regs, esr) == DBG_HOOK_HANDLED) + return; -void unregister_kernel_break_hook(struct break_hook *hook) -{ - unregister_debug_hook(&hook->node); + send_user_sigtrap(TRAP_BRKPT); } -static int call_break_hook(struct pt_regs *regs, unsigned long esr) +void do_el1_brk64(unsigned long esr, struct pt_regs *regs) { - struct break_hook *hook; - struct list_head *list; - int (*fn)(struct pt_regs *regs, unsigned long esr) = NULL; - - list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; - - /* - * Since brk exception disables interrupt, this function is - * entirely not preemptible, and we can use rcu list safely here. - */ - list_for_each_entry_rcu(hook, list, node) { - unsigned long comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; - - if ((comment & ~hook->mask) == hook->imm) - fn = hook->fn; - } + if (call_el1_break_hook(regs, esr) == DBG_HOOK_HANDLED) + return; - return fn ? fn(regs, esr) : DBG_HOOK_ERROR; + die("Oops - BRK", regs, esr); } -NOKPROBE_SYMBOL(call_break_hook); +NOKPROBE_SYMBOL(do_el1_brk64); -static int brk_handler(unsigned long unused, unsigned long esr, - struct pt_regs *regs) +#ifdef CONFIG_COMPAT +void do_bkpt32(unsigned long esr, struct pt_regs *regs) { - if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED) - return 0; - - if (user_mode(regs)) { - send_user_sigtrap(TRAP_BRKPT); - } else { - pr_warn("Unexpected kernel BRK exception at EL1\n"); - return -EFAULT; - } - - return 0; + arm64_notify_die("aarch32 BKPT", regs, SIGTRAP, TRAP_BRKPT, regs->pc, esr); } -NOKPROBE_SYMBOL(brk_handler); +#endif /* CONFIG_COMPAT */ -int aarch32_break_handler(struct pt_regs *regs) +bool try_handle_aarch32_break(struct pt_regs *regs) { u32 arm_instr; u16 thumb_instr; @@ -347,7 +285,7 @@ int aarch32_break_handler(struct pt_regs *regs) void __user *pc = (void __user *)instruction_pointer(regs); if (!compat_user_mode(regs)) - return -EFAULT; + return false; if (compat_thumb_mode(regs)) { /* get 16-bit Thumb instruction */ @@ -371,20 +309,12 @@ int aarch32_break_handler(struct pt_regs *regs) } if (!bp) - return -EFAULT; + return false; send_user_sigtrap(TRAP_BRKPT); - return 0; -} -NOKPROBE_SYMBOL(aarch32_break_handler); - -void __init debug_traps_init(void) -{ - hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP, - TRAP_TRACE, "single-step handler"); - hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP, - TRAP_BRKPT, "BRK handler"); + return true; } +NOKPROBE_SYMBOL(try_handle_aarch32_break); /* Re-enable single step for syscall restarting. */ void user_rewind_single_step(struct task_struct *task) @@ -418,7 +348,7 @@ void kernel_enable_single_step(struct pt_regs *regs) { WARN_ON(!irqs_disabled()); set_regs_spsr_ss(regs); - mdscr_write(mdscr_read() | DBG_MDSCR_SS); + mdscr_write(mdscr_read() | MDSCR_EL1_SS); enable_debug_monitors(DBG_ACTIVE_EL1); } NOKPROBE_SYMBOL(kernel_enable_single_step); @@ -426,7 +356,7 @@ NOKPROBE_SYMBOL(kernel_enable_single_step); void kernel_disable_single_step(void) { WARN_ON(!irqs_disabled()); - mdscr_write(mdscr_read() & ~DBG_MDSCR_SS); + mdscr_write(mdscr_read() & ~MDSCR_EL1_SS); disable_debug_monitors(DBG_ACTIVE_EL1); } NOKPROBE_SYMBOL(kernel_disable_single_step); @@ -434,7 +364,7 @@ NOKPROBE_SYMBOL(kernel_disable_single_step); int kernel_active_single_step(void) { WARN_ON(!irqs_disabled()); - return mdscr_read() & DBG_MDSCR_SS; + return mdscr_read() & MDSCR_EL1_SS; } NOKPROBE_SYMBOL(kernel_active_single_step); @@ -443,6 +373,11 @@ void kernel_rewind_single_step(struct pt_regs *regs) set_regs_spsr_ss(regs); } +void kernel_fastforward_single_step(struct pt_regs *regs) +{ + clear_regs_spsr_ss(regs); +} + /* ptrace API */ void user_enable_single_step(struct task_struct *task) { diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index 11d7f7de202d..329e8df9215f 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -28,7 +28,7 @@ .macro __EFI_PE_HEADER #ifdef CONFIG_EFI .set .Lpe_header_offset, . - .L_head - .long PE_MAGIC + .long IMAGE_NT_SIGNATURE .short IMAGE_FILE_MACHINE_ARM64 // Machine .short .Lsection_count // NumberOfSections .long 0 // TimeDateStamp @@ -40,7 +40,7 @@ IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics .Loptional_header: - .short PE_OPT_MAGIC_PE32PLUS // PE32+ format + .short IMAGE_NT_OPTIONAL_HDR64_MAGIC // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion .long __initdata_begin - .Lefi_header_end // SizeOfCode @@ -66,7 +66,7 @@ .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem - .short IMAGE_DLL_CHARACTERISTICS_NX_COMPAT // DllCharacteristics + .short IMAGE_DLLCHARACTERISTICS_NX_COMPAT // DllCharacteristics .quad 0 // SizeOfStackReserve .quad 0 // SizeOfStackCommit .quad 0 // SizeOfHeapReserve diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 2b478ca356b0..a81cb4aa4738 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -9,10 +9,14 @@ #include <linux/efi.h> #include <linux/init.h> +#include <linux/kmemleak.h> +#include <linux/kthread.h> #include <linux/screen_info.h> +#include <linux/vmalloc.h> #include <asm/efi.h> #include <asm/stacktrace.h> +#include <asm/vmap_stack.h> static bool region_is_misaligned(const efi_memory_desc_t *md) { @@ -27,13 +31,21 @@ static bool region_is_misaligned(const efi_memory_desc_t *md) * executable, everything else can be mapped with the XN bits * set. Also take the new (optional) RO/XP bits into account. */ -static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) +static __init ptdesc_t create_mapping_protection(efi_memory_desc_t *md) { u64 attr = md->attribute; u32 type = md->type; - if (type == EFI_MEMORY_MAPPED_IO) - return PROT_DEVICE_nGnRE; + if (type == EFI_MEMORY_MAPPED_IO) { + pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE); + + if (arm64_is_protected_mmio(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT)) + prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); + return pgprot_val(prot); + } if (region_is_misaligned(md)) { static bool __initdata code_is_misaligned; @@ -71,13 +83,9 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) return pgprot_val(PAGE_KERNEL_EXEC); } -/* we will fill this structure from the stub, so don't put it in .bss */ -struct screen_info screen_info __section(".data"); -EXPORT_SYMBOL(screen_info); - int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { - pteval_t prot_val = create_mapping_protection(md); + ptdesc_t prot_val = create_mapping_protection(md); bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_RUNTIME_SERVICES_DATA); @@ -107,16 +115,15 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = __ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); if (md->attribute & EFI_MEMORY_XP) pte = set_pte_bit(pte, __pgprot(PTE_PXN)); - else if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && - system_supports_bti() && spd->has_bti) + else if (system_supports_bti_kernel() && spd->has_bti) pte = set_pte_bit(pte, __pgprot(PTE_GP)); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } @@ -159,20 +166,53 @@ asmlinkage efi_status_t efi_handle_corrupted_x18(efi_status_t s, const char *f) return s; } -static DEFINE_RAW_SPINLOCK(efi_rt_lock); - void arch_efi_call_virt_setup(void) { - efi_virtmap_load(); + efi_runtime_assert_lock_held(); + + if (preemptible() && (current->flags & PF_KTHREAD)) { + /* + * Disable migration to ensure that a preempted EFI runtime + * service call will be resumed on the same CPU. This avoids + * potential issues with EFI runtime calls that are preempted + * while polling for an asynchronous completion of a secure + * firmware call, which may not permit the CPU to change. + */ + migrate_disable(); + kthread_use_mm(&efi_mm); + } else { + efi_virtmap_load(); + } + + /* + * Enable access to the valid TTBR0_EL1 and invoke the errata + * workaround directly since there is no return from exception when + * invoking the EFI run-time services. + */ + uaccess_ttbr0_enable(); + post_ttbr_update_workaround(); + __efi_fpsimd_begin(); - raw_spin_lock(&efi_rt_lock); } void arch_efi_call_virt_teardown(void) { - raw_spin_unlock(&efi_rt_lock); __efi_fpsimd_end(); - efi_virtmap_unload(); + + /* + * Defer the switch to the current thread's TTBR0_EL1 until + * uaccess_enable(). Do so before efi_virtmap_unload() updates the + * saved TTBR0 value, so the userland page tables are not activated + * inadvertently over the back of an exception. + */ + uaccess_ttbr0_disable(); + + if (preemptible() && (current->flags & PF_KTHREAD)) { + kthread_unuse_mm(&efi_mm); + migrate_enable(); + } else { + efi_virtmap_unload(); + } } asmlinkage u64 *efi_rt_stack_top __ro_after_init; @@ -209,14 +249,14 @@ static int __init arm64_efi_rt_init(void) if (!efi_enabled(EFI_RUNTIME_SERVICES)) return 0; - p = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, GFP_KERNEL, - NUMA_NO_NODE, &&l); -l: if (!p) { + p = arch_alloc_vmap_stack(THREAD_SIZE, NUMA_NO_NODE); + if (!p) { pr_warn("Failed to allocate EFI runtime stack\n"); clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return -ENOMEM; } + kmemleak_not_leak(p); efi_rt_stack_top = p + THREAD_SIZE; return 0; } diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 2e94d20c4ac7..b735f4c2fe5e 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -27,9 +27,10 @@ static int mte_dump_tag_range(struct coredump_params *cprm, int ret = 1; unsigned long addr; void *tags = NULL; + int locked = 0; for (addr = start; addr < start + len; addr += PAGE_SIZE) { - struct page *page = get_dump_page(addr); + struct page *page = get_dump_page(addr, &locked); /* * get_dump_page() returns NULL when encountering an empty diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 0fc94207e69a..3625797e9ee8 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -6,10 +6,13 @@ */ #include <linux/context_tracking.h> +#include <linux/irq-entry-common.h> #include <linux/kasan.h> #include <linux/linkage.h> +#include <linux/livepatch.h> #include <linux/lockdep.h> #include <linux/ptrace.h> +#include <linux/resume_user_mode.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/thread_info.h> @@ -31,67 +34,28 @@ * Handle IRQ/context state management when entering from kernel mode. * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. - * - * This is intended to match the logic in irqentry_enter(), handling the kernel - * mode transitions only. */ -static __always_inline void __enter_from_kernel_mode(struct pt_regs *regs) +static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) { - regs->exit_rcu = false; - - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { - lockdep_hardirqs_off(CALLER_ADDR0); - ct_irq_enter(); - trace_hardirqs_off_finish(); - - regs->exit_rcu = true; - return; - } - - lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter_check_tick(); - trace_hardirqs_off_finish(); -} + irqentry_state_t state; -static void noinstr enter_from_kernel_mode(struct pt_regs *regs) -{ - __enter_from_kernel_mode(regs); + state = irqentry_enter(regs); mte_check_tfsr_entry(); mte_disable_tco_entry(current); + + return state; } /* * Handle IRQ/context state management when exiting to kernel mode. * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. - * - * This is intended to match the logic in irqentry_exit(), handling the kernel - * mode transitions only, and with preemption handled elsewhere. */ -static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) -{ - lockdep_assert_irqs_disabled(); - - if (interrupts_enabled(regs)) { - if (regs->exit_rcu) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - ct_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); - return; - } - - trace_hardirqs_on(); - } else { - if (regs->exit_rcu) - ct_irq_exit(); - } -} - -static void noinstr exit_to_kernel_mode(struct pt_regs *regs) +static void noinstr exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) { mte_check_tfsr_exit(); - __exit_to_kernel_mode(regs); + irqentry_exit(regs, state); } /* @@ -99,96 +63,30 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs) * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static __always_inline void __enter_from_user_mode(void) +static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) { - lockdep_hardirqs_off(CALLER_ADDR0); - CT_WARN_ON(ct_state() != CONTEXT_USER); - user_exit_irqoff(); - trace_hardirqs_off_finish(); + enter_from_user_mode(regs); mte_disable_tco_entry(current); } -static __always_inline void enter_from_user_mode(struct pt_regs *regs) -{ - __enter_from_user_mode(); -} - /* * Handle IRQ/context state management when exiting to user mode. * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static __always_inline void __exit_to_user_mode(void) -{ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - user_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); -} -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { - unsigned long flags; - + local_irq_disable(); + exit_to_user_mode_prepare_legacy(regs); local_daif_mask(); - - flags = read_thread_flags(); - if (unlikely(flags & _TIF_WORK_MASK)) - do_notify_resume(regs, flags); - - lockdep_sys_exit(); -} - -static __always_inline void exit_to_user_mode(struct pt_regs *regs) -{ - exit_to_user_mode_prepare(regs); mte_check_tfsr_exit(); - __exit_to_user_mode(); + exit_to_user_mode(); } asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) { - exit_to_user_mode(regs); -} - -/* - * Handle IRQ/context state management when entering an NMI from user/kernel - * mode. Before this function is called it is not safe to call regular kernel - * code, instrumentable code, or any code which may trigger an exception. - */ -static void noinstr arm64_enter_nmi(struct pt_regs *regs) -{ - regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - ct_nmi_enter(); - - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); -} - -/* - * Handle IRQ/context state management when exiting an NMI from user/kernel - * mode. After this function returns it is not safe to call regular kernel - * code, instrumentable code, or any code which may trigger an exception. - */ -static void noinstr arm64_exit_nmi(struct pt_regs *regs) -{ - bool restore = regs->lockdep_hardirqs; - - ftrace_nmi_exit(); - if (restore) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - } - - ct_nmi_exit(); - lockdep_hardirq_exit(); - if (restore) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); + arm64_exit_to_user_mode(regs); } /* @@ -196,14 +94,18 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs) * kernel mode. Before this function is called it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) +static noinstr irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) { - regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + irqentry_state_t state; + + state.lockdep = lockdep_hardirqs_enabled(); lockdep_hardirqs_off(CALLER_ADDR0); ct_nmi_enter(); trace_hardirqs_off_finish(); + + return state; } /* @@ -211,62 +113,19 @@ static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) * kernel mode. After this function returns it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) +static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, + irqentry_state_t state) { - bool restore = regs->lockdep_hardirqs; - - if (restore) { + if (state.lockdep) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); } ct_nmi_exit(); - if (restore) + if (state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); } -#ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -#define need_irq_preemption() \ - (static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) -#else -#define need_irq_preemption() (IS_ENABLED(CONFIG_PREEMPTION)) -#endif - -static void __sched arm64_preempt_schedule_irq(void) -{ - if (!need_irq_preemption()) - return; - - /* - * Note: thread_info::preempt_count includes both thread_info::count - * and thread_info::need_resched, and is not equivalent to - * preempt_count(). - */ - if (READ_ONCE(current_thread_info()->preempt_count) != 0) - return; - - /* - * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC - * priority masking is used the GIC irqchip driver will clear DAIF.IF - * using gic_arch_enable_irqs() for normal IRQs. If anything is set in - * DAIF we must have handled an NMI, so skip preemption. - */ - if (system_uses_irq_prio_masking() && read_sysreg(daif)) - return; - - /* - * Preempting a task from an IRQ means we leave copies of PSTATE - * on the stack. cpufeature's enable calls may modify PSTATE, but - * resuming one of these preempted tasks would undo those changes. - * - * Only allow a task to be preempted once cpufeatures have been - * enabled. - */ - if (system_capabilities_finalized()) - preempt_schedule_irq(); -} - static void do_interrupt_handler(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { @@ -286,7 +145,7 @@ extern void (*handle_arch_fiq)(struct pt_regs *); static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, unsigned long esr) { - arm64_enter_nmi(regs); + irqentry_nmi_enter(regs); console_verbose(); @@ -310,7 +169,7 @@ static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); static void cortex_a76_erratum_1463225_svc_handler(void) { - u32 reg, val; + u64 reg, val; if (!unlikely(test_thread_flag(TIF_SINGLESTEP))) return; @@ -320,7 +179,7 @@ static void cortex_a76_erratum_1463225_svc_handler(void) __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 1); reg = read_sysreg(mdscr_el1); - val = reg | DBG_MDSCR_SS | DBG_MDSCR_KDE; + val = reg | MDSCR_EL1_SS | MDSCR_EL1_KDE; write_sysreg(val, mdscr_el1); asm volatile("msr daifclr, #8"); isb(); @@ -359,20 +218,16 @@ static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) * As per the ABI exit SME streaming mode and clear the SVE state not * shared with FPSIMD on syscall entry. */ -static inline void fp_user_discard(void) +static inline void fpsimd_syscall_enter(void) { - /* - * If SME is active then exit streaming mode. If ZA is active - * then flush the SVE registers but leave userspace access to - * both SVE and SME enabled, otherwise disable SME for the - * task and fall through to disabling SVE too. This means - * that after a syscall we never have any streaming mode - * register state to track, if this changes the KVM code will - * need updating. - */ + /* Ensure PSTATE.SM is clear, but leave PSTATE.ZA as-is. */ if (system_supports_sme()) sme_smstop_sm(); + /* + * The CPU is not in streaming mode. If non-streaming SVE is not + * supported, there is no SVE state that needs to be discarded. + */ if (!system_supports_sve()) return; @@ -382,7 +237,56 @@ static inline void fp_user_discard(void) sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1; sve_flush_live(true, sve_vq_minus_one); } + + /* + * Any live non-FPSIMD SVE state has been zeroed. Allow + * fpsimd_save_user_state() to lazily discard SVE state until either + * the live state is unbound or fpsimd_syscall_exit() is called. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_FPSIMD); +} + +static __always_inline void fpsimd_syscall_exit(void) +{ + if (!system_supports_sve()) + return; + + /* + * The current task's user FPSIMD/SVE/SME state is now bound to this + * CPU. The fpsimd_last_state.to_save value is either: + * + * - FP_STATE_FPSIMD, if the state has not been reloaded on this CPU + * since fpsimd_syscall_enter(). + * + * - FP_STATE_CURRENT, if the state has been reloaded on this CPU at + * any point. + * + * Reset this to FP_STATE_CURRENT to stop lazy discarding. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_CURRENT); +} + +/* + * In debug exception context, we explicitly disable preemption despite + * having interrupts disabled. + * This serves two purposes: it makes it much less likely that we would + * accidentally schedule in exception context and it will force a warning + * if we somehow manage to schedule by accident. + */ +static void debug_exception_enter(struct pt_regs *regs) +{ + preempt_disable(); + + /* This code is a bit fragile. Test it. */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work"); } +NOKPROBE_SYMBOL(debug_exception_enter); + +static void debug_exception_exit(struct pt_regs *regs) +{ + preempt_enable_no_resched(); +} +NOKPROBE_SYMBOL(debug_exception_exit); UNHANDLED(el1t, 64, sync) UNHANDLED(el1t, 64, irq) @@ -392,60 +296,135 @@ UNHANDLED(el1t, 64, error) static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; - enter_from_kernel_mode(regs); + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_mem_abort(far, esr, regs); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; - enter_from_kernel_mode(regs); + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_undef(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_bti(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); +} + +static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_gcs(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs, state); +} + +static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_mops(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs, state); +} + +static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + debug_exception_enter(regs); + do_breakpoint(esr, regs); + debug_exception_exit(regs); + arm64_exit_el1_dbg(regs, state); } -static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) +static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr) { + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + if (!cortex_a76_erratum_1463225_debug_handler(regs)) { + debug_exception_enter(regs); + /* + * After handling a breakpoint, we suspend the breakpoint + * and use single-step to move to the next instruction. + * If we are stepping a suspended breakpoint there's nothing more to do: + * the single-step is complete. + */ + if (!try_step_suspended_breakpoints(regs)) + do_el1_softstep(esr, regs); + debug_exception_exit(regs); + } + arm64_exit_el1_dbg(regs, state); +} + +static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr) +{ + /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + debug_exception_enter(regs); + do_watchpoint(far, esr, regs); + debug_exception_exit(regs); + arm64_exit_el1_dbg(regs, state); +} + +static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; - arm64_enter_el1_dbg(regs); - if (!cortex_a76_erratum_1463225_debug_handler(regs)) - do_debug_exception(far, esr, regs); - arm64_exit_el1_dbg(regs); + state = arm64_enter_el1_dbg(regs); + debug_exception_enter(regs); + do_el1_brk64(esr, regs); + debug_exception_exit(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_fpac(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) @@ -471,11 +450,23 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BTI: el1_bti(regs, esr); break; + case ESR_ELx_EC_GCS: + el1_gcs(regs, esr); + break; + case ESR_ELx_EC_MOPS: + el1_mops(regs, esr); + break; case ESR_ELx_EC_BREAKPT_CUR: + el1_breakpt(regs, esr); + break; case ESR_ELx_EC_SOFTSTP_CUR: + el1_softstp(regs, esr); + break; case ESR_ELx_EC_WATCHPT_CUR: + el1_watchpt(regs, esr); + break; case ESR_ELx_EC_BRK64: - el1_dbg(regs, esr); + el1_brk64(regs, esr); break; case ESR_ELx_EC_FPAC: el1_fpac(regs, esr); @@ -488,30 +479,32 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) static __always_inline void __el1_pnmi(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - arm64_enter_nmi(regs); + irqentry_state_t state; + + state = irqentry_nmi_enter(regs); do_interrupt_handler(regs, handler); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); } static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); irq_enter_rcu(); do_interrupt_handler(regs, handler); irq_exit_rcu(); - arm64_preempt_schedule_irq(); - - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { write_sysreg(DAIF_PROCCTX_NOIRQ, daif); - if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && regs_irqs_disabled(regs)) __el1_pnmi(regs, handler); else __el1_irq(regs, handler); @@ -530,21 +523,22 @@ asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); + irqentry_state_t state; local_daif_restore(DAIF_ERRCTX); - arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_serror(regs, esr); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); } static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) @@ -559,50 +553,50 @@ static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(far)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sve_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sme_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_exc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_sys(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) @@ -612,79 +606,132 @@ static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(instruction_pointer(regs))) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(regs->sp, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_undef(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_undef(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_bti(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_bti(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_mops(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_gcs(regs, esr); + arm64_exit_to_user_mode(regs); } static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); bad_el0_sync(regs, 0, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr) +{ + if (!is_ttbr0_addr(regs->pc)) + arm64_apply_bp_hardening(); + + arm64_enter_from_user_mode(regs); + debug_exception_enter(regs); + do_breakpoint(esr, regs); + debug_exception_exit(regs); + local_daif_restore(DAIF_PROCCTX); + arm64_exit_to_user_mode(regs); } -static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) { - /* Only watchpoints write FAR_EL1, otherwise its UNKNOWN */ + bool step_done; + + if (!is_ttbr0_addr(regs->pc)) + arm64_apply_bp_hardening(); + + arm64_enter_from_user_mode(regs); + /* + * After handling a breakpoint, we suspend the breakpoint + * and use single-step to move to the next instruction. + * If we are stepping a suspended breakpoint there's nothing more to do: + * the single-step is complete. + */ + step_done = try_step_suspended_breakpoints(regs); + local_daif_restore(DAIF_PROCCTX); + if (!step_done) + do_el0_softstep(esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr) +{ + /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); - enter_from_user_mode(regs); - do_debug_exception(far, esr, regs); + arm64_enter_from_user_mode(regs); + debug_exception_enter(regs); + do_watchpoint(far, esr, regs); + debug_exception_exit(regs); + local_daif_restore(DAIF_PROCCTX); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + do_el0_brk64(esr, regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_svc(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); - fp_user_discard(); + fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); + fpsimd_syscall_exit(); } static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_fpac(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) @@ -732,11 +779,20 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_MOPS: el0_mops(regs, esr); break; + case ESR_ELx_EC_GCS: + el0_gcs(regs, esr); + break; case ESR_ELx_EC_BREAKPT_LOW: + el0_breakpt(regs, esr); + break; case ESR_ELx_EC_SOFTSTP_LOW: + el0_softstp(regs, esr); + break; case ESR_ELx_EC_WATCHPT_LOW: + el0_watchpt(regs, esr); + break; case ESR_ELx_EC_BRK64: - el0_dbg(regs, esr); + el0_brk64(regs, esr); break; case ESR_ELx_EC_FPAC: el0_fpac(regs, esr); @@ -749,7 +805,7 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) static void noinstr el0_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); write_sysreg(DAIF_PROCCTX_NOIRQ, daif); @@ -760,7 +816,7 @@ static void noinstr el0_interrupt(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr __el0_irq_handler_common(struct pt_regs *regs) @@ -786,14 +842,15 @@ asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) static void noinstr __el0_error_handler_common(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); + irqentry_state_t state; - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_ERRCTX); - arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_serror(regs, esr); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) @@ -804,19 +861,27 @@ asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) #ifdef CONFIG_COMPAT static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_cp15(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_svc_compat(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); do_el0_svc_compat(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_bkpt32(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_bkpt32(esr, regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) @@ -853,10 +918,16 @@ asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) el0_cp15(regs, esr); break; case ESR_ELx_EC_BREAKPT_LOW: + el0_breakpt(regs, esr); + break; case ESR_ELx_EC_SOFTSTP_LOW: + el0_softstp(regs, esr); + break; case ESR_ELx_EC_WATCHPT_LOW: + el0_watchpt(regs, esr); + break; case ESR_ELx_EC_BKPT32: - el0_dbg(regs, esr); + el0_bkpt32(regs, esr); break; default: el0_inv(regs, esr); @@ -884,21 +955,20 @@ UNHANDLED(el0t, 32, fiq) UNHANDLED(el0t, 32, error) #endif /* CONFIG_COMPAT */ -#ifdef CONFIG_VMAP_STACK asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); unsigned long far = read_sysreg(far_el1); - arm64_enter_nmi(regs); + irqentry_nmi_enter(regs); panic_bad_stack(regs, esr, far); } -#endif /* CONFIG_VMAP_STACK */ #ifdef CONFIG_ARM_SDE_INTERFACE asmlinkage noinstr unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { + irqentry_state_t state; unsigned long ret; /* @@ -923,9 +993,9 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) else if (cpu_has_pan()) set_pstate_pan(0); - arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); ret = do_sdei_event(regs, arg); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); return ret; } diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index f0c16640ef21..025140caafe7 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -94,7 +94,7 @@ SYM_CODE_START(ftrace_caller) stp x29, x30, [sp, #FREGS_SIZE] add x29, sp, #FREGS_SIZE - /* Prepare arguments for the the tracer func */ + /* Prepare arguments for the tracer func */ sub x0, x30, #AARCH64_INSN_SIZE // ip (callsite's BL insn) mov x1, x9 // parent_ip (callsite's LR) mov x3, sp // regs @@ -329,24 +329,28 @@ SYM_FUNC_END(ftrace_stub_graph) * @fp is checked against the value passed by ftrace_graph_caller(). */ SYM_CODE_START(return_to_handler) - /* save return value regs */ - sub sp, sp, #FGRET_REGS_SIZE - stp x0, x1, [sp, #FGRET_REGS_X0] - stp x2, x3, [sp, #FGRET_REGS_X2] - stp x4, x5, [sp, #FGRET_REGS_X4] - stp x6, x7, [sp, #FGRET_REGS_X6] - str x29, [sp, #FGRET_REGS_FP] // parent's fp + /* Make room for ftrace_regs */ + sub sp, sp, #FREGS_SIZE + + /* Save return value regs */ + stp x0, x1, [sp, #FREGS_X0] + stp x2, x3, [sp, #FREGS_X2] + stp x4, x5, [sp, #FREGS_X4] + stp x6, x7, [sp, #FREGS_X6] + + /* Save the callsite's FP */ + str x29, [sp, #FREGS_FP] mov x0, sp - bl ftrace_return_to_handler // addr = ftrace_return_to_hander(regs); + bl ftrace_return_to_handler // addr = ftrace_return_to_hander(fregs); mov x30, x0 // restore the original return address - /* restore return value regs */ - ldp x0, x1, [sp, #FGRET_REGS_X0] - ldp x2, x3, [sp, #FGRET_REGS_X2] - ldp x4, x5, [sp, #FGRET_REGS_X4] - ldp x6, x7, [sp, #FGRET_REGS_X6] - add sp, sp, #FGRET_REGS_SIZE + /* Restore return value regs */ + ldp x0, x1, [sp, #FREGS_X0] + ldp x2, x3, [sp, #FREGS_X2] + ldp x4, x5, [sp, #FREGS_X4] + ldp x6, x7, [sp, #FREGS_X6] + add sp, sp, #FREGS_SIZE ret SYM_CODE_END(return_to_handler) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 6ad61de03d0a..f8018b5c1f9a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -25,6 +25,7 @@ #include <asm/processor.h> #include <asm/ptrace.h> #include <asm/scs.h> +#include <asm/stacktrace/frame.h> #include <asm/thread_info.h> #include <asm/asm-uaccess.h> #include <asm/unistd.h> @@ -54,7 +55,6 @@ .endif sub sp, sp, #PT_REGS_SIZE -#ifdef CONFIG_VMAP_STACK /* * Test whether the SP has overflowed, without corrupting a GPR. * Task and IRQ stacks are aligned so that SP & (1 << THREAD_SHIFT) @@ -96,7 +96,6 @@ /* We were already on the overflow stack. Restore sp/x0 and carry on. */ sub sp, sp, x0 mrs x0, tpidrro_el0 -#endif b el\el\ht\()_\regsize\()_\label .org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm @@ -284,15 +283,16 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * For exceptions from EL0, create a final frame record. - * For exceptions from EL1, create a synthetic frame record so the - * interrupted code shows up in the backtrace. + * Create a metadata frame record. The unwinder will use this to + * identify and unwind exception boundaries. */ - .if \el == 0 stp xzr, xzr, [sp, #S_STACKFRAME] + .if \el == 0 + mov x0, #FRAME_META_TYPE_FINAL .else - stp x29, x22, [sp, #S_STACKFRAME] + mov x0, #FRAME_META_TYPE_PT_REGS .endif + str x0, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN @@ -315,7 +315,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING alternative_else_nop_endif mrs_s x20, SYS_ICC_PMR_EL1 - str x20, [sp, #S_PMR_SAVE] + str w20, [sp, #S_PMR] mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 @@ -342,7 +342,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING b .Lskip_pmr_restore\@ alternative_else_nop_endif - ldr x20, [sp, #S_PMR_SAVE] + ldr w20, [sp, #S_PMR] msr_s SYS_ICC_PMR_EL1, x20 /* Ensure priority change is seen by redistributor */ @@ -428,12 +428,9 @@ alternative_else_nop_endif ldp x28, x29, [sp, #16 * 14] .if \el == 0 -alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 - ldr lr, [sp, #S_LR] - add sp, sp, #PT_REGS_SIZE // restore sp - eret -alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + alternative_insn "b .L_skip_tramp_exit_\@", nop, ARM64_UNMAP_KERNEL_AT_EL0 + msr far_el1, x29 ldr_this_cpu x30, this_cpu_vector, x29 @@ -442,16 +439,26 @@ alternative_else_nop_endif ldr lr, [sp, #S_LR] // restore x30 add sp, sp, #PT_REGS_SIZE // restore sp br x29 + +.L_skip_tramp_exit_\@: #endif - .else + .endif + ldr lr, [sp, #S_LR] add sp, sp, #PT_REGS_SIZE // restore sp + .if \el == 0 + /* This must be after the last explicit memory access */ +alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD + tlbi vale1, xzr + dsb nsh +alternative_else_nop_endif + .else /* Ensure any device/NC reads complete */ alternative_insn nop, "dmb sy", ARM64_WORKAROUND_1508412 + .endif eret - .endif sb .endm @@ -531,7 +538,6 @@ SYM_CODE_START(vectors) kernel_ventry 0, t, 32, error // Error 32-bit EL0 SYM_CODE_END(vectors) -#ifdef CONFIG_VMAP_STACK SYM_CODE_START_LOCAL(__bad_stack) /* * We detected an overflow in kernel_ventry, which switched to the @@ -559,7 +565,6 @@ SYM_CODE_START_LOCAL(__bad_stack) bl handle_bad_stack ASM_BUG() SYM_CODE_END(__bad_stack) -#endif /* CONFIG_VMAP_STACK */ .macro entry_handler el:req, ht:req, regsize:req, label:req @@ -605,7 +610,7 @@ SYM_CODE_END(ret_to_kernel) SYM_CODE_START_LOCAL(ret_to_user) ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step enable_step_tsk x19, x2 -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE bl stackleak_erase_on_task_stack #endif kernel_exit 0 @@ -816,6 +821,7 @@ SYM_CODE_END(__bp_harden_el1_vectors) * */ SYM_FUNC_START(cpu_switch_to) + save_and_disable_daif x11 mov x10, #THREAD_CPU_CONTEXT add x8, x0, x10 mov x9, sp @@ -839,6 +845,7 @@ SYM_FUNC_START(cpu_switch_to) ptrauth_keys_install_kernel x1, x8, x9, x10 scs_save x0 scs_load_current + restore_irq x11 ret SYM_FUNC_END(cpu_switch_to) NOKPROBE(cpu_switch_to) @@ -865,6 +872,7 @@ NOKPROBE(ret_from_fork) * Calls func(regs) using this CPU's irq stack and shadow irq stack. */ SYM_FUNC_START(call_on_irq_stack) + save_and_disable_daif x9 #ifdef CONFIG_SHADOW_CALL_STACK get_current_task x16 scs_save x16 @@ -879,8 +887,10 @@ SYM_FUNC_START(call_on_irq_stack) /* Move to the new stack and call the function there */ add sp, x16, #IRQ_STACK_SIZE + restore_irq x9 blr x1 + save_and_disable_daif x9 /* * Restore the SP from the FP, and restore the FP and LR from the frame * record. @@ -888,6 +898,7 @@ SYM_FUNC_START(call_on_irq_stack) mov sp, x29 ldp x29, x30, [sp], #16 scs_load_current + restore_irq x9 ret SYM_FUNC_END(call_on_irq_stack) NOKPROBE(call_on_irq_stack) @@ -994,7 +1005,6 @@ SYM_CODE_START(__sdei_asm_handler) 1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6 2: str x19, [x5] -#ifdef CONFIG_VMAP_STACK /* * entry.S may have been using sp as a scratch register, find whether * this is a normal or critical event and switch to the appropriate @@ -1007,7 +1017,6 @@ SYM_CODE_START(__sdei_asm_handler) 2: mov x6, #SDEI_STACK_SIZE add x5, x5, x6 mov sp, x5 -#endif #ifdef CONFIG_SHADOW_CALL_STACK /* Use a separate shadow call stack for normal and critical events */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 91e44ac7150f..c154f72634e0 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -85,13 +85,13 @@ * softirq kicks in. Upon vcpu_put(), KVM will save the vcpu FP state and * flag the register state as invalid. * - * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may - * save the task's FPSIMD context back to task_struct from softirq context. - * To prevent this from racing with the manipulation of the task's FPSIMD state - * from task context and thereby corrupting the state, it is necessary to - * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE - * flag with {, __}get_cpu_fpsimd_context(). This will still allow softirqs to - * run but prevent them to use FPSIMD. + * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may be + * called from softirq context, which will save the task's FPSIMD context back + * to task_struct. To prevent this from racing with the manipulation of the + * task's FPSIMD state from task context and thereby corrupting the state, it + * is necessary to protect any manipulation of a task's fpsimd_state or + * TIF_FOREIGN_FPSTATE flag with get_cpu_fpsimd_context(), which will suspend + * softirq servicing entirely until put_cpu_fpsimd_context() is called. * * For a certain task, the sequence may look something like this: * - the task gets scheduled in; if both the task's fpsimd_cpu field @@ -119,7 +119,7 @@ * whatever is in the FPSIMD registers is not saved to memory, but discarded. */ -static DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); +DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); __ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = { #ifdef CONFIG_ARM64_SVE @@ -180,12 +180,12 @@ static inline void set_sve_default_vl(int val) set_default_vl(ARM64_VEC_SVE, val); } -static void __percpu *efi_sve_state; +static u8 *efi_sve_state; #else /* ! CONFIG_ARM64_SVE */ /* Dummy declaration for code that will be optimised out: */ -extern void __percpu *efi_sve_state; +extern u8 *efi_sve_state; #endif /* ! CONFIG_ARM64_SVE */ @@ -209,27 +209,14 @@ static inline void sme_free(struct task_struct *t) { } #endif -DEFINE_PER_CPU(bool, fpsimd_context_busy); -EXPORT_PER_CPU_SYMBOL(fpsimd_context_busy); - static void fpsimd_bind_task_to_cpu(void); -static void __get_cpu_fpsimd_context(void) -{ - bool busy = __this_cpu_xchg(fpsimd_context_busy, true); - - WARN_ON(busy); -} - /* * Claim ownership of the CPU FPSIMD context for use by the calling context. * * The caller may freely manipulate the FPSIMD context metadata until * put_cpu_fpsimd_context() is called. * - * The double-underscore version must only be called if you know the task - * can't be preempted. - * * On RT kernels local_bh_disable() is not sufficient because it only * serializes soft interrupt related sections via a local lock, but stays * preemptible. Disabling preemption is the right choice here as bottom @@ -238,18 +225,21 @@ static void __get_cpu_fpsimd_context(void) */ static void get_cpu_fpsimd_context(void) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - local_bh_disable(); - else + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + /* + * The softirq subsystem lacks a true unmask/mask API, and + * re-enabling softirq processing using local_bh_enable() will + * not only unmask softirqs, it will also result in immediate + * delivery of any pending softirqs. + * This is undesirable when running with IRQs disabled, but in + * that case, there is no need to mask softirqs in the first + * place, so only bother doing so when IRQs are enabled. + */ + if (!irqs_disabled()) + local_bh_disable(); + } else { preempt_disable(); - __get_cpu_fpsimd_context(); -} - -static void __put_cpu_fpsimd_context(void) -{ - bool busy = __this_cpu_xchg(fpsimd_context_busy, false); - - WARN_ON(!busy); /* No matching get_cpu_fpsimd_context()? */ + } } /* @@ -261,16 +251,12 @@ static void __put_cpu_fpsimd_context(void) */ static void put_cpu_fpsimd_context(void) { - __put_cpu_fpsimd_context(); - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - local_bh_enable(); - else + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (!irqs_disabled()) + local_bh_enable(); + } else { preempt_enable(); -} - -static bool have_cpu_fpsimd_context(void) -{ - return !preemptible() && __this_cpu_read(fpsimd_context_busy); + } } unsigned int task_get_vl(const struct task_struct *task, enum vec_type type) @@ -383,19 +369,18 @@ static void task_fpsimd_load(void) bool restore_ffr; WARN_ON(!system_supports_fpsimd()); - WARN_ON(!have_cpu_fpsimd_context()); + WARN_ON(preemptible()); + WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE)); if (system_supports_sve() || system_supports_sme()) { switch (current->thread.fp_type) { case FP_STATE_FPSIMD: /* Stop tracking SVE for this task until next use. */ - if (test_and_clear_thread_flag(TIF_SVE)) - sve_user_disable(); + clear_thread_flag(TIF_SVE); break; case FP_STATE_SVE: - if (!thread_sm_enabled(¤t->thread) && - !WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE))) - sve_user_enable(); + if (!thread_sm_enabled(¤t->thread)) + WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE)); if (test_thread_flag(TIF_SVE)) sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); @@ -406,10 +391,10 @@ static void task_fpsimd_load(void) default: /* * This indicates either a bug in - * fpsimd_save() or memory corruption, we + * fpsimd_save_user_state() or memory corruption, we * should always record an explicit format * when we save. We always at least have the - * memory allocated for FPSMID registers so + * memory allocated for FPSIMD registers so * try that and hope for the best. */ WARN_ON_ONCE(1); @@ -436,6 +421,9 @@ static void task_fpsimd_load(void) restore_ffr = system_supports_fa64(); } + if (system_supports_fpmr()) + write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); + if (restore_sve_regs) { WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE); sve_load_state(sve_pffr(¤t->thread), @@ -457,7 +445,7 @@ static void task_fpsimd_load(void) * than via current, if we are saving KVM state then it will have * ensured that the type of registers to save is set in last->to_save. */ -static void fpsimd_save(void) +static void fpsimd_save_user_state(void) { struct cpu_fp_state const *last = this_cpu_ptr(&fpsimd_last_state); @@ -467,18 +455,24 @@ static void fpsimd_save(void) unsigned int vl; WARN_ON(!system_supports_fpsimd()); - WARN_ON(!have_cpu_fpsimd_context()); + WARN_ON(preemptible()); if (test_thread_flag(TIF_FOREIGN_FPSTATE)) return; + if (system_supports_fpmr()) + *(last->fpmr) = read_sysreg_s(SYS_FPMR); + /* - * If a task is in a syscall the ABI allows us to only - * preserve the state shared with FPSIMD so don't bother - * saving the full SVE state in that case. + * Save SVE state if it is live. + * + * The syscall ABI discards live SVE state at syscall entry. When + * entering a syscall, fpsimd_syscall_enter() sets to_save to + * FP_STATE_FPSIMD to allow the SVE state to be lazily discarded until + * either new SVE state is loaded+bound or fpsimd_syscall_exit() is + * called prior to a return to userspace. */ - if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE) && - !in_syscall(current_pt_regs())) || + if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE)) || last->to_save == FP_STATE_SVE) { save_sve_regs = true; save_ffr = true; @@ -555,7 +549,7 @@ static unsigned int find_supported_vector_length(enum vec_type type, #if defined(CONFIG_ARM64_SVE) && defined(CONFIG_SYSCTL) -static int vec_proc_do_default_vl(struct ctl_table *table, int write, +static int vec_proc_do_default_vl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct vl_info *info = table->extra1; @@ -582,14 +576,13 @@ static int vec_proc_do_default_vl(struct ctl_table *table, int write, return 0; } -static struct ctl_table sve_default_vl_table[] = { +static const struct ctl_table sve_default_vl_table[] = { { .procname = "sve_default_vector_length", .mode = 0644, .proc_handler = vec_proc_do_default_vl, .extra1 = &vl_info[ARM64_VEC_SVE], }, - { } }; static int __init sve_sysctl_init(void) @@ -606,14 +599,13 @@ static int __init sve_sysctl_init(void) { return 0; } #endif /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ #if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL) -static struct ctl_table sme_default_vl_table[] = { +static const struct ctl_table sme_default_vl_table[] = { { .procname = "sme_default_vector_length", .mode = 0644, .proc_handler = vec_proc_do_default_vl, .extra1 = &vl_info[ARM64_VEC_SME], }, - { } }; static int __init sme_sysctl_init(void) @@ -673,7 +665,7 @@ static void __fpsimd_to_sve(void *sst, struct user_fpsimd_state const *fst, * task->thread.uw.fpsimd_state must be up to date before calling this * function. */ -static void fpsimd_to_sve(struct task_struct *task) +static inline void fpsimd_to_sve(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; @@ -697,7 +689,7 @@ static void fpsimd_to_sve(struct task_struct *task) * bytes of allocated kernel memory. * task->thread.sve_state must be up to date before calling this function. */ -static void sve_to_fpsimd(struct task_struct *task) +static inline void sve_to_fpsimd(struct task_struct *task) { unsigned int vq, vl; void const *sst = task->thread.sve_state; @@ -716,38 +708,39 @@ static void sve_to_fpsimd(struct task_struct *task) } } -#ifdef CONFIG_ARM64_SVE -/* - * Call __sve_free() directly only if you know task can't be scheduled - * or preempted. - */ -static void __sve_free(struct task_struct *task) +static inline void __fpsimd_zero_vregs(struct user_fpsimd_state *fpsimd) { - kfree(task->thread.sve_state); - task->thread.sve_state = NULL; + memset(&fpsimd->vregs, 0, sizeof(fpsimd->vregs)); } -static void sve_free(struct task_struct *task) +/* + * Simulate the effects of an SMSTOP SM instruction. + */ +void task_smstop_sm(struct task_struct *task) { - WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + if (!thread_sm_enabled(&task->thread)) + return; + + __fpsimd_zero_vregs(&task->thread.uw.fpsimd_state); + task->thread.uw.fpsimd_state.fpsr = 0x0800009f; + if (system_supports_fpmr()) + task->thread.uw.fpmr = 0; - __sve_free(task); + task->thread.svcr &= ~SVCR_SM_MASK; + task->thread.fp_type = FP_STATE_FPSIMD; } -/* - * Return how many bytes of memory are required to store the full SVE - * state for task, given task's currently configured vector length. - */ -size_t sve_state_size(struct task_struct const *task) +void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) { - unsigned int vl = 0; - - if (system_supports_sve()) - vl = task_get_sve_vl(task); - if (system_supports_sme()) - vl = max(vl, task_get_sme_vl(task)); + write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, + SYS_SCTLR_EL1); +} - return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); +#ifdef CONFIG_ARM64_SVE +static void sve_free(struct task_struct *task) +{ + kfree(task->thread.sve_state); + task->thread.sve_state = NULL; } /* @@ -774,69 +767,34 @@ void sve_alloc(struct task_struct *task, bool flush) kzalloc(sve_state_size(task), GFP_KERNEL); } - /* - * Force the FPSIMD state shared with SVE to be updated in the SVE state - * even if the SVE state is the current active state. + * Ensure that task->thread.uw.fpsimd_state is up to date with respect to the + * task's currently effective FPSIMD/SVE state. * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void fpsimd_force_sync_to_sve(struct task_struct *task) -{ - fpsimd_to_sve(task); -} - -/* - * Ensure that task->thread.sve_state is up to date with respect to - * the user task, irrespective of when SVE is in use or not. - * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void fpsimd_sync_to_sve(struct task_struct *task) -{ - if (!test_tsk_thread_flag(task, TIF_SVE) && - !thread_sm_enabled(&task->thread)) - fpsimd_to_sve(task); -} - -/* - * Ensure that task->thread.uw.fpsimd_state is up to date with respect to - * the user task, irrespective of whether SVE is in use or not. - * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void sve_sync_to_fpsimd(struct task_struct *task) +void fpsimd_sync_from_effective_state(struct task_struct *task) { if (task->thread.fp_type == FP_STATE_SVE) sve_to_fpsimd(task); } /* - * Ensure that task->thread.sve_state is up to date with respect to - * the task->thread.uw.fpsimd_state. + * Ensure that the task's currently effective FPSIMD/SVE state is up to date + * with respect to task->thread.uw.fpsimd_state, zeroing any effective + * non-FPSIMD (S)SVE state. * - * This should only be called by ptrace to merge new FPSIMD register - * values into a task for which SVE is currently active. - * task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - * task->thread.uw.fpsimd_state must already have been initialised with - * the new FPSIMD register values to be merged in. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void sve_sync_from_fpsimd_zeropad(struct task_struct *task) +void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - if (!test_tsk_thread_flag(task, TIF_SVE) && - !thread_sm_enabled(&task->thread)) + if (task->thread.fp_type != FP_STATE_SVE) return; vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); @@ -845,10 +803,73 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task) __fpsimd_to_sve(sst, fst, vq); } +static int change_live_vector_length(struct task_struct *task, + enum vec_type type, + unsigned long vl) +{ + unsigned int sve_vl = task_get_sve_vl(task); + unsigned int sme_vl = task_get_sme_vl(task); + void *sve_state = NULL, *sme_state = NULL; + + if (type == ARM64_VEC_SME) + sme_vl = vl; + else + sve_vl = vl; + + /* + * Allocate the new sve_state and sme_state before freeing the old + * copies so that allocation failure can be handled without needing to + * mutate the task's state in any way. + * + * Changes to the SVE vector length must not discard live ZA state or + * clear PSTATE.ZA, as userspace code which is unaware of the AAPCS64 + * ZA lazy saving scheme may attempt to change the SVE vector length + * while unsaved/dormant ZA state exists. + */ + sve_state = kzalloc(__sve_state_size(sve_vl, sme_vl), GFP_KERNEL); + if (!sve_state) + goto out_mem; + + if (type == ARM64_VEC_SME) { + sme_state = kzalloc(__sme_state_size(sme_vl), GFP_KERNEL); + if (!sme_state) + goto out_mem; + } + + if (task == current) + fpsimd_save_and_flush_current_state(); + else + fpsimd_flush_task_state(task); + + /* + * Always preserve PSTATE.SM and the effective FPSIMD state, zeroing + * other SVE state. + */ + fpsimd_sync_from_effective_state(task); + task_set_vl(task, type, vl); + kfree(task->thread.sve_state); + task->thread.sve_state = sve_state; + fpsimd_sync_to_effective_state_zeropad(task); + + if (type == ARM64_VEC_SME) { + task->thread.svcr &= ~SVCR_ZA_MASK; + kfree(task->thread.sme_state); + task->thread.sme_state = sme_state; + } + + return 0; + +out_mem: + kfree(sve_state); + kfree(sme_state); + return -ENOMEM; +} + int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags) { - bool free_sme = false; + bool onexec = flags & PR_SVE_SET_VL_ONEXEC; + bool inherit = flags & PR_SVE_VL_INHERIT; if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT | PR_SVE_SET_VL_ONEXEC)) @@ -868,73 +889,17 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, vl = find_supported_vector_length(type, vl); - if (flags & (PR_SVE_VL_INHERIT | - PR_SVE_SET_VL_ONEXEC)) + if (!onexec && vl != task_get_vl(task, type)) { + if (change_live_vector_length(task, type, vl)) + return -ENOMEM; + } + + if (onexec || inherit) task_set_vl_onexec(task, type, vl); else /* Reset VL to system default on next exec: */ task_set_vl_onexec(task, type, 0); - /* Only actually set the VL if not deferred: */ - if (flags & PR_SVE_SET_VL_ONEXEC) - goto out; - - if (vl == task_get_vl(task, type)) - goto out; - - /* - * To ensure the FPSIMD bits of the SVE vector registers are preserved, - * write any live register state back to task_struct, and convert to a - * regular FPSIMD thread. - */ - if (task == current) { - get_cpu_fpsimd_context(); - - fpsimd_save(); - } - - fpsimd_flush_task_state(task); - if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || - thread_sm_enabled(&task->thread)) { - sve_to_fpsimd(task); - task->thread.fp_type = FP_STATE_FPSIMD; - } - - if (system_supports_sme()) { - if (type == ARM64_VEC_SME || - !(task->thread.svcr & (SVCR_SM_MASK | SVCR_ZA_MASK))) { - /* - * We are changing the SME VL or weren't using - * SME anyway, discard the state and force a - * reallocation. - */ - task->thread.svcr &= ~(SVCR_SM_MASK | - SVCR_ZA_MASK); - clear_tsk_thread_flag(task, TIF_SME); - free_sme = true; - } - } - - if (task == current) - put_cpu_fpsimd_context(); - - task_set_vl(task, type, vl); - - /* - * Free the changed states if they are not in use, SME will be - * reallocated to the correct size on next use and we just - * allocate SVE now in case it is needed for use in streaming - * mode. - */ - if (system_supports_sve()) { - sve_free(task); - sve_alloc(task, true); - } - - if (free_sme) - sme_free(task); - -out: update_tsk_thread_flag(task, vec_vl_inherit_flag(type), flags & PR_SVE_VL_INHERIT); @@ -1149,53 +1114,31 @@ static void __init sve_efi_setup(void) if (!sve_vl_valid(max_vl)) goto fail; - efi_sve_state = __alloc_percpu( - SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), SVE_VQ_BYTES); + efi_sve_state = kmalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), + GFP_KERNEL); if (!efi_sve_state) goto fail; return; fail: - panic("Cannot allocate percpu memory for EFI SVE save/restore"); + panic("Cannot allocate memory for EFI SVE save/restore"); } -/* - * Enable SVE for EL1. - * Intended for use by the cpufeatures code during CPU boot. - */ -void sve_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p) { write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1); isb(); -} -/* - * Read the pseudo-ZCR used by cpufeatures to identify the supported SVE - * vector length. - * - * Use only if SVE is present. - * This function clobbers the SVE vector length. - */ -u64 read_zcr_features(void) -{ - /* - * Set the maximum possible VL, and write zeroes to all other - * bits to see if they stick. - */ - sve_kernel_enable(NULL); - write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL1); - - /* Return LEN value that would be written to get the maximum VL */ - return sve_vq_from_vl(sve_get_vl()) - 1; + write_sysreg_s(0, SYS_ZCR_EL1); } void __init sve_setup(void) { struct vl_info *info = &vl_info[ARM64_VEC_SVE]; - u64 zcr; DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); unsigned long b; + int max_bit; if (!system_supports_sve()) return; @@ -1208,17 +1151,8 @@ void __init sve_setup(void) if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map))) set_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map); - zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); - info->max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1); - - /* - * Sanity-check that the max VL we determined through CPU features - * corresponds properly to sve_vq_map. If not, do our best: - */ - if (WARN_ON(info->max_vl != find_supported_vector_length(ARM64_VEC_SVE, - info->max_vl))) - info->max_vl = find_supported_vector_length(ARM64_VEC_SVE, - info->max_vl); + max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX); + info->max_vl = sve_vl_from_vq(__bit_to_vq(max_bit)); /* * For the default VL, pick the maximum supported value <= 64. @@ -1261,7 +1195,7 @@ void __init sve_setup(void) */ void fpsimd_release_task(struct task_struct *dead_task) { - __sve_free(dead_task); + sve_free(dead_task); sme_free(dead_task); } @@ -1280,8 +1214,10 @@ void fpsimd_release_task(struct task_struct *dead_task) */ void sme_alloc(struct task_struct *task, bool flush) { - if (task->thread.sme_state && flush) { - memset(task->thread.sme_state, 0, sme_state_size(task)); + if (task->thread.sme_state) { + if (flush) + memset(task->thread.sme_state, 0, + sme_state_size(task)); return; } @@ -1296,7 +1232,7 @@ static void sme_free(struct task_struct *task) task->thread.sme_state = NULL; } -void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +void cpu_enable_sme(const struct arm64_cpu_capabilities *__always_unused p) { /* Set priority for all PEs to architecturally defined minimum */ write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK, @@ -1306,85 +1242,57 @@ void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1); isb(); + /* Ensure all bits in SMCR are set to known values */ + write_sysreg_s(0, SYS_SMCR_EL1); + /* Allow EL0 to access TPIDR2 */ write_sysreg(read_sysreg(SCTLR_EL1) | SCTLR_ELx_ENTP2, SCTLR_EL1); isb(); } -/* - * This must be called after sme_kernel_enable(), we rely on the - * feature table being sorted to ensure this. - */ -void sme2_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +void cpu_enable_sme2(const struct arm64_cpu_capabilities *__always_unused p) { + /* This must be enabled after SME */ + BUILD_BUG_ON(ARM64_SME2 <= ARM64_SME); + /* Allow use of ZT0 */ write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_EZT0_MASK, SYS_SMCR_EL1); } -/* - * This must be called after sme_kernel_enable(), we rely on the - * feature table being sorted to ensure this. - */ -void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +void cpu_enable_fa64(const struct arm64_cpu_capabilities *__always_unused p) { + /* This must be enabled after SME */ + BUILD_BUG_ON(ARM64_SME_FA64 <= ARM64_SME); + /* Allow use of FA64 */ write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK, SYS_SMCR_EL1); } -/* - * Read the pseudo-SMCR used by cpufeatures to identify the supported - * vector length. - * - * Use only if SME is present. - * This function clobbers the SME vector length. - */ -u64 read_smcr_features(void) -{ - sme_kernel_enable(NULL); - - /* - * Set the maximum possible VL. - */ - write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_LEN_MASK, - SYS_SMCR_EL1); - - /* Return LEN value that would be written to get the maximum VL */ - return sve_vq_from_vl(sme_get_vl()) - 1; -} - void __init sme_setup(void) { struct vl_info *info = &vl_info[ARM64_VEC_SME]; - u64 smcr; - int min_bit; + int min_bit, max_bit; if (!system_supports_sme()) return; + min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); + /* * SME doesn't require any particular vector length be * supported but it does require at least one. We should have * disabled the feature entirely while bringing up CPUs but - * let's double check here. + * let's double check here. The bitmap is SVE_VQ_MAP sized for + * sharing with SVE. */ - WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX)); + WARN_ON(min_bit >= SVE_VQ_MAX); - min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit)); - smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1); - info->max_vl = sve_vl_from_vq((smcr & SMCR_ELx_LEN_MASK) + 1); - - /* - * Sanity-check that the max VL we determined through CPU features - * corresponds properly to sme_vq_map. If not, do our best: - */ - if (WARN_ON(info->max_vl != find_supported_vector_length(ARM64_VEC_SME, - info->max_vl))) - info->max_vl = find_supported_vector_length(ARM64_VEC_SME, - info->max_vl); + max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX); + info->max_vl = sve_vl_from_vq(__bit_to_vq(max_bit)); WARN_ON(info->min_vl > info->max_vl); @@ -1404,6 +1312,22 @@ void __init sme_setup(void) get_sme_default_vl()); } +void sme_suspend_exit(void) +{ + u64 smcr = 0; + + if (!system_supports_sme()) + return; + + if (system_supports_fa64()) + smcr |= SMCR_ELx_FA64; + if (system_supports_sme2()) + smcr |= SMCR_ELx_EZT0; + + write_sysreg_s(smcr, SYS_SMCR_EL1); + write_sysreg_s(0, SYS_SMPRI_EL1); +} + #endif /* CONFIG_ARM64_SME */ static void sve_init_regs(void) @@ -1427,6 +1351,7 @@ static void sve_init_regs(void) } else { fpsimd_to_sve(current); current->thread.fp_type = FP_STATE_SVE; + fpsimd_flush_task_state(current); } } @@ -1495,7 +1420,7 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) * If this not a trap due to SME being disabled then something * is being used in the wrong mode, report as SIGILL. */ - if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) { + if (ESR_ELx_SME_ISS_SMTC(esr) != ESR_ELx_SME_ISS_SMTC_SME_DISABLED) { force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } @@ -1519,6 +1444,8 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) sme_set_vq(vq_minus_one); fpsimd_bind_task_to_cpu(); + } else { + fpsimd_flush_task_state(current); } put_cpu_fpsimd_context(); @@ -1529,8 +1456,17 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) */ void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs) { - /* TODO: implement lazy context saving/restoring */ - WARN_ON(1); + /* Even if we chose not to use FPSIMD, the hardware could still trap: */ + if (!system_supports_fpsimd()) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + /* + * When FPSIMD is enabled, we should never take a trap unless something + * has gone very wrong. + */ + BUG(); } /* @@ -1558,6 +1494,57 @@ void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs) current); } +static void fpsimd_load_kernel_state(struct task_struct *task) +{ + struct cpu_fp_state *last = this_cpu_ptr(&fpsimd_last_state); + + /* + * Elide the load if this CPU holds the most recent kernel mode + * FPSIMD context of the current task. + */ + if (last->st == task->thread.kernel_fpsimd_state && + task->thread.kernel_fpsimd_cpu == smp_processor_id()) + return; + + fpsimd_load_state(task->thread.kernel_fpsimd_state); +} + +static void fpsimd_save_kernel_state(struct task_struct *task) +{ + struct cpu_fp_state cpu_fp_state = { + .st = task->thread.kernel_fpsimd_state, + .to_save = FP_STATE_FPSIMD, + }; + + BUG_ON(!cpu_fp_state.st); + + fpsimd_save_state(task->thread.kernel_fpsimd_state); + fpsimd_bind_state_to_cpu(&cpu_fp_state); + + task->thread.kernel_fpsimd_cpu = smp_processor_id(); +} + +/* + * Invalidate any task's FPSIMD state that is present on this cpu. + * The FPSIMD context should be acquired with get_cpu_fpsimd_context() + * before calling this function. + */ +static void fpsimd_flush_cpu_state(void) +{ + WARN_ON(!system_supports_fpsimd()); + __this_cpu_write(fpsimd_last_state.st, NULL); + + /* + * Leaving streaming mode enabled will cause issues for any kernel + * NEON and leaving streaming mode or ZA enabled may increase power + * consumption. + */ + if (system_supports_sme()) + sme_smstop(); + + set_thread_flag(TIF_FOREIGN_FPSTATE); +} + void fpsimd_thread_switch(struct task_struct *next) { bool wrong_task, wrong_cpu; @@ -1565,24 +1552,31 @@ void fpsimd_thread_switch(struct task_struct *next) if (!system_supports_fpsimd()) return; - __get_cpu_fpsimd_context(); + WARN_ON_ONCE(!irqs_disabled()); /* Save unsaved fpsimd state, if any: */ - fpsimd_save(); - - /* - * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's - * state. For kernel threads, FPSIMD registers are never loaded - * and wrong_task and wrong_cpu will always be true. - */ - wrong_task = __this_cpu_read(fpsimd_last_state.st) != - &next->thread.uw.fpsimd_state; - wrong_cpu = next->thread.fpsimd_cpu != smp_processor_id(); + if (test_thread_flag(TIF_KERNEL_FPSTATE)) + fpsimd_save_kernel_state(current); + else + fpsimd_save_user_state(); - update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, - wrong_task || wrong_cpu); + if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) { + fpsimd_flush_cpu_state(); + fpsimd_load_kernel_state(next); + } else { + /* + * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's + * state. For kernel threads, FPSIMD registers are never + * loaded with user mode FPSIMD state and so wrong_task and + * wrong_cpu will always be true. + */ + wrong_task = __this_cpu_read(fpsimd_last_state.st) != + &next->thread.uw.fpsimd_state; + wrong_cpu = next->thread.fpsimd_cpu != smp_processor_id(); - __put_cpu_fpsimd_context(); + update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, + wrong_task || wrong_cpu); + } } static void fpsimd_flush_thread_vl(enum vec_type type) @@ -1655,6 +1649,9 @@ void fpsimd_flush_thread(void) current->thread.svcr = 0; } + if (system_supports_fpmr()) + current->thread.uw.fpmr = 0; + current->thread.fp_type = FP_STATE_FPSIMD; put_cpu_fpsimd_context(); @@ -1672,44 +1669,7 @@ void fpsimd_preserve_current_state(void) return; get_cpu_fpsimd_context(); - fpsimd_save(); - put_cpu_fpsimd_context(); -} - -/* - * Like fpsimd_preserve_current_state(), but ensure that - * current->thread.uw.fpsimd_state is updated so that it can be copied to - * the signal frame. - */ -void fpsimd_signal_preserve_current_state(void) -{ - fpsimd_preserve_current_state(); - if (test_thread_flag(TIF_SVE)) - sve_to_fpsimd(current); -} - -/* - * Called by KVM when entering the guest. - */ -void fpsimd_kvm_prepare(void) -{ - if (!system_supports_sve()) - return; - - /* - * KVM does not save host SVE state since we can only enter - * the guest from a syscall so the ABI means that only the - * non-saved SVE state needs to be saved. If we have left - * SVE enabled for performance reasons then update the task - * state to be FPSIMD only. - */ - get_cpu_fpsimd_context(); - - if (test_and_clear_thread_flag(TIF_SVE)) { - sve_to_fpsimd(current); - current->thread.fp_type = FP_STATE_FPSIMD; - } - + fpsimd_save_user_state(); put_cpu_fpsimd_context(); } @@ -1729,6 +1689,7 @@ static void fpsimd_bind_task_to_cpu(void) last->sve_vl = task_get_sve_vl(current); last->sme_vl = task_get_sme_vl(current); last->svcr = ¤t->thread.svcr; + last->fpmr = ¤t->thread.uw.fpmr; last->fp_type = ¤t->thread.fp_type; last->to_save = FP_STATE_CURRENT; current->thread.fpsimd_cpu = smp_processor_id(); @@ -1771,13 +1732,23 @@ void fpsimd_bind_state_to_cpu(struct cpu_fp_state *state) void fpsimd_restore_current_state(void) { /* - * For the tasks that were created before we detected the absence of - * FP/SIMD, the TIF_FOREIGN_FPSTATE could be set via fpsimd_thread_switch(), - * e.g, init. This could be then inherited by the children processes. - * If we later detect that the system doesn't support FP/SIMD, - * we must clear the flag for all the tasks to indicate that the - * FPSTATE is clean (as we can't have one) to avoid looping for ever in - * do_notify_resume(). + * TIF_FOREIGN_FPSTATE is set on the init task and copied by + * arch_dup_task_struct() regardless of whether FP/SIMD is detected. + * Thus user threads can have this set even when FP/SIMD hasn't been + * detected. + * + * When FP/SIMD is detected, begin_new_exec() will set + * TIF_FOREIGN_FPSTATE via flush_thread() -> fpsimd_flush_thread(), + * and fpsimd_thread_switch() will set TIF_FOREIGN_FPSTATE when + * switching tasks. We detect FP/SIMD before we exec the first user + * process, ensuring this has TIF_FOREIGN_FPSTATE set and + * do_notify_resume() will call fpsimd_restore_current_state() to + * install the user FP/SIMD context. + * + * When FP/SIMD is not detected, nothing else will clear or set + * TIF_FOREIGN_FPSTATE prior to the first return to userspace, and + * we must clear TIF_FOREIGN_FPSTATE to avoid do_notify_resume() + * looping forever calling fpsimd_restore_current_state(). */ if (!system_supports_fpsimd()) { clear_thread_flag(TIF_FOREIGN_FPSTATE); @@ -1794,30 +1765,14 @@ void fpsimd_restore_current_state(void) put_cpu_fpsimd_context(); } -/* - * Load an updated userland FPSIMD state for 'current' from memory and set the - * flag that indicates that the FPSIMD register contents are the most recent - * FPSIMD state of 'current'. This is used by the signal code to restore the - * register state when returning from a signal handler in FPSIMD only cases, - * any SVE context will be discarded. - */ void fpsimd_update_current_state(struct user_fpsimd_state const *state) { if (WARN_ON(!system_supports_fpsimd())) return; - get_cpu_fpsimd_context(); - current->thread.uw.fpsimd_state = *state; - if (test_thread_flag(TIF_SVE)) + if (current->thread.fp_type == FP_STATE_SVE) fpsimd_to_sve(current); - - task_fpsimd_load(); - fpsimd_bind_task_to_cpu(); - - clear_thread_flag(TIF_FOREIGN_FPSTATE); - - put_cpu_fpsimd_context(); } /* @@ -1834,6 +1789,7 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) void fpsimd_flush_task_state(struct task_struct *t) { t->thread.fpsimd_cpu = NR_CPUS; + t->thread.kernel_fpsimd_state = NULL; /* * If we don't support fpsimd, bail out after we have * reset the fpsimd_cpu for this task and clear the @@ -1847,25 +1803,15 @@ void fpsimd_flush_task_state(struct task_struct *t) barrier(); } -/* - * Invalidate any task's FPSIMD state that is present on this cpu. - * The FPSIMD context should be acquired with get_cpu_fpsimd_context() - * before calling this function. - */ -static void fpsimd_flush_cpu_state(void) +void fpsimd_save_and_flush_current_state(void) { - WARN_ON(!system_supports_fpsimd()); - __this_cpu_write(fpsimd_last_state.st, NULL); - - /* - * Leaving streaming mode enabled will cause issues for any kernel - * NEON and leaving streaming mode or ZA enabled may increase power - * consumption. - */ - if (system_supports_sme()) - sme_smstop(); + if (!system_supports_fpsimd()) + return; - set_thread_flag(TIF_FOREIGN_FPSTATE); + get_cpu_fpsimd_context(); + fpsimd_save_user_state(); + fpsimd_flush_task_state(current); + put_cpu_fpsimd_context(); } /* @@ -1874,13 +1820,15 @@ static void fpsimd_flush_cpu_state(void) */ void fpsimd_save_and_flush_cpu_state(void) { + unsigned long flags; + if (!system_supports_fpsimd()) return; WARN_ON(preemptible()); - __get_cpu_fpsimd_context(); - fpsimd_save(); + local_irq_save(flags); + fpsimd_save_user_state(); fpsimd_flush_cpu_state(); - __put_cpu_fpsimd_context(); + local_irq_restore(flags); } #ifdef CONFIG_KERNEL_MODE_NEON @@ -1901,21 +1849,63 @@ void fpsimd_save_and_flush_cpu_state(void) * * The caller may freely use the FPSIMD registers until kernel_neon_end() is * called. + * + * Unless called from non-preemptible task context, @state must point to a + * caller provided buffer that will be used to preserve the task's kernel mode + * FPSIMD context when it is scheduled out, or if it is interrupted by kernel + * mode FPSIMD occurring in softirq context. May be %NULL otherwise. */ -void kernel_neon_begin(void) +void kernel_neon_begin(struct user_fpsimd_state *state) { if (WARN_ON(!system_supports_fpsimd())) return; + WARN_ON((preemptible() || in_serving_softirq()) && !state); + BUG_ON(!may_use_simd()); get_cpu_fpsimd_context(); /* Save unsaved fpsimd state, if any: */ - fpsimd_save(); + if (test_thread_flag(TIF_KERNEL_FPSTATE)) { + BUG_ON(IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()); + fpsimd_save_state(state); + } else { + fpsimd_save_user_state(); + + /* + * Set the thread flag so that the kernel mode FPSIMD state + * will be context switched along with the rest of the task + * state. + * + * On non-PREEMPT_RT, softirqs may interrupt task level kernel + * mode FPSIMD, but the task will not be preemptible so setting + * TIF_KERNEL_FPSTATE for those would be both wrong (as it + * would mark the task context FPSIMD state as requiring a + * context switch) and unnecessary. + * + * On PREEMPT_RT, softirqs are serviced from a separate thread, + * which is scheduled as usual, and this guarantees that these + * softirqs are not interrupting use of the FPSIMD in kernel + * mode in task context. So in this case, setting the flag here + * is always appropriate. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()) { + /* + * Record the caller provided buffer as the kernel mode + * FP/SIMD buffer for this task, so that the state can + * be preserved and restored on a context switch. + */ + WARN_ON(current->thread.kernel_fpsimd_state != NULL); + current->thread.kernel_fpsimd_state = state; + set_thread_flag(TIF_KERNEL_FPSTATE); + } + } /* Invalidate any task state remaining in the fpsimd regs: */ fpsimd_flush_cpu_state(); + + put_cpu_fpsimd_context(); } EXPORT_SYMBOL_GPL(kernel_neon_begin); @@ -1927,22 +1917,39 @@ EXPORT_SYMBOL_GPL(kernel_neon_begin); * * The caller must not use the FPSIMD registers after this function is called, * unless kernel_neon_begin() is called again in the meantime. + * + * The value of @state must match the value passed to the preceding call to + * kernel_neon_begin(). */ -void kernel_neon_end(void) +void kernel_neon_end(struct user_fpsimd_state *state) { if (!system_supports_fpsimd()) return; - put_cpu_fpsimd_context(); + if (!test_thread_flag(TIF_KERNEL_FPSTATE)) + return; + + /* + * If we are returning from a nested use of kernel mode FPSIMD, restore + * the task context kernel mode FPSIMD state. This can only happen when + * running in softirq context on non-PREEMPT_RT. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && in_serving_softirq()) { + fpsimd_load_state(state); + } else { + clear_thread_flag(TIF_KERNEL_FPSTATE); + WARN_ON(current->thread.kernel_fpsimd_state != state); + current->thread.kernel_fpsimd_state = NULL; + } } EXPORT_SYMBOL_GPL(kernel_neon_end); #ifdef CONFIG_EFI -static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); -static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); -static DEFINE_PER_CPU(bool, efi_sve_state_used); -static DEFINE_PER_CPU(bool, efi_sm_state); +static struct user_fpsimd_state efi_fpsimd_state; +static bool efi_fpsimd_state_used; +static bool efi_sve_state_used; +static bool efi_sm_state; /* * EFI runtime services support functions @@ -1966,27 +1973,25 @@ void __efi_fpsimd_begin(void) if (!system_supports_fpsimd()) return; - WARN_ON(preemptible()); - if (may_use_simd()) { - kernel_neon_begin(); + kernel_neon_begin(&efi_fpsimd_state); } else { + WARN_ON(preemptible()); + /* * If !efi_sve_state, SVE can't be in use yet and doesn't need * preserving: */ - if (system_supports_sve() && likely(efi_sve_state)) { - char *sve_state = this_cpu_ptr(efi_sve_state); + if (system_supports_sve() && efi_sve_state != NULL) { bool ffr = true; u64 svcr; - __this_cpu_write(efi_sve_state_used, true); + efi_sve_state_used = true; if (system_supports_sme()) { svcr = read_sysreg_s(SYS_SVCR); - __this_cpu_write(efi_sm_state, - svcr & SVCR_SM_MASK); + efi_sm_state = svcr & SVCR_SM_MASK; /* * Unless we have FA64 FFR does not @@ -1996,19 +2001,18 @@ void __efi_fpsimd_begin(void) ffr = !(svcr & SVCR_SM_MASK); } - sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - ffr); + sve_save_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), + &efi_fpsimd_state.fpsr, ffr); if (system_supports_sme()) sysreg_clear_set_s(SYS_SVCR, SVCR_SM_MASK, 0); } else { - fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); + fpsimd_save_state(&efi_fpsimd_state); } - __this_cpu_write(efi_fpsimd_state_used, true); + efi_fpsimd_state_used = true; } } @@ -2020,12 +2024,10 @@ void __efi_fpsimd_end(void) if (!system_supports_fpsimd()) return; - if (!__this_cpu_xchg(efi_fpsimd_state_used, false)) { - kernel_neon_end(); + if (!efi_fpsimd_state_used) { + kernel_neon_end(&efi_fpsimd_state); } else { - if (system_supports_sve() && - likely(__this_cpu_read(efi_sve_state_used))) { - char const *sve_state = this_cpu_ptr(efi_sve_state); + if (system_supports_sve() && efi_sve_state_used) { bool ffr = true; /* @@ -2034,7 +2036,7 @@ void __efi_fpsimd_end(void) * streaming mode. */ if (system_supports_sme()) { - if (__this_cpu_read(efi_sm_state)) { + if (efi_sm_state) { sysreg_clear_set_s(SYS_SVCR, 0, SVCR_SM_MASK); @@ -2048,14 +2050,15 @@ void __efi_fpsimd_end(void) } } - sve_load_state(sve_state + sve_ffr_offset(sve_max_vl()), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - ffr); + sve_load_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), + &efi_fpsimd_state.fpsr, ffr); - __this_cpu_write(efi_sve_state_used, false); + efi_sve_state_used = false; } else { - fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); + fpsimd_load_state(&efi_fpsimd_state); } + + efi_fpsimd_state_used = false; } } @@ -2110,6 +2113,13 @@ static inline void fpsimd_hotplug_init(void) static inline void fpsimd_hotplug_init(void) { } #endif +void cpu_enable_fpsimd(const struct arm64_cpu_capabilities *__always_unused p) +{ + unsigned long enable = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN; + write_sysreg(read_sysreg(CPACR_EL1) | enable, CPACR_EL1); + isb(); +} + /* * FP/SIMD support code initialisation. */ diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index a650f5e11fc5..5a1554a44162 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -15,7 +15,7 @@ #include <asm/debug-monitors.h> #include <asm/ftrace.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS struct fregs_offset { @@ -23,10 +23,10 @@ struct fregs_offset { int offset; }; -#define FREGS_OFFSET(n, field) \ -{ \ - .name = n, \ - .offset = offsetof(struct ftrace_regs, field), \ +#define FREGS_OFFSET(n, field) \ +{ \ + .name = n, \ + .offset = offsetof(struct __arch_ftrace_regs, field), \ } static const struct fregs_offset fregs_offsets[] = { @@ -143,6 +143,69 @@ unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +/* Convert fentry_ip to the symbol address without kallsyms */ +unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) +{ + u32 insn; + + /* + * When using patchable-function-entry without pre-function NOPS, ftrace + * entry is the address of the first NOP after the function entry point. + * + * The compiler has either generated: + * + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * Or: + * + * func-04: BTI C + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * The fentry_ip is the address of `BL <caller>` which is at `func + 4` + * bytes in either case. + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return fentry_ip - AARCH64_INSN_SIZE; + + /* + * When using patchable-function-entry with pre-function NOPs, BTI is + * a bit different. + * + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * Or: + * + * func+00: func: BTI C + * func+04: NOP // To be patched to MOV X9, LR + * func+08: NOP // To be patched to BL <caller> + * + * The fentry_ip is the address of `BL <caller>` which is at either + * `func + 4` or `func + 8` depends on whether there is a BTI. + */ + + /* If there is no BTI, the func address should be one instruction before. */ + if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + return fentry_ip - AARCH64_INSN_SIZE; + + /* We want to be extra safe in case entry ip is on the page edge, + * but otherwise we need to avoid get_kernel_nofault()'s overhead. + */ + if ((fentry_ip & ~PAGE_MASK) < AARCH64_INSN_SIZE * 2) { + if (get_kernel_nofault(insn, (u32 *)(fentry_ip - AARCH64_INSN_SIZE * 2))) + return 0; + } else { + insn = *(u32 *)(fentry_ip - AARCH64_INSN_SIZE * 2); + } + + if (aarch64_insn_is_bti(le32_to_cpu((__le32)insn))) + return fentry_ip - AARCH64_INSN_SIZE * 2; + + return fentry_ip - AARCH64_INSN_SIZE; +} + /* * Replace a single instruction, which may be a branch or NOP. * If @validate == true, a replaced instruction is checked against 'old'. @@ -195,10 +258,17 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ftrace_modify_code(pc, 0, new, false); } -static struct plt_entry *get_ftrace_plt(struct module *mod) +static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr) { #ifdef CONFIG_MODULES - struct plt_entry *plt = mod->arch.ftrace_trampolines; + struct plt_entry *plt = NULL; + + if (within_module_mem_type(addr, mod, MOD_INIT_TEXT)) + plt = mod->arch.init_ftrace_trampolines; + else if (within_module_mem_type(addr, mod, MOD_TEXT)) + plt = mod->arch.ftrace_trampolines; + else + return NULL; return &plt[FTRACE_PLT_IDX]; #else @@ -257,20 +327,19 @@ static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, * dealing with an out-of-range condition, we can assume it * is due to a module being loaded far away from the kernel. * - * NOTE: __module_text_address() must be called with preemption - * disabled, but we can rely on ftrace_lock to ensure that 'mod' + * NOTE: __module_text_address() must be called within a RCU read + * section, but we can rely on ftrace_lock to ensure that 'mod' * retains its validity throughout the remainder of this code. */ if (!mod) { - preempt_disable(); + guard(rcu)(); mod = __module_text_address(pc); - preempt_enable(); } if (WARN_ON(!mod)) return false; - plt = get_ftrace_plt(mod); + plt = get_ftrace_plt(mod, pc); if (!plt) { pr_err("ftrace: no module PLT for %ps\n", (void *)*addr); return false; @@ -423,7 +492,7 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return ret; /* - * When using mcount, callsites in modules may have been initalized to + * When using mcount, callsites in modules may have been initialized to * call an arbitrary module PLT (which redirects to the _mcount stub) * rather than the ftrace PLT we'll use at runtime (which redirects to * the ftrace trampoline). We can ignore the old PLT when initializing @@ -481,7 +550,20 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - prepare_ftrace_return(ip, &fregs->lr, fregs->fp); + unsigned long return_hooker = (unsigned long)&return_to_handler; + unsigned long frame_pointer = arch_ftrace_regs(fregs)->fp; + unsigned long *parent = &arch_ftrace_regs(fregs)->lr; + unsigned long old; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + old = *parent; + + if (!function_graph_enter_regs(old, ip, frame_pointer, + (void *)frame_pointer, fregs)) { + *parent = return_hooker; + } } #else /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 7b236994f0e1..ca04b338cb0d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -32,6 +32,7 @@ #include <asm/scs.h> #include <asm/smp.h> #include <asm/sysreg.h> +#include <asm/stacktrace/frame.h> #include <asm/thread_info.h> #include <asm/virt.h> @@ -80,28 +81,42 @@ * x19 primary_entry() .. start_kernel() whether we entered with the MMU on * x20 primary_entry() .. __primary_switch() CPU boot mode * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 - * x22 create_idmap() .. start_kernel() ID map VA of the DT blob - * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset - * x24 __primary_switch() linear map KASLR seed - * x25 primary_entry() .. start_kernel() supported VA size - * x28 create_idmap() callee preserved temp register */ SYM_CODE_START(primary_entry) bl record_mmu_state bl preserve_boot_args - bl create_idmap + + adrp x1, early_init_stack + mov sp, x1 + mov x29, xzr + adrp x0, __pi_init_idmap_pg_dir + mov x1, xzr + bl __pi_create_init_idmap + + /* + * If the page tables have been populated with non-cacheable + * accesses (MMU disabled), invalidate those tables again to + * remove any speculatively loaded cache lines. + */ + cbnz x19, 0f + dmb sy + mov x1, x0 // end of used region + adrp x0, __pi_init_idmap_pg_dir + adr_l x2, dcache_inval_poc + blr x2 + b 1f /* * If we entered with the MMU and caches on, clean the ID mapped part * of the primary boot code to the PoC so we can safely execute it with * the MMU off. */ - cbz x19, 0f - adrp x0, __idmap_text_start +0: adrp x0, __idmap_text_start adr_l x1, __idmap_text_end adr_l x2, dcache_clean_poc blr x2 -0: mov x0, x19 + +1: mov x0, x19 bl init_kernel_el // w0=cpu_boot_mode mov x20, x0 @@ -111,14 +126,6 @@ SYM_CODE_START(primary_entry) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ -#if VA_BITS > 48 - mrs_s x0, SYS_ID_AA64MMFR2_EL1 - tst x0, ID_AA64MMFR2_EL1_VARange_MASK - mov x0, #VA_BITS - mov x25, #VA_BITS_MIN - csel x25, x25, x0, eq - mov x0, x25 -#endif bl __cpu_setup // initialise processor b __primary_switch SYM_CODE_END(primary_entry) @@ -177,267 +184,6 @@ SYM_CODE_START_LOCAL(preserve_boot_args) ret SYM_CODE_END(preserve_boot_args) -SYM_FUNC_START_LOCAL(clear_page_tables) - /* - * Clear the init page tables. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - sub x2, x1, x0 - mov x1, xzr - b __pi_memset // tail call -SYM_FUNC_END(clear_page_tables) - -/* - * Macro to populate page table entries, these entries can be pointers to the next level - * or last level entries pointing to physical memory. - * - * tbl: page table address - * rtbl: pointer to page table or physical memory - * index: start index to write - * eindex: end index to write - [index, eindex] written to - * flags: flags for pagetable entry to or in - * inc: increment to rtbl between each entry - * tmp1: temporary variable - * - * Preserves: tbl, eindex, flags, inc - * Corrupts: index, tmp1 - * Returns: rtbl - */ - .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1 -.Lpe\@: phys_to_pte \tmp1, \rtbl - orr \tmp1, \tmp1, \flags // tmp1 = table entry - str \tmp1, [\tbl, \index, lsl #3] - add \rtbl, \rtbl, \inc // rtbl = pa next level - add \index, \index, #1 - cmp \index, \eindex - b.ls .Lpe\@ - .endm - -/* - * Compute indices of table entries from virtual address range. If multiple entries - * were needed in the previous page table level then the next page table level is assumed - * to be composed of multiple pages. (This effectively scales the end index). - * - * vstart: virtual address of start of range - * vend: virtual address of end of range - we map [vstart, vend] - * shift: shift used to transform virtual address into index - * order: #imm 2log(number of entries in page table) - * istart: index in table corresponding to vstart - * iend: index in table corresponding to vend - * count: On entry: how many extra entries were required in previous level, scales - * our end index. - * On exit: returns how many extra entries required for next page table level - * - * Preserves: vstart, vend - * Returns: istart, iend, count - */ - .macro compute_indices, vstart, vend, shift, order, istart, iend, count - ubfx \istart, \vstart, \shift, \order - ubfx \iend, \vend, \shift, \order - add \iend, \iend, \count, lsl \order - sub \count, \iend, \istart - .endm - -/* - * Map memory for specified virtual address range. Each level of page table needed supports - * multiple entries. If a level requires n entries the next page table level is assumed to be - * formed from n pages. - * - * tbl: location of page table - * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) - * vstart: virtual address of start of range - * vend: virtual address of end of range - we map [vstart, vend - 1] - * flags: flags to use to map last level entries - * phys: physical address corresponding to vstart - physical memory is contiguous - * order: #imm 2log(number of entries in PGD table) - * - * If extra_shift is set, an extra level will be populated if the end address does - * not fit in 'extra_shift' bits. This assumes vend is in the TTBR0 range. - * - * Temporaries: istart, iend, tmp, count, sv - these need to be different registers - * Preserves: vstart, flags - * Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv - */ - .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv, extra_shift - sub \vend, \vend, #1 - add \rtbl, \tbl, #PAGE_SIZE - mov \count, #0 - - .ifnb \extra_shift - tst \vend, #~((1 << (\extra_shift)) - 1) - b.eq .L_\@ - compute_indices \vstart, \vend, #\extra_shift, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv - .endif -.L_\@: - compute_indices \vstart, \vend, #PGDIR_SHIFT, #\order, \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv - -#if SWAPPER_PGTABLE_LEVELS > 3 - compute_indices \vstart, \vend, #PUD_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv -#endif - -#if SWAPPER_PGTABLE_LEVELS > 2 - compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv -#endif - - compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - bic \rtbl, \phys, #SWAPPER_BLOCK_SIZE - 1 - populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp - .endm - -/* - * Remap a subregion created with the map_memory macro with modified attributes - * or output address. The entire remapped region must have been covered in the - * invocation of map_memory. - * - * x0: last level table address (returned in first argument to map_memory) - * x1: start VA of the existing mapping - * x2: start VA of the region to update - * x3: end VA of the region to update (exclusive) - * x4: start PA associated with the region to update - * x5: attributes to set on the updated region - * x6: order of the last level mappings - */ -SYM_FUNC_START_LOCAL(remap_region) - sub x3, x3, #1 // make end inclusive - - // Get the index offset for the start of the last level table - lsr x1, x1, x6 - bfi x1, xzr, #0, #PAGE_SHIFT - 3 - - // Derive the start and end indexes into the last level table - // associated with the provided region - lsr x2, x2, x6 - lsr x3, x3, x6 - sub x2, x2, x1 - sub x3, x3, x1 - - mov x1, #1 - lsl x6, x1, x6 // block size at this level - - populate_entries x0, x4, x2, x3, x5, x6, x7 - ret -SYM_FUNC_END(remap_region) - -SYM_FUNC_START_LOCAL(create_idmap) - mov x28, lr - /* - * The ID map carries a 1:1 mapping of the physical address range - * covered by the loaded image, which could be anywhere in DRAM. This - * means that the required size of the VA (== PA) space is decided at - * boot time, and could be more than the configured size of the VA - * space for ordinary kernel and user space mappings. - * - * There are three cases to consider here: - * - 39 <= VA_BITS < 48, and the ID map needs up to 48 VA bits to cover - * the placement of the image. In this case, we configure one extra - * level of translation on the fly for the ID map only. (This case - * also covers 42-bit VA/52-bit PA on 64k pages). - * - * - VA_BITS == 48, and the ID map needs more than 48 VA bits. This can - * only happen when using 64k pages, in which case we need to extend - * the root level table rather than add a level. Note that we can - * treat this case as 'always extended' as long as we take care not - * to program an unsupported T0SZ value into the TCR register. - * - * - Combinations that would require two additional levels of - * translation are not supported, e.g., VA_BITS==36 on 16k pages, or - * VA_BITS==39/4k pages with 5-level paging, where the input address - * requires more than 47 or 48 bits, respectively. - */ -#if (VA_BITS < 48) -#define IDMAP_PGD_ORDER (VA_BITS - PGDIR_SHIFT) -#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) - - /* - * If VA_BITS < 48, we have to configure an additional table level. - * First, we have to verify our assumption that the current value of - * VA_BITS was chosen such that all translation levels are fully - * utilised, and that lowering T0SZ will always result in an additional - * translation level to be configured. - */ -#if VA_BITS != EXTRA_SHIFT -#error "Mismatch between VA_BITS and page size/number of translation levels" -#endif -#else -#define IDMAP_PGD_ORDER (PHYS_MASK_SHIFT - PGDIR_SHIFT) -#define EXTRA_SHIFT - /* - * If VA_BITS == 48, we don't have to configure an additional - * translation level, but the top-level table has more entries. - */ -#endif - adrp x0, init_idmap_pg_dir - adrp x3, _text - adrp x6, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE - mov_q x7, SWAPPER_RX_MMUFLAGS - - map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT - - /* Remap the kernel page tables r/w in the ID map */ - adrp x1, _text - adrp x2, init_pg_dir - adrp x3, init_pg_end - bic x4, x2, #SWAPPER_BLOCK_SIZE - 1 - mov_q x5, SWAPPER_RW_MMUFLAGS - mov x6, #SWAPPER_BLOCK_SHIFT - bl remap_region - - /* Remap the FDT after the kernel image */ - adrp x1, _text - adrp x22, _end + SWAPPER_BLOCK_SIZE - bic x2, x22, #SWAPPER_BLOCK_SIZE - 1 - bfi x22, x21, #0, #SWAPPER_BLOCK_SHIFT // remapped FDT address - add x3, x2, #MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE - bic x4, x21, #SWAPPER_BLOCK_SIZE - 1 - mov_q x5, SWAPPER_RW_MMUFLAGS - mov x6, #SWAPPER_BLOCK_SHIFT - bl remap_region - - /* - * Since the page tables have been populated with non-cacheable - * accesses (MMU disabled), invalidate those tables again to - * remove any speculatively loaded cache lines. - */ - cbnz x19, 0f // skip cache invalidation if MMU is on - dmb sy - - adrp x0, init_idmap_pg_dir - adrp x1, init_idmap_pg_end - bl dcache_inval_poc -0: ret x28 -SYM_FUNC_END(create_idmap) - -SYM_FUNC_START_LOCAL(create_kernel_mapping) - adrp x0, init_pg_dir - mov_q x5, KIMAGE_VADDR // compile time __va(_text) -#ifdef CONFIG_RELOCATABLE - add x5, x5, x23 // add KASLR displacement -#endif - adrp x6, _end // runtime __pa(_end) - adrp x3, _text // runtime __pa(_text) - sub x6, x6, x3 // _end - _text - add x6, x6, x5 // runtime __va(_end) - mov_q x7, SWAPPER_RW_MMUFLAGS - - map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 - - dsb ishst // sync with page table walker - ret -SYM_FUNC_END(create_kernel_mapping) - /* * Initialize CPU registers with task-specific and cpu-specific context. * @@ -454,6 +200,8 @@ SYM_FUNC_END(create_kernel_mapping) sub sp, sp, #PT_REGS_SIZE stp xzr, xzr, [sp, #S_STACKFRAME] + mov \tmp1, #FRAME_META_TYPE_FINAL + str \tmp1, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME scs_load_current @@ -482,41 +230,16 @@ SYM_FUNC_START_LOCAL(__primary_switched) str_l x21, __fdt_pointer, x5 // Save FDT pointer - ldr_l x4, kimage_vaddr // Save the offset between + adrp x4, _text // Save the offset between sub x4, x4, x0 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings mov x0, x20 bl set_cpu_boot_mode_flag - // Clear BSS - adr_l x0, __bss_start - mov x1, xzr - adr_l x2, __bss_stop - sub x2, x2, x0 - bl __pi_memset - dsb ishst // Make zero page visible to PTW - -#if VA_BITS > 48 - adr_l x8, vabits_actual // Set this early so KASAN early init - str x25, [x8] // ... observes the correct value - dc civac, x8 // Make visible to booting secondaries -#endif - -#ifdef CONFIG_RANDOMIZE_BASE - adrp x5, memstart_offset_seed // Save KASLR linear map seed - strh w24, [x5, :lo12:memstart_offset_seed] -#endif #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif - mov x0, x21 // pass FDT address in x0 - bl early_fdt_map // Try mapping the FDT early - mov x0, x20 // pass the full boot status - bl init_feature_override // Parse cpu feature overrides -#ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS - bl scs_patch_vmlinux -#endif mov x0, x20 bl finalise_el2 // Prefer VHE if possible ldp x29, x30, [sp], #16 @@ -569,11 +292,14 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) adr_l x1, __hyp_text_end adr_l x2, dcache_clean_poc blr x2 -0: - mov_q x0, HCR_HOST_NVHE_FLAGS - msr hcr_el2, x0 + + mov_q x0, INIT_SCTLR_EL2_MMU_OFF + pre_disable_mmu_workaround + msr sctlr_el2, x0 isb +0: + init_el2_hcr HCR_HOST_NVHE_FLAGS init_el2_state /* Hypervisor stub */ @@ -583,27 +309,21 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) mov_q x1, INIT_SCTLR_EL1_MMU_OFF - /* - * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, - * making it impossible to start in nVHE mode. Is that - * compliant with the architecture? Absolutely not! - */ mrs x0, hcr_el2 and x0, x0, #HCR_E2H - cbz x0, 1f + cbz x0, 2f /* Set a sane SCTLR_EL1, the VHE way */ - pre_disable_mmu_workaround msr_s SYS_SCTLR_EL12, x1 mov x2, #BOOT_CPU_FLAG_E2H - b 2f + b 3f -1: - pre_disable_mmu_workaround +2: msr sctlr_el1, x1 mov x2, xzr -2: - __init_el2_nvhe_prepare_eret +3: + mov x0, #INIT_PSTATE_EL1 + msr spsr_el2, x0 mov w0, #BOOT_CPU_MODE_EL2 orr x0, x0, x2 @@ -643,10 +363,13 @@ SYM_FUNC_START_LOCAL(secondary_startup) * Common entry point for secondary CPUs. */ mov x20, x0 // preserve boot mode + +#ifdef CONFIG_ARM64_VA_BITS_52 +alternative_if ARM64_HAS_VA52 bl __cpu_secondary_check52bitva -#if VA_BITS > 48 - ldr_l x0, vabits_actual +alternative_else_nop_endif #endif + bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir adrp x2, idmap_pg_dir @@ -749,15 +472,18 @@ SYM_FUNC_START(__enable_mmu) ret SYM_FUNC_END(__enable_mmu) +#ifdef CONFIG_ARM64_VA_BITS_52 SYM_FUNC_START(__cpu_secondary_check52bitva) -#if VA_BITS > 48 - ldr_l x0, vabits_actual - cmp x0, #52 - b.ne 2f - +#ifndef CONFIG_ARM64_LPA2 mrs_s x0, SYS_ID_AA64MMFR2_EL1 and x0, x0, ID_AA64MMFR2_EL1_VARange_MASK cbnz x0, 2f +#else + mrs x0, id_aa64mmfr0_el1 + sbfx x0, x0, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 + cmp x0, #ID_AA64MMFR0_EL1_TGRAN_LPA2 + b.ge 2f +#endif update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_52_BIT_VA, x0, x1 @@ -765,9 +491,9 @@ SYM_FUNC_START(__cpu_secondary_check52bitva) wfi b 1b -#endif 2: ret SYM_FUNC_END(__cpu_secondary_check52bitva) +#endif SYM_FUNC_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ @@ -779,123 +505,18 @@ SYM_FUNC_START_LOCAL(__no_granule_support) b 1b SYM_FUNC_END(__no_granule_support) -#ifdef CONFIG_RELOCATABLE -SYM_FUNC_START_LOCAL(__relocate_kernel) - /* - * Iterate over each entry in the relocation table, and apply the - * relocations in place. - */ - adr_l x9, __rela_start - adr_l x10, __rela_end - mov_q x11, KIMAGE_VADDR // default virtual offset - add x11, x11, x23 // actual virtual offset - -0: cmp x9, x10 - b.hs 1f - ldp x12, x13, [x9], #24 - ldr x14, [x9, #-8] - cmp w13, #R_AARCH64_RELATIVE - b.ne 0b - add x14, x14, x23 // relocate - str x14, [x12, x23] - b 0b - -1: -#ifdef CONFIG_RELR - /* - * Apply RELR relocations. - * - * RELR is a compressed format for storing relative relocations. The - * encoded sequence of entries looks like: - * [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBB1 ... ] - * - * i.e. start with an address, followed by any number of bitmaps. The - * address entry encodes 1 relocation. The subsequent bitmap entries - * encode up to 63 relocations each, at subsequent offsets following - * the last address entry. - * - * The bitmap entries must have 1 in the least significant bit. The - * assumption here is that an address cannot have 1 in lsb. Odd - * addresses are not supported. Any odd addresses are stored in the RELA - * section, which is handled above. - * - * Excluding the least significant bit in the bitmap, each non-zero - * bit in the bitmap represents a relocation to be applied to - * a corresponding machine word that follows the base address - * word. The second least significant bit represents the machine - * word immediately following the initial address, and each bit - * that follows represents the next word, in linear order. As such, - * a single bitmap can encode up to 63 relocations in a 64-bit object. - * - * In this implementation we store the address of the next RELR table - * entry in x9, the address being relocated by the current address or - * bitmap entry in x13 and the address being relocated by the current - * bit in x14. - */ - adr_l x9, __relr_start - adr_l x10, __relr_end - -2: cmp x9, x10 - b.hs 7f - ldr x11, [x9], #8 - tbnz x11, #0, 3f // branch to handle bitmaps - add x13, x11, x23 - ldr x12, [x13] // relocate address entry - add x12, x12, x23 - str x12, [x13], #8 // adjust to start of bitmap - b 2b - -3: mov x14, x13 -4: lsr x11, x11, #1 - cbz x11, 6f - tbz x11, #0, 5f // skip bit if not set - ldr x12, [x14] // relocate bit - add x12, x12, x23 - str x12, [x14] - -5: add x14, x14, #8 // move to next bit's address - b 4b - -6: /* - * Move to the next bitmap's address. 8 is the word size, and 63 is the - * number of significant bits in a bitmap entry. - */ - add x13, x13, #(8 * 63) - b 2b - -7: -#endif - ret - -SYM_FUNC_END(__relocate_kernel) -#endif - SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir - adrp x2, init_idmap_pg_dir + adrp x2, __pi_init_idmap_pg_dir bl __enable_mmu -#ifdef CONFIG_RELOCATABLE - adrp x23, KERNEL_START - and x23, x23, MIN_KIMG_ALIGN - 1 -#ifdef CONFIG_RANDOMIZE_BASE - mov x0, x22 - adrp x1, init_pg_end + + adrp x1, early_init_stack mov sp, x1 mov x29, xzr - bl __pi_kaslr_early_init - and x24, x0, #SZ_2M - 1 // capture memstart offset seed - bic x0, x0, #SZ_2M - 1 - orr x23, x23, x0 // record kernel offset -#endif -#endif - bl clear_page_tables - bl create_kernel_mapping + mov x0, x20 // pass the full boot status + mov x1, x21 // pass the FDT + bl __pi_early_map_kernel // Map and relocate the kernel - adrp x1, init_pg_dir - load_ttbr1 x1, x1, x2 -#ifdef CONFIG_RELOCATABLE - bl __relocate_kernel -#endif ldr x8, =__primary_switched adrp x0, KERNEL_START // __pa(KERNEL_START) br x8 diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 02870beb271e..18749e9a6c2d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -266,9 +266,15 @@ static int swsusp_mte_save_tags(void) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { struct page *page = pfn_to_online_page(pfn); + struct folio *folio; if (!page) continue; + folio = page_folio(page); + + if (folio_test_hugetlb(folio) && + !folio_test_hugetlb_mte_tagged(folio)) + continue; if (!page_mte_tagged(page)) continue; @@ -407,7 +413,7 @@ int swsusp_arch_resume(void) void *, phys_addr_t, phys_addr_t); struct trans_pgd_info trans_info = { .trans_alloc_page = hibernate_page_alloc, - .trans_alloc_arg = (void *)GFP_ATOMIC, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, }; /* diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 35225632d70a..ab76b36dce82 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -21,6 +21,8 @@ #include <asm/current.h> #include <asm/debug-monitors.h> +#include <asm/esr.h> +#include <asm/exception.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/cputype.h> @@ -617,8 +619,7 @@ NOKPROBE_SYMBOL(toggle_bp_registers); /* * Debug exception handlers. */ -static int breakpoint_handler(unsigned long unused, unsigned long esr, - struct pt_regs *regs) +void do_breakpoint(unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step; u32 ctrl_reg; @@ -654,14 +655,14 @@ static int breakpoint_handler(unsigned long unused, unsigned long esr, perf_bp_event(bp, regs); /* Do we need to handle the stepping? */ - if (uses_default_overflow_handler(bp)) + if (is_default_overflow_handler(bp)) step = 1; unlock: rcu_read_unlock(); } if (!step) - return 0; + return; if (user_mode(regs)) { debug_info->bps_disabled = 1; @@ -669,7 +670,7 @@ unlock: /* If we're already stepping a watchpoint, just return. */ if (debug_info->wps_disabled) - return 0; + return; if (test_thread_flag(TIF_SINGLESTEP)) debug_info->suspended_step = 1; @@ -680,7 +681,7 @@ unlock: kernel_step = this_cpu_ptr(&stepping_kernel_bp); if (*kernel_step != ARM_KERNEL_STEP_NONE) - return 0; + return; if (kernel_active_single_step()) { *kernel_step = ARM_KERNEL_STEP_SUSPEND; @@ -689,10 +690,8 @@ unlock: kernel_enable_single_step(regs); } } - - return 0; } -NOKPROBE_SYMBOL(breakpoint_handler); +NOKPROBE_SYMBOL(do_breakpoint); /* * Arm64 hardware does not always report a watchpoint hit address that matches @@ -733,7 +732,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, static int watchpoint_report(struct perf_event *wp, unsigned long addr, struct pt_regs *regs) { - int step = uses_default_overflow_handler(wp); + int step = is_default_overflow_handler(wp); struct arch_hw_breakpoint *info = counter_arch_bp(wp); info->trigger = addr; @@ -751,8 +750,7 @@ static int watchpoint_report(struct perf_event *wp, unsigned long addr, return step; } -static int watchpoint_handler(unsigned long addr, unsigned long esr, - struct pt_regs *regs) +void do_watchpoint(unsigned long addr, unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step, access, closest_match = 0; u64 min_dist = -1, dist; @@ -779,7 +777,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, * Check that the access type matches. * 0 => load, otherwise => store */ - access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : + access = (esr & ESR_ELx_WNR) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) continue; @@ -807,7 +805,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, rcu_read_unlock(); if (!step) - return 0; + return; /* * We always disable EL0 watchpoints because the kernel can @@ -820,7 +818,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, /* If we're already stepping a breakpoint, just return. */ if (debug_info->bps_disabled) - return 0; + return; if (test_thread_flag(TIF_SINGLESTEP)) debug_info->suspended_step = 1; @@ -831,7 +829,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, kernel_step = this_cpu_ptr(&stepping_kernel_bp); if (*kernel_step != ARM_KERNEL_STEP_NONE) - return 0; + return; if (kernel_active_single_step()) { *kernel_step = ARM_KERNEL_STEP_SUSPEND; @@ -840,44 +838,41 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, kernel_enable_single_step(regs); } } - - return 0; } -NOKPROBE_SYMBOL(watchpoint_handler); +NOKPROBE_SYMBOL(do_watchpoint); /* * Handle single-step exception. */ -int reinstall_suspended_bps(struct pt_regs *regs) +bool try_step_suspended_breakpoints(struct pt_regs *regs) { struct debug_info *debug_info = ¤t->thread.debug; - int handled_exception = 0, *kernel_step; - - kernel_step = this_cpu_ptr(&stepping_kernel_bp); + int *kernel_step = this_cpu_ptr(&stepping_kernel_bp); + bool handled_exception = false; /* - * Called from single-step exception handler. - * Return 0 if execution can resume, 1 if a SIGTRAP should be - * reported. + * Called from single-step exception entry. + * Return true if we stepped a breakpoint and can resume execution, + * false if we need to handle a single-step. */ if (user_mode(regs)) { if (debug_info->bps_disabled) { debug_info->bps_disabled = 0; toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL0, 1); - handled_exception = 1; + handled_exception = true; } if (debug_info->wps_disabled) { debug_info->wps_disabled = 0; toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL0, 1); - handled_exception = 1; + handled_exception = true; } if (handled_exception) { if (debug_info->suspended_step) { debug_info->suspended_step = 0; /* Allow exception handling to fall-through. */ - handled_exception = 0; + handled_exception = false; } else { user_disable_single_step(current); } @@ -891,17 +886,17 @@ int reinstall_suspended_bps(struct pt_regs *regs) if (*kernel_step != ARM_KERNEL_STEP_SUSPEND) { kernel_disable_single_step(); - handled_exception = 1; + handled_exception = true; } else { - handled_exception = 0; + handled_exception = false; } *kernel_step = ARM_KERNEL_STEP_NONE; } - return !handled_exception; + return handled_exception; } -NOKPROBE_SYMBOL(reinstall_suspended_bps); +NOKPROBE_SYMBOL(try_step_suspended_breakpoints); /* * Context-switcher for restoring suspended breakpoints. @@ -986,12 +981,6 @@ static int __init arch_hw_breakpoint_init(void) pr_info("found %d breakpoint and %d watchpoint registers.\n", core_num_brps, core_num_wrps); - /* Register debug fault handlers. */ - hook_debug_fault_code(DBG_ESR_EVT_HWBP, breakpoint_handler, SIGTRAP, - TRAP_HWBKPT, "hw-breakpoint handler"); - hook_debug_fault_code(DBG_ESR_EVT_HWWP, watchpoint_handler, SIGTRAP, - TRAP_HWBKPT, "hw-watchpoint handler"); - /* * Reset the breakpoint resources. We assume that a halting * debugger will leave the world in a nice state for us. diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 65f76064c86b..085bc9972f6b 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -54,6 +54,11 @@ SYM_CODE_START_LOCAL(elx_sync) 1: cmp x0, #HVC_FINALISE_EL2 b.eq __finalise_el2 + cmp x0, #HVC_GET_ICH_VTR_EL2 + b.ne 2f + mrs_s x1, SYS_ICH_VTR_EL2 + b 9f + 2: cmp x0, #HVC_SOFT_RESTART b.ne 3f mov x0, x2 @@ -97,7 +102,7 @@ SYM_CODE_START_LOCAL(__finalise_el2) 2: // Engage the VHE magic! mov_q x0, HCR_HOST_VHE_FLAGS - msr hcr_el2, x0 + msr_hcr_el2 x0 isb // Use the EL1 allocated stack, per-cpu offset @@ -114,8 +119,8 @@ SYM_CODE_START_LOCAL(__finalise_el2) // Use EL2 translations for SPE & TRBE and disable access from EL1 mrs x0, mdcr_el2 - bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) - bic x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) + bic x0, x0, #MDCR_EL2_E2PB_MASK + bic x0, x0, #MDCR_EL2_E2TB_MASK msr mdcr_el2, x0 // Transfer the MM state from EL1 to EL2 diff --git a/arch/arm64/kernel/idle.c b/arch/arm64/kernel/idle.c index c1125753fe9b..05cfb347ec26 100644 --- a/arch/arm64/kernel/idle.c +++ b/arch/arm64/kernel/idle.c @@ -20,7 +20,7 @@ * ensure that interrupts are not masked at the PMR (because the core will * not wake up if we block the wake up signal in the interrupt controller). */ -void noinstr cpu_do_idle(void) +void __cpuidle cpu_do_idle(void) { struct arm_cpuidle_irq_context context; @@ -35,7 +35,7 @@ void noinstr cpu_do_idle(void) /* * This is our default idle handler. */ -void noinstr arch_cpu_idle(void) +void __cpuidle arch_cpu_idle(void) { /* * This should do all the clock switching and wait for interrupt diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c deleted file mode 100644 index aee12c75b738..000000000000 --- a/arch/arm64/kernel/idreg-override.c +++ /dev/null @@ -1,338 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Early cpufeature override framework - * - * Copyright (C) 2020 Google LLC - * Author: Marc Zyngier <maz@kernel.org> - */ - -#include <linux/ctype.h> -#include <linux/kernel.h> -#include <linux/libfdt.h> - -#include <asm/cacheflush.h> -#include <asm/cpufeature.h> -#include <asm/setup.h> - -#define FTR_DESC_NAME_LEN 20 -#define FTR_DESC_FIELD_LEN 10 -#define FTR_ALIAS_NAME_LEN 30 -#define FTR_ALIAS_OPTION_LEN 116 - -static u64 __boot_status __initdata; - -struct ftr_set_desc { - char name[FTR_DESC_NAME_LEN]; - struct arm64_ftr_override *override; - struct { - char name[FTR_DESC_FIELD_LEN]; - u8 shift; - u8 width; - bool (*filter)(u64 val); - } fields[]; -}; - -#define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } - -static bool __init mmfr1_vh_filter(u64 val) -{ - /* - * If we ever reach this point while running VHE, we're - * guaranteed to be on one of these funky, VHE-stuck CPUs. If - * the user was trying to force nVHE on us, proceed with - * attitude adjustment. - */ - return !(__boot_status == (BOOT_CPU_FLAG_E2H | BOOT_CPU_MODE_EL2) && - val == 0); -} - -static const struct ftr_set_desc mmfr1 __initconst = { - .name = "id_aa64mmfr1", - .override = &id_aa64mmfr1_override, - .fields = { - FIELD("vh", ID_AA64MMFR1_EL1_VH_SHIFT, mmfr1_vh_filter), - {} - }, -}; - -static bool __init pfr0_sve_filter(u64 val) -{ - /* - * Disabling SVE also means disabling all the features that - * are associated with it. The easiest way to do it is just to - * override id_aa64zfr0_el1 to be 0. - */ - if (!val) { - id_aa64zfr0_override.val = 0; - id_aa64zfr0_override.mask = GENMASK(63, 0); - } - - return true; -} - -static const struct ftr_set_desc pfr0 __initconst = { - .name = "id_aa64pfr0", - .override = &id_aa64pfr0_override, - .fields = { - FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), - {} - }, -}; - -static bool __init pfr1_sme_filter(u64 val) -{ - /* - * Similarly to SVE, disabling SME also means disabling all - * the features that are associated with it. Just set - * id_aa64smfr0_el1 to 0 and don't look back. - */ - if (!val) { - id_aa64smfr0_override.val = 0; - id_aa64smfr0_override.mask = GENMASK(63, 0); - } - - return true; -} - -static const struct ftr_set_desc pfr1 __initconst = { - .name = "id_aa64pfr1", - .override = &id_aa64pfr1_override, - .fields = { - FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), - FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), - FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), - {} - }, -}; - -static const struct ftr_set_desc isar1 __initconst = { - .name = "id_aa64isar1", - .override = &id_aa64isar1_override, - .fields = { - FIELD("gpi", ID_AA64ISAR1_EL1_GPI_SHIFT, NULL), - FIELD("gpa", ID_AA64ISAR1_EL1_GPA_SHIFT, NULL), - FIELD("api", ID_AA64ISAR1_EL1_API_SHIFT, NULL), - FIELD("apa", ID_AA64ISAR1_EL1_APA_SHIFT, NULL), - {} - }, -}; - -static const struct ftr_set_desc isar2 __initconst = { - .name = "id_aa64isar2", - .override = &id_aa64isar2_override, - .fields = { - FIELD("gpa3", ID_AA64ISAR2_EL1_GPA3_SHIFT, NULL), - FIELD("apa3", ID_AA64ISAR2_EL1_APA3_SHIFT, NULL), - FIELD("mops", ID_AA64ISAR2_EL1_MOPS_SHIFT, NULL), - {} - }, -}; - -static const struct ftr_set_desc smfr0 __initconst = { - .name = "id_aa64smfr0", - .override = &id_aa64smfr0_override, - .fields = { - FIELD("smever", ID_AA64SMFR0_EL1_SMEver_SHIFT, NULL), - /* FA64 is a one bit field... :-/ */ - { "fa64", ID_AA64SMFR0_EL1_FA64_SHIFT, 1, }, - {} - }, -}; - -static bool __init hvhe_filter(u64 val) -{ - u64 mmfr1 = read_sysreg(id_aa64mmfr1_el1); - - return (val == 1 && - lower_32_bits(__boot_status) == BOOT_CPU_MODE_EL2 && - cpuid_feature_extract_unsigned_field(mmfr1, - ID_AA64MMFR1_EL1_VH_SHIFT)); -} - -static const struct ftr_set_desc sw_features __initconst = { - .name = "arm64_sw", - .override = &arm64_sw_feature_override, - .fields = { - FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), - FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), - {} - }, -}; - -static const struct ftr_set_desc * const regs[] __initconst = { - &mmfr1, - &pfr0, - &pfr1, - &isar1, - &isar2, - &smfr0, - &sw_features, -}; - -static const struct { - char alias[FTR_ALIAS_NAME_LEN]; - char feature[FTR_ALIAS_OPTION_LEN]; -} aliases[] __initconst = { - { "kvm-arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, - { "kvm-arm.mode=protected", "id_aa64mmfr1.vh=0" }, - { "arm64.nosve", "id_aa64pfr0.sve=0" }, - { "arm64.nosme", "id_aa64pfr1.sme=0" }, - { "arm64.nobti", "id_aa64pfr1.bt=0" }, - { "arm64.nopauth", - "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " - "id_aa64isar1.api=0 id_aa64isar1.apa=0 " - "id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0" }, - { "arm64.nomops", "id_aa64isar2.mops=0" }, - { "arm64.nomte", "id_aa64pfr1.mte=0" }, - { "nokaslr", "arm64_sw.nokaslr=1" }, -}; - -static int __init parse_nokaslr(char *unused) -{ - /* nokaslr param handling is done by early cpufeature code */ - return 0; -} -early_param("nokaslr", parse_nokaslr); - -static int __init find_field(const char *cmdline, - const struct ftr_set_desc *reg, int f, u64 *v) -{ - char opt[FTR_DESC_NAME_LEN + FTR_DESC_FIELD_LEN + 2]; - int len; - - len = snprintf(opt, ARRAY_SIZE(opt), "%s.%s=", - reg->name, reg->fields[f].name); - - if (!parameqn(cmdline, opt, len)) - return -1; - - return kstrtou64(cmdline + len, 0, v); -} - -static void __init match_options(const char *cmdline) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(regs); i++) { - int f; - - if (!regs[i]->override) - continue; - - for (f = 0; strlen(regs[i]->fields[f].name); f++) { - u64 shift = regs[i]->fields[f].shift; - u64 width = regs[i]->fields[f].width ?: 4; - u64 mask = GENMASK_ULL(shift + width - 1, shift); - u64 v; - - if (find_field(cmdline, regs[i], f, &v)) - continue; - - /* - * If an override gets filtered out, advertise - * it by setting the value to the all-ones while - * clearing the mask... Yes, this is fragile. - */ - if (regs[i]->fields[f].filter && - !regs[i]->fields[f].filter(v)) { - regs[i]->override->val |= mask; - regs[i]->override->mask &= ~mask; - continue; - } - - regs[i]->override->val &= ~mask; - regs[i]->override->val |= (v << shift) & mask; - regs[i]->override->mask |= mask; - - return; - } - } -} - -static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) -{ - do { - char buf[256]; - size_t len; - int i; - - cmdline = skip_spaces(cmdline); - - for (len = 0; cmdline[len] && !isspace(cmdline[len]); len++); - if (!len) - return; - - len = strscpy(buf, cmdline, ARRAY_SIZE(buf)); - if (len == -E2BIG) - len = ARRAY_SIZE(buf) - 1; - - if (strcmp(buf, "--") == 0) - return; - - cmdline += len; - - match_options(buf); - - for (i = 0; parse_aliases && i < ARRAY_SIZE(aliases); i++) - if (parameq(buf, aliases[i].alias)) - __parse_cmdline(aliases[i].feature, false); - } while (1); -} - -static __init const u8 *get_bootargs_cmdline(void) -{ - const u8 *prop; - void *fdt; - int node; - - fdt = get_early_fdt_ptr(); - if (!fdt) - return NULL; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - return NULL; - - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - return NULL; - - return strlen(prop) ? prop : NULL; -} - -static __init void parse_cmdline(void) -{ - const u8 *prop = get_bootargs_cmdline(); - - if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) - __parse_cmdline(CONFIG_CMDLINE, true); - - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) - __parse_cmdline(prop, true); -} - -/* Keep checkers quiet */ -void init_feature_override(u64 boot_status); - -asmlinkage void __init init_feature_override(u64 boot_status) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(regs); i++) { - if (regs[i]->override) { - regs[i]->override->val = 0; - regs[i]->override->mask = 0; - } - } - - __boot_status = boot_status; - - parse_cmdline(); - - for (i = 0; i < ARRAY_SIZE(regs); i++) { - if (regs[i]->override) - dcache_clean_inval_poc((unsigned long)regs[i]->override, - (unsigned long)regs[i]->override + - sizeof(*regs[i]->override)); - } -} diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 35f3c7959513..85bc629270bd 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -10,6 +10,16 @@ #error This file should only be included in vmlinux.lds.S #endif +#if defined(CONFIG_LD_IS_LLD) && CONFIG_LLD_VERSION < 210000 +#define ASSERT(...) +#endif + +#define PI_EXPORT_SYM(sym) \ + __PI_EXPORT_SYM(sym, __pi_ ## sym, Cannot export BSS symbol sym to startup code) +#define __PI_EXPORT_SYM(sym, pisym, msg)\ + PROVIDE(pisym = sym); \ + ASSERT((sym - KIMAGE_VADDR) < (__bss_start - KIMAGE_VADDR), #msg) + PROVIDE(__efistub_primary_entry = primary_entry); /* @@ -27,13 +37,40 @@ PROVIDE(__efistub__text = _text); PROVIDE(__efistub__end = _end); PROVIDE(__efistub___inittext_end = __inittext_end); PROVIDE(__efistub__edata = _edata); +#if defined(CONFIG_EFI_EARLYCON) || defined(CONFIG_SYSFB) PROVIDE(__efistub_screen_info = screen_info); +#endif PROVIDE(__efistub__ctype = _ctype); PROVIDE(__pi___memcpy = __pi_memcpy); PROVIDE(__pi___memmove = __pi_memmove); PROVIDE(__pi___memset = __pi_memset); +PI_EXPORT_SYM(id_aa64isar1_override); +PI_EXPORT_SYM(id_aa64isar2_override); +PI_EXPORT_SYM(id_aa64mmfr0_override); +PI_EXPORT_SYM(id_aa64mmfr1_override); +PI_EXPORT_SYM(id_aa64mmfr2_override); +PI_EXPORT_SYM(id_aa64pfr0_override); +PI_EXPORT_SYM(id_aa64pfr1_override); +PI_EXPORT_SYM(id_aa64smfr0_override); +PI_EXPORT_SYM(id_aa64zfr0_override); +PI_EXPORT_SYM(arm64_sw_feature_override); +PI_EXPORT_SYM(arm64_use_ng_mappings); +PI_EXPORT_SYM(_ctype); + +PI_EXPORT_SYM(swapper_pg_dir); + +PI_EXPORT_SYM(_text); +PI_EXPORT_SYM(_stext); +PI_EXPORT_SYM(_etext); +PI_EXPORT_SYM(__start_rodata); +PI_EXPORT_SYM(__inittext_begin); +PI_EXPORT_SYM(__inittext_end); +PI_EXPORT_SYM(__initdata_begin); +PI_EXPORT_SYM(__initdata_end); +PI_EXPORT_SYM(_data); + #ifdef CONFIG_KVM /* @@ -54,6 +91,7 @@ KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); KVM_NVHE_ALIAS(alt_cb_patch_nops); +KVM_NVHE_ALIAS(kvm_compute_ich_hcr_trap_bits); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_vgic_global_state); @@ -68,20 +106,16 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors); KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); -#ifdef CONFIG_ARM64_PSEUDO_NMI -/* Static key checked in GIC_PRIO_IRQOFF. */ -KVM_NVHE_ALIAS(gic_nonsecure_priorities); -#endif +/* Static key indicating whether GICv3 has GICv2 compatibility */ +KVM_NVHE_ALIAS(vgic_v3_has_v2_compat); + +/* Static key which is set if CNTVOFF_EL2 is unusable */ +KVM_NVHE_ALIAS(broken_cntvoff_key); /* EL2 exception handling */ KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); -/* PMU available static key */ -#ifdef CONFIG_HW_PERF_EVENTS -KVM_NVHE_ALIAS(kvm_arm_pmu_available); -#endif - /* Position-independent library routines */ KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page); @@ -100,6 +134,8 @@ KVM_NVHE_ALIAS(__hyp_text_start); KVM_NVHE_ALIAS(__hyp_text_end); KVM_NVHE_ALIAS(__hyp_bss_start); KVM_NVHE_ALIAS(__hyp_bss_end); +KVM_NVHE_ALIAS(__hyp_data_start); +KVM_NVHE_ALIAS(__hyp_data_end); KVM_NVHE_ALIAS(__hyp_rodata_start); KVM_NVHE_ALIAS(__hyp_rodata_end); @@ -112,4 +148,17 @@ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); _kernel_codesize = ABSOLUTE(__inittext_end - _text); #endif +/* + * LLD will occasionally error out with a '__init_end does not converge' error + * if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this results in a + * circular dependency. Counter this by dimensioning the initial IDMAP page + * tables based on kimage_limit, which is defined such that its value should + * not change as a result of the initdata segment being pushed over a 64k + * segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided that its + * value doesn't change by more than 2M between linker passes. + */ +kimage_limit = ALIGN(ABSOLUTE(_end + SZ_64K), SZ_2M); + +#undef ASSERT + #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index aa7a4ec6a3ae..fe86ada23c7d 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -10,88 +10,43 @@ #include <linux/io.h> /* - * Copy data from IO memory space to "real" memory space. + * This generates a memcpy that works on a from/to address which is aligned to + * bits. Count is in terms of the number of bits sized quantities to copy. It + * optimizes to use the STR groupings when possible so that it is WC friendly. */ -void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) +#define memcpy_toio_aligned(to, from, count, bits) \ + ({ \ + volatile u##bits __iomem *_to = to; \ + const u##bits *_from = from; \ + size_t _count = count; \ + const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \ + \ + for (; _from < _end_from; _from += 8, _to += 8) \ + __const_memcpy_toio_aligned##bits(_to, _from, 8); \ + if ((_count % 8) >= 4) { \ + __const_memcpy_toio_aligned##bits(_to, _from, 4); \ + _from += 4; \ + _to += 4; \ + } \ + if ((_count % 4) >= 2) { \ + __const_memcpy_toio_aligned##bits(_to, _from, 2); \ + _from += 2; \ + _to += 2; \ + } \ + if (_count % 2) \ + __const_memcpy_toio_aligned##bits(_to, _from, 1); \ + }) + +void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count) { - while (count && !IS_ALIGNED((unsigned long)from, 8)) { - *(u8 *)to = __raw_readb(from); - from++; - to++; - count--; - } - - while (count >= 8) { - *(u64 *)to = __raw_readq(from); - from += 8; - to += 8; - count -= 8; - } - - while (count) { - *(u8 *)to = __raw_readb(from); - from++; - to++; - count--; - } + memcpy_toio_aligned(to, from, count, 64); + dgh(); } -EXPORT_SYMBOL(__memcpy_fromio); +EXPORT_SYMBOL(__iowrite64_copy_full); -/* - * Copy data from "real" memory space to IO memory space. - */ -void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) +void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count) { - while (count && !IS_ALIGNED((unsigned long)to, 8)) { - __raw_writeb(*(u8 *)from, to); - from++; - to++; - count--; - } - - while (count >= 8) { - __raw_writeq(*(u64 *)from, to); - from += 8; - to += 8; - count -= 8; - } - - while (count) { - __raw_writeb(*(u8 *)from, to); - from++; - to++; - count--; - } -} -EXPORT_SYMBOL(__memcpy_toio); - -/* - * "memset" on IO memory space. - */ -void __memset_io(volatile void __iomem *dst, int c, size_t count) -{ - u64 qc = (u8)c; - - qc |= qc << 8; - qc |= qc << 16; - qc |= qc << 32; - - while (count && !IS_ALIGNED((unsigned long)dst, 8)) { - __raw_writeb(c, dst); - dst++; - count--; - } - - while (count >= 8) { - __raw_writeq(qc, dst); - dst += 8; - count -= 8; - } - - while (count) { - __raw_writeb(c, dst); - dst++; - count--; - } + memcpy_toio_aligned(to, from, count, 32); + dgh(); } -EXPORT_SYMBOL(__memset_io); +EXPORT_SYMBOL(__iowrite32_copy_full); diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 6ad5c6ef5329..15dedb385b9e 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -22,6 +22,7 @@ #include <linux/vmalloc.h> #include <asm/daifflags.h> #include <asm/exception.h> +#include <asm/numa.h> #include <asm/softirq_stack.h> #include <asm/stacktrace.h> #include <asm/vmap_stack.h> @@ -47,34 +48,21 @@ static void init_irq_scs(void) for_each_possible_cpu(cpu) per_cpu(irq_shadow_call_stack_ptr, cpu) = - scs_alloc(cpu_to_node(cpu)); + scs_alloc(early_cpu_to_node(cpu)); } -#ifdef CONFIG_VMAP_STACK -static void init_irq_stacks(void) +static void __init init_irq_stacks(void) { int cpu; unsigned long *p; for_each_possible_cpu(cpu) { - p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu)); + p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, early_cpu_to_node(cpu)); per_cpu(irq_stack_ptr, cpu) = p; } } -#else -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ -DEFINE_PER_CPU_ALIGNED(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); -static void init_irq_stacks(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu); -} -#endif - -#ifndef CONFIG_PREEMPT_RT +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK static void ____do_softirq(struct pt_regs *regs) { __do_softirq(); diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index faf88ec9c48e..b345425193d2 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -7,11 +7,12 @@ */ #include <linux/kernel.h> #include <linux/jump_label.h> +#include <linux/smp.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) { void *addr = (void *)jump_entry_code(entry); u32 insn; @@ -25,4 +26,10 @@ void arch_jump_label_transform(struct jump_entry *entry, } aarch64_insn_patch_text_nosync(addr, insn); + return true; +} + +void arch_jump_label_transform_apply(void) +{ + kick_all_cpus_sync(); } diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 94a269cd1f07..c9503ed45a6c 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -10,15 +10,11 @@ #include <asm/cpufeature.h> #include <asm/memory.h> -u16 __initdata memstart_offset_seed; - bool __ro_after_init __kaslr_is_enabled = false; void __init kaslr_init(void) { - if (cpuid_feature_extract_unsigned_field(arm64_sw_feature_override.val & - arm64_sw_feature_override.mask, - ARM64_SW_FEATURE_OVERRIDE_NOKASLR)) { + if (kaslr_disabled_cmdline()) { pr_info("KASLR disabled on command line\n"); return; } @@ -36,3 +32,10 @@ void __init kaslr_init(void) pr_info("KASLR enabled\n"); __kaslr_is_enabled = true; } + +static int __init parse_nokaslr(char *unused) +{ + /* nokaslr param handling is done by early cpufeature code */ + return 0; +} +early_param("nokaslr", parse_nokaslr); diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c index 636be6715155..532d72ea42ee 100644 --- a/arch/arm64/kernel/kexec_image.c +++ b/arch/arm64/kernel/kexec_image.c @@ -122,9 +122,9 @@ static void *image_load(struct kimage *image, kernel_segment->memsz -= text_offset; image->start = kernel_segment->mem; - pr_debug("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kernel_segment->mem, kbuf.bufsz, - kernel_segment->memsz); + kexec_dprintk("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_segment->mem, kbuf.bufsz, + kernel_segment->memsz); return NULL; } diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 4e1f983df3d1..968324a79a89 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -17,7 +17,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/traps.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -234,23 +234,23 @@ int kgdb_arch_handle_exception(int exception_vector, int signo, return err; } -static int kgdb_brk_fn(struct pt_regs *regs, unsigned long esr) +int kgdb_brk_handler(struct pt_regs *regs, unsigned long esr) { kgdb_handle_exception(1, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } -NOKPROBE_SYMBOL(kgdb_brk_fn) +NOKPROBE_SYMBOL(kgdb_brk_handler) -static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned long esr) +int kgdb_compiled_brk_handler(struct pt_regs *regs, unsigned long esr) { compiled_break = 1; kgdb_handle_exception(1, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } -NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); +NOKPROBE_SYMBOL(kgdb_compiled_brk_handler); -static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned long esr) +int kgdb_single_step_handler(struct pt_regs *regs, unsigned long esr) { if (!kgdb_single_step) return DBG_HOOK_ERROR; @@ -258,21 +258,7 @@ static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned long esr) kgdb_handle_exception(0, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } -NOKPROBE_SYMBOL(kgdb_step_brk_fn); - -static struct break_hook kgdb_brkpt_hook = { - .fn = kgdb_brk_fn, - .imm = KGDB_DYN_DBG_BRK_IMM, -}; - -static struct break_hook kgdb_compiled_brkpt_hook = { - .fn = kgdb_compiled_brk_fn, - .imm = KGDB_COMPILED_DBG_BRK_IMM, -}; - -static struct step_hook kgdb_step_hook = { - .fn = kgdb_step_brk_fn -}; +NOKPROBE_SYMBOL(kgdb_single_step_handler); static int __kgdb_notify(struct die_args *args, unsigned long cmd) { @@ -311,15 +297,7 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - int ret = register_die_notifier(&kgdb_notifier); - - if (ret != 0) - return ret; - - register_kernel_break_hook(&kgdb_brkpt_hook); - register_kernel_break_hook(&kgdb_compiled_brkpt_hook); - register_kernel_step_hook(&kgdb_step_hook); - return 0; + return register_die_notifier(&kgdb_notifier); } /* @@ -329,9 +307,6 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { - unregister_kernel_break_hook(&kgdb_brkpt_hook); - unregister_kernel_break_hook(&kgdb_compiled_brkpt_hook); - unregister_kernel_step_hook(&kgdb_step_hook); unregister_die_notifier(&kgdb_notifier); } diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 078910db77a4..239c16e3d02f 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -32,26 +32,12 @@ static void _kexec_image_info(const char *func, int line, const struct kimage *kimage) { - unsigned long i; - - pr_debug("%s:%d:\n", func, line); - pr_debug(" kexec kimage info:\n"); - pr_debug(" type: %d\n", kimage->type); - pr_debug(" start: %lx\n", kimage->start); - pr_debug(" head: %lx\n", kimage->head); - pr_debug(" nr_segments: %lu\n", kimage->nr_segments); - pr_debug(" dtb_mem: %pa\n", &kimage->arch.dtb_mem); - pr_debug(" kern_reloc: %pa\n", &kimage->arch.kern_reloc); - pr_debug(" el2_vectors: %pa\n", &kimage->arch.el2_vectors); - - for (i = 0; i < kimage->nr_segments; i++) { - pr_debug(" segment[%lu]: %016lx - %016lx, 0x%lx bytes, %lu pages\n", - i, - kimage->segment[i].mem, - kimage->segment[i].mem + kimage->segment[i].memsz, - kimage->segment[i].memsz, - kimage->segment[i].memsz / PAGE_SIZE); - } + kexec_dprintk("%s:%d:\n", func, line); + kexec_dprintk(" kexec kimage info:\n"); + kexec_dprintk(" type: %d\n", kimage->type); + kexec_dprintk(" head: %lx\n", kimage->head); + kexec_dprintk(" kern_reloc: %pa\n", &kimage->arch.kern_reloc); + kexec_dprintk(" el2_vectors: %pa\n", &kimage->arch.el2_vectors); } void machine_kexec_cleanup(struct kimage *kimage) @@ -221,37 +207,6 @@ void machine_kexec(struct kimage *kimage) BUG(); /* Should never get here. */ } -static void machine_kexec_mask_interrupts(void) -{ - unsigned int i; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - struct irq_chip *chip; - int ret; - - chip = irq_desc_get_chip(desc); - if (!chip) - continue; - - /* - * First try to remove the active state. If this - * fails, try to EOI the interrupt. - */ - ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false); - - if (ret && irqd_irq_inprogress(&desc->irq_data) && - chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); - - if (chip->irq_mask) - chip->irq_mask(&desc->irq_data); - - if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) - chip->irq_disable(&desc->irq_data); - } -} - /** * machine_crash_shutdown - shutdown non-crashing cpus and save registers */ @@ -269,7 +224,7 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Starting crashdump kernel...\n"); } -#ifdef CONFIG_HIBERNATION +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) /* * To preserve the crash dump kernel image, the relevant memory segments * should be mapped again around the hibernation. @@ -296,7 +251,7 @@ void crash_post_resume(void) * marked as Reserved as memory was allocated via memblock_reserve(). * * In hibernation, the pages which are Reserved and yet "nosave" are excluded - * from the hibernation iamge. crash_is_nosave() does thich check for crash + * from the hibernation image. crash_is_nosave() does thich check for crash * dump kernel and will reduce the total size of hibernation image. */ diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index a11a6e14ba89..410060ebd86d 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -39,6 +39,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } +#ifdef CONFIG_CRASH_DUMP static int prepare_elf_headers(void **addr, unsigned long *sz) { struct crash_mem *cmem; @@ -80,6 +81,7 @@ out: kfree(cmem); return ret; } +#endif /* * Tries to add the initrd and DTB to the image. If it is not possible to find @@ -92,9 +94,9 @@ int load_other_segments(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline) { - struct kexec_buf kbuf; - void *headers, *dtb = NULL; - unsigned long headers_sz, initrd_load_addr = 0, dtb_len, + struct kexec_buf kbuf = {}; + void *dtb = NULL; + unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; int ret = 0; @@ -102,7 +104,10 @@ int load_other_segments(struct kimage *image, /* not allocate anything below the kernel */ kbuf.buf_min = kernel_load_addr + kernel_size; +#ifdef CONFIG_CRASH_DUMP /* load elf core header */ + void *headers; + unsigned long headers_sz; if (image->type == KEXEC_TYPE_CRASH) { ret = prepare_elf_headers(&headers, &headers_sz); if (ret) { @@ -127,9 +132,10 @@ int load_other_segments(struct kimage *image, image->elf_load_addr = kbuf.mem; image->elf_headers_sz = headers_sz; - pr_debug("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->elf_load_addr, kbuf.bufsz, kbuf.memsz); + kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); } +#endif /* load initrd */ if (initrd) { @@ -148,8 +154,8 @@ int load_other_segments(struct kimage *image, goto out_err; initrd_load_addr = kbuf.mem; - pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - initrd_load_addr, kbuf.bufsz, kbuf.memsz); + kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, kbuf.bufsz, kbuf.memsz); } /* load dtb */ @@ -179,8 +185,8 @@ int load_other_segments(struct kimage *image, image->arch.dtb = dtb; image->arch.dtb_mem = kbuf.mem; - pr_debug("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kbuf.mem, kbuf.bufsz, kbuf.memsz); + kexec_dprintk("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kbuf.mem, kbuf.bufsz, kbuf.memsz); return 0; diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c index bd69a4e7cd60..7afd370da9f4 100644 --- a/arch/arm64/kernel/module-plts.c +++ b/arch/arm64/kernel/module-plts.c @@ -167,9 +167,6 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, switch (ELF64_R_TYPE(rela[i].r_info)) { case R_AARCH64_JUMP26: case R_AARCH64_CALL26: - if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - break; - /* * We only have to consider branch targets that resolve * to symbols that are defined in a different section. @@ -203,8 +200,7 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, break; case R_AARCH64_ADR_PREL_PG_HI21_NC: case R_AARCH64_ADR_PREL_PG_HI21: - if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) || - !cpus_have_const_cap(ARM64_WORKAROUND_843419)) + if (!cpus_have_final_cap(ARM64_WORKAROUND_843419)) break; /* @@ -239,13 +235,13 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, } } - if (IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) && - cpus_have_const_cap(ARM64_WORKAROUND_843419)) + if (cpus_have_final_cap(ARM64_WORKAROUND_843419)) { /* * Add some slack so we can skip PLT slots that may trigger * the erratum due to the placement of the ADRP instruction. */ ret += DIV_ROUND_UP(ret, (SZ_4K / sizeof(struct plt_entry))); + } return ret; } @@ -269,9 +265,6 @@ static int partition_branch_plt_relas(Elf64_Sym *syms, Elf64_Rela *rela, { int i = 0, j = numrels - 1; - if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - return 0; - while (i < j) { if (branch_rela_needs_plt(syms, &rela[i], dstidx)) i++; @@ -290,7 +283,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, unsigned long core_plts = 0; unsigned long init_plts = 0; Elf64_Sym *syms = NULL; - Elf_Shdr *pltsec, *tramp = NULL; + Elf_Shdr *pltsec, *tramp = NULL, *init_tramp = NULL; int i; /* @@ -305,6 +298,9 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, else if (!strcmp(secstrings + sechdrs[i].sh_name, ".text.ftrace_trampoline")) tramp = sechdrs + i; + else if (!strcmp(secstrings + sechdrs[i].sh_name, + ".init.text.ftrace_trampoline")) + init_tramp = sechdrs + i; else if (sechdrs[i].sh_type == SHT_SYMTAB) syms = (Elf64_Sym *)sechdrs[i].sh_addr; } @@ -370,5 +366,12 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, tramp->sh_size = NR_FTRACE_PLTS * sizeof(struct plt_entry); } + if (init_tramp) { + init_tramp->sh_type = SHT_NOBITS; + init_tramp->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + init_tramp->sh_addralign = __alignof__(struct plt_entry); + init_tramp->sh_size = NR_FTRACE_PLTS * sizeof(struct plt_entry); + } + return 0; } diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index dd851297596e..24adb581af0e 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -12,143 +12,18 @@ #include <linux/bitops.h> #include <linux/elf.h> #include <linux/ftrace.h> -#include <linux/gfp.h> #include <linux/kasan.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/moduleloader.h> #include <linux/random.h> #include <linux/scs.h> -#include <linux/vmalloc.h> #include <asm/alternative.h> #include <asm/insn.h> #include <asm/scs.h> #include <asm/sections.h> - -static u64 module_direct_base __ro_after_init = 0; -static u64 module_plt_base __ro_after_init = 0; - -/* - * Choose a random page-aligned base address for a window of 'size' bytes which - * entirely contains the interval [start, end - 1]. - */ -static u64 __init random_bounding_box(u64 size, u64 start, u64 end) -{ - u64 max_pgoff, pgoff; - - if ((end - start) >= size) - return 0; - - max_pgoff = (size - (end - start)) / PAGE_SIZE; - pgoff = get_random_u32_inclusive(0, max_pgoff); - - return start - pgoff * PAGE_SIZE; -} - -/* - * Modules may directly reference data and text anywhere within the kernel - * image and other modules. References using PREL32 relocations have a +/-2G - * range, and so we need to ensure that the entire kernel image and all modules - * fall within a 2G window such that these are always within range. - * - * Modules may directly branch to functions and code within the kernel text, - * and to functions and code within other modules. These branches will use - * CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure - * that the entire kernel text and all module text falls within a 128M window - * such that these are always within range. With PLTs, we can expand this to a - * 2G window. - * - * We chose the 128M region to surround the entire kernel image (rather than - * just the text) as using the same bounds for the 128M and 2G regions ensures - * by construction that we never select a 128M region that is not a subset of - * the 2G region. For very large and unusual kernel configurations this means - * we may fall back to PLTs where they could have been avoided, but this keeps - * the logic significantly simpler. - */ -static int __init module_init_limits(void) -{ - u64 kernel_end = (u64)_end; - u64 kernel_start = (u64)_text; - u64 kernel_size = kernel_end - kernel_start; - - /* - * The default modules region is placed immediately below the kernel - * image, and is large enough to use the full 2G relocation range. - */ - BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END); - BUILD_BUG_ON(MODULES_VSIZE < SZ_2G); - - if (!kaslr_enabled()) { - if (kernel_size < SZ_128M) - module_direct_base = kernel_end - SZ_128M; - if (kernel_size < SZ_2G) - module_plt_base = kernel_end - SZ_2G; - } else { - u64 min = kernel_start; - u64 max = kernel_end; - - if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { - pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n"); - } else { - module_direct_base = random_bounding_box(SZ_128M, min, max); - if (module_direct_base) { - min = module_direct_base; - max = module_direct_base + SZ_128M; - } - } - - module_plt_base = random_bounding_box(SZ_2G, min, max); - } - - pr_info("%llu pages in range for non-PLT usage", - module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0); - pr_info("%llu pages in range for PLT usage", - module_plt_base ? (SZ_2G - kernel_size) / PAGE_SIZE : 0); - - return 0; -} -subsys_initcall(module_init_limits); - -void *module_alloc(unsigned long size) -{ - void *p = NULL; - - /* - * Where possible, prefer to allocate within direct branch range of the - * kernel such that no PLTs are necessary. - */ - if (module_direct_base) { - p = __vmalloc_node_range(size, MODULE_ALIGN, - module_direct_base, - module_direct_base + SZ_128M, - GFP_KERNEL | __GFP_NOWARN, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - } - - if (!p && module_plt_base) { - p = __vmalloc_node_range(size, MODULE_ALIGN, - module_plt_base, - module_plt_base + SZ_2G, - GFP_KERNEL | __GFP_NOWARN, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - } - - if (!p) { - pr_warn_ratelimited("%s: unable to allocate memory\n", - __func__); - } - - if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { - vfree(p); - return NULL; - } - - /* Memory is intended to be executable, reset the pointer tag. */ - return kasan_reset_tag(p); -} +#include <asm/text-patching.h> enum aarch64_reloc_op { RELOC_OP_NONE, @@ -174,7 +49,17 @@ static u64 do_reloc(enum aarch64_reloc_op reloc_op, __le32 *place, u64 val) return 0; } -static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) +#define WRITE_PLACE(place, val, mod) do { \ + __typeof__(val) __val = (val); \ + \ + if (mod->state == MODULE_STATE_UNFORMED) \ + *(place) = __val; \ + else \ + aarch64_insn_copy(place, &(__val), sizeof(*place)); \ +} while (0) + +static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len, + struct module *me) { s64 sval = do_reloc(op, place, val); @@ -192,7 +77,7 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) switch (len) { case 16: - *(s16 *)place = sval; + WRITE_PLACE((s16 *)place, sval, me); switch (op) { case RELOC_OP_ABS: if (sval < 0 || sval > U16_MAX) @@ -208,7 +93,7 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) } break; case 32: - *(s32 *)place = sval; + WRITE_PLACE((s32 *)place, sval, me); switch (op) { case RELOC_OP_ABS: if (sval < 0 || sval > U32_MAX) @@ -224,7 +109,7 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) } break; case 64: - *(s64 *)place = sval; + WRITE_PLACE((s64 *)place, sval, me); break; default: pr_err("Invalid length (%d) for data relocation\n", len); @@ -239,7 +124,8 @@ enum aarch64_insn_movw_imm_type { }; static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, enum aarch64_insn_movw_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type, + struct module *me) { u64 imm; s64 sval; @@ -271,7 +157,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction with the new encoding. */ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); - *place = cpu_to_le32(insn); + WRITE_PLACE(place, cpu_to_le32(insn), me); if (imm > U16_MAX) return -ERANGE; @@ -280,7 +166,8 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, int len, enum aarch64_insn_imm_type imm_type) + int lsb, int len, enum aarch64_insn_imm_type imm_type, + struct module *me) { u64 imm, imm_mask; s64 sval; @@ -296,7 +183,7 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction's immediate field. */ insn = aarch64_insn_encode_immediate(imm_type, insn, imm); - *place = cpu_to_le32(insn); + WRITE_PLACE(place, cpu_to_le32(insn), me); /* * Extract the upper value bits (including the sign bit) and @@ -315,17 +202,17 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, - __le32 *place, u64 val) + __le32 *place, u64 val, struct module *me) { u32 insn; if (!is_forbidden_offset_for_adrp(place)) return reloc_insn_imm(RELOC_OP_PAGE, place, val, 12, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, me); /* patch ADRP to ADR if it is in range */ if (!reloc_insn_imm(RELOC_OP_PREL, place, val & ~0xfff, 0, 21, - AARCH64_INSN_IMM_ADR)) { + AARCH64_INSN_IMM_ADR, me)) { insn = le32_to_cpu(*place); insn &= ~BIT(31); } else { @@ -337,7 +224,7 @@ static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, AARCH64_INSN_BRANCH_NOLINK); } - *place = cpu_to_le32(insn); + WRITE_PLACE(place, cpu_to_le32(insn), me); return 0; } @@ -381,23 +268,23 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, /* Data relocations. */ case R_AARCH64_ABS64: overflow_check = false; - ovf = reloc_data(RELOC_OP_ABS, loc, val, 64); + ovf = reloc_data(RELOC_OP_ABS, loc, val, 64, me); break; case R_AARCH64_ABS32: - ovf = reloc_data(RELOC_OP_ABS, loc, val, 32); + ovf = reloc_data(RELOC_OP_ABS, loc, val, 32, me); break; case R_AARCH64_ABS16: - ovf = reloc_data(RELOC_OP_ABS, loc, val, 16); + ovf = reloc_data(RELOC_OP_ABS, loc, val, 16, me); break; case R_AARCH64_PREL64: overflow_check = false; - ovf = reloc_data(RELOC_OP_PREL, loc, val, 64); + ovf = reloc_data(RELOC_OP_PREL, loc, val, 64, me); break; case R_AARCH64_PREL32: - ovf = reloc_data(RELOC_OP_PREL, loc, val, 32); + ovf = reloc_data(RELOC_OP_PREL, loc, val, 32, me); break; case R_AARCH64_PREL16: - ovf = reloc_data(RELOC_OP_PREL, loc, val, 16); + ovf = reloc_data(RELOC_OP_PREL, loc, val, 16, me); break; /* MOVW instruction relocations. */ @@ -406,88 +293,88 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, fallthrough; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; fallthrough; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_SABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_SABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 48, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; /* Immediate instruction relocations. */ case R_AARCH64_LD_PREL_LO19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, me); break; case R_AARCH64_ADR_PREL_LO21: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, me); break; case R_AARCH64_ADR_PREL_PG_HI21_NC: overflow_check = false; fallthrough; case R_AARCH64_ADR_PREL_PG_HI21: - ovf = reloc_insn_adrp(me, sechdrs, loc, val); + ovf = reloc_insn_adrp(me, sechdrs, loc, val, me); if (ovf && ovf != -ERANGE) return ovf; break; @@ -495,46 +382,46 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_LDST8_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 0, 12, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST16_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 1, 11, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST32_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 2, 10, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST64_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 3, 9, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST128_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 4, 8, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_TSTBR14: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 14, - AARCH64_INSN_IMM_14); + AARCH64_INSN_IMM_14, me); break; case R_AARCH64_CONDBR19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, me); break; case R_AARCH64_JUMP26: case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, - AARCH64_INSN_IMM_26); + AARCH64_INSN_IMM_26, me); if (ovf == -ERANGE) { val = module_emit_plt_entry(me, sechdrs, loc, &rel[i], sym); if (!val) return -ENOEXEC; ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, - 26, AARCH64_INSN_IMM_26); + 26, AARCH64_INSN_IMM_26, me); } break; @@ -579,6 +466,17 @@ static int module_init_ftrace_plt(const Elf_Ehdr *hdr, __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR); mod->arch.ftrace_trampolines = plts; + + s = find_section(hdr, sechdrs, ".init.text.ftrace_trampoline"); + if (!s) + return -ENOEXEC; + + plts = (void *)s->sh_addr; + + __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR); + + mod->arch.init_ftrace_trampolines = plts; + #endif return 0; } @@ -588,14 +486,33 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s; + int ret; + s = find_section(hdr, sechdrs, ".altinstructions"); - if (s) - apply_alternatives_module((void *)s->sh_addr, s->sh_size); + if (s) { + ret = apply_alternatives_module((void *)s->sh_addr, s->sh_size); + if (ret < 0) { + pr_err("module %s: error occurred when applying alternatives\n", me->name); + return ret; + } + } if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); - if (s) - scs_patch((void *)s->sh_addr, s->sh_size); + if (s) { + /* + * Because we can reject modules that are malformed + * so SCS patching fails, skip dry run and try to patch + * it in place. If patching fails, the module would not + * be loaded anyway. + */ + ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size, true); + if (ret) { + pr_err("module %s: error occurred during dynamic SCS patching (%d)\n", + me->name, ret); + return -ENOEXEC; + } + } } return module_init_ftrace_plt(hdr, sechdrs, me); diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 4edecaac8f91..32148bf09c1d 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -35,10 +35,27 @@ DEFINE_STATIC_KEY_FALSE(mte_async_or_asymm_mode); EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); #endif -void mte_sync_tags(pte_t pte) +void mte_sync_tags(pte_t pte, unsigned int nr_pages) { struct page *page = pte_page(pte); - long i, nr_pages = compound_nr(page); + struct folio *folio = page_folio(page); + unsigned long i; + + if (folio_test_hugetlb(folio)) { + unsigned long nr = folio_nr_pages(folio); + + /* Hugetlb MTE flags are set for head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); + + return; + } /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { @@ -67,7 +84,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_pte_at() may zero or change the tags of the + * pages is tagged, __set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) @@ -140,6 +157,24 @@ void mte_enable_kernel_asymm(void) mte_enable_kernel_sync(); } } + +int mte_enable_kernel_store_only(void) +{ + /* + * If the CPU does not support MTE store only, + * the kernel checks all operations. + */ + if (!cpus_have_cap(ARM64_MTE_STORE_ONLY)) + return -EINVAL; + + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCSO_MASK, + SYS_FIELD_PREP(SCTLR_EL1, TCSO, 1)); + isb(); + + pr_info_once("MTE: enabled store only mode at EL1\n"); + + return 0; +} #endif #ifdef CONFIG_KASAN_HW_TAGS @@ -183,7 +218,7 @@ static void mte_update_sctlr_user(struct task_struct *task) * program requested values go with what was requested. */ resolved_mte_tcf = (mte_ctrl & pref) ? pref : mte_ctrl; - sctlr &= ~SCTLR_EL1_TCF0_MASK; + sctlr &= ~(SCTLR_EL1_TCF0_MASK | SCTLR_EL1_TCSO0_MASK); /* * Pick an actual setting. The order in which we check for * set bits and map into register values determines our @@ -195,6 +230,10 @@ static void mte_update_sctlr_user(struct task_struct *task) sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYNC); else if (resolved_mte_tcf & MTE_CTRL_TCF_SYNC) sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, SYNC); + + if (mte_ctrl & MTE_CTRL_STORE_ONLY) + sctlr |= SYS_FIELD_PREP(SCTLR_EL1, TCSO0, 1); + task->thread.sctlr_user = sctlr; } @@ -354,6 +393,9 @@ long set_mte_ctrl(struct task_struct *task, unsigned long arg) (arg & PR_MTE_TCF_SYNC)) mte_ctrl |= MTE_CTRL_TCF_ASYMM; + if (arg & PR_MTE_STORE_ONLY) + mte_ctrl |= MTE_CTRL_STORE_ONLY; + task->thread.mte_ctrl = mte_ctrl; if (task == current) { preempt_disable(); @@ -381,6 +423,8 @@ long get_mte_ctrl(struct task_struct *task) ret |= PR_MTE_TCF_ASYNC; if (mte_ctrl & MTE_CTRL_TCF_SYNC) ret |= PR_MTE_TCF_SYNC; + if (mte_ctrl & MTE_CTRL_STORE_ONLY) + ret |= PR_MTE_STORE_ONLY; return ret; } @@ -410,9 +454,10 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, void *maddr; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); + struct folio *folio; - if (IS_ERR_OR_NULL(page)) { - err = page == NULL ? -EIO : PTR_ERR(page); + if (IS_ERR(page)) { + err = PTR_ERR(page); break; } @@ -428,7 +473,13 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, put_page(page); break; } - WARN_ON_ONCE(!page_mte_tagged(page)); + + folio = page_folio(page); + if (folio_test_hugetlb(folio)) + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio) && + !is_huge_zero_folio(folio)); + else + WARN_ON_ONCE(!page_mte_tagged(page) && !is_zero_page(page)); /* limit access to the end of the page */ offset = offset_in_page(addr); @@ -582,12 +633,9 @@ subsys_initcall(register_mte_tcf_preferred_sysctl); size_t mte_probe_user_range(const char __user *uaddr, size_t size) { const char __user *end = uaddr + size; - int err = 0; char val; - __raw_get_user(val, uaddr, err); - if (err) - return size; + __raw_get_user(val, uaddr, efault); uaddr = PTR_ALIGN(uaddr, MTE_GRANULE_SIZE); while (uaddr < end) { @@ -595,12 +643,13 @@ size_t mte_probe_user_range(const char __user *uaddr, size_t size) * A read is sufficient for mte, the caller should have probed * for the pte write permission if required. */ - __raw_get_user(val, uaddr, err); - if (err) - return end - uaddr; + __raw_get_user(val, uaddr, efault); uaddr += MTE_GRANULE_SIZE; } (void)val; return 0; + +efault: + return end - uaddr; } diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index b4835f6d594b..1041bc67a3ee 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -10,7 +10,7 @@ #include <asm/fixmap.h> #include <asm/insn.h> #include <asm/kprobes.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/sections.h> static DEFINE_RAW_SPINLOCK(patch_lock); @@ -30,20 +30,17 @@ static bool is_image_text(unsigned long addr) static void __kprobes *patch_map(void *addr, int fixmap) { - unsigned long uintaddr = (uintptr_t) addr; - bool image = is_image_text(uintaddr); - struct page *page; + phys_addr_t phys; - if (image) - page = phys_to_page(__pa_symbol(addr)); - else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - page = vmalloc_to_page(addr); - else - return addr; + if (is_image_text((unsigned long)addr)) { + phys = __pa_symbol(addr); + } else { + struct page *page = vmalloc_to_page(addr); + BUG_ON(!page); + phys = page_to_phys(page) + offset_in_page(addr); + } - BUG_ON(!page); - return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + - (uintaddr & ~PAGE_MASK)); + return (void *)set_fixmap_offset(fixmap, phys); } static void __kprobes patch_unmap(int fixmap) @@ -105,6 +102,81 @@ noinstr int aarch64_insn_write_literal_u64(void *addr, u64 val) return ret; } +typedef void text_poke_f(void *dst, void *src, size_t patched, size_t len); + +static void *__text_poke(text_poke_f func, void *addr, void *src, size_t len) +{ + unsigned long flags; + size_t patched = 0; + size_t size; + void *waddr; + void *ptr; + + raw_spin_lock_irqsave(&patch_lock, flags); + + while (patched < len) { + ptr = addr + patched; + size = min_t(size_t, PAGE_SIZE - offset_in_page(ptr), + len - patched); + + waddr = patch_map(ptr, FIX_TEXT_POKE0); + func(waddr, src, patched, size); + patch_unmap(FIX_TEXT_POKE0); + + patched += size; + } + raw_spin_unlock_irqrestore(&patch_lock, flags); + + flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); + + return addr; +} + +static void text_poke_memcpy(void *dst, void *src, size_t patched, size_t len) +{ + copy_to_kernel_nofault(dst, src + patched, len); +} + +static void text_poke_memset(void *dst, void *src, size_t patched, size_t len) +{ + u32 c = *(u32 *)src; + + memset32(dst, c, len / 4); +} + +/** + * aarch64_insn_copy - Copy instructions into (an unused part of) RX memory + * @dst: address to modify + * @src: source of the copy + * @len: length to copy + * + * Useful for JITs to dump new code blocks into unused regions of RX memory. + */ +noinstr void *aarch64_insn_copy(void *dst, void *src, size_t len) +{ + /* A64 instructions must be word aligned */ + if ((uintptr_t)dst & 0x3) + return NULL; + + return __text_poke(text_poke_memcpy, dst, src, len); +} + +/** + * aarch64_insn_set - memset for RX memory regions. + * @dst: address to modify + * @insn: value to set + * @len: length of memory region. + * + * Useful for JITs to fill regions of RX memory with illegal instructions. + */ +noinstr void *aarch64_insn_set(void *dst, u32 insn, size_t len) +{ + if ((uintptr_t)dst & 0x3) + return NULL; + + return __text_poke(text_poke_memset, dst, &insn, len); +} + int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) { u32 *tp = addr; diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index f872c57e9909..fd9a7bed83ce 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -6,28 +6,7 @@ * Copyright (C) 2014 ARM Ltd. */ -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/kernel.h> -#include <linux/mm.h> #include <linux/pci.h> -#include <linux/pci-acpi.h> -#include <linux/pci-ecam.h> -#include <linux/slab.h> - -#ifdef CONFIG_ACPI -/* - * Try to assign the IRQ number when probing a new device - */ -int pcibios_alloc_irq(struct pci_dev *dev) -{ - if (!acpi_disabled) - acpi_pci_irq_enable(dev); - - return 0; -} -#endif /* * raw_pci_read/write - Platform-specific PCI config space access. @@ -61,173 +40,3 @@ int pcibus_to_node(struct pci_bus *bus) EXPORT_SYMBOL(pcibus_to_node); #endif - -#ifdef CONFIG_ACPI - -struct acpi_pci_generic_root_info { - struct acpi_pci_root_info common; - struct pci_config_window *cfg; /* config space mapping */ -}; - -int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) -{ - struct pci_config_window *cfg = bus->sysdata; - struct acpi_device *adev = to_acpi_device(cfg->parent); - struct acpi_pci_root *root = acpi_driver_data(adev); - - return root->segment; -} - -int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) -{ - struct pci_config_window *cfg; - struct acpi_device *adev; - struct device *bus_dev; - - if (acpi_disabled) - return 0; - - cfg = bridge->bus->sysdata; - - /* - * On Hyper-V there is no corresponding ACPI device for a root bridge, - * therefore ->parent is set as NULL by the driver. And set 'adev' as - * NULL in this case because there is no proper ACPI device. - */ - if (!cfg->parent) - adev = NULL; - else - adev = to_acpi_device(cfg->parent); - - bus_dev = &bridge->bus->dev; - - ACPI_COMPANION_SET(&bridge->dev, adev); - set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); - - return 0; -} - -static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) -{ - struct resource_entry *entry, *tmp; - int status; - - status = acpi_pci_probe_root_resources(ci); - resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { - if (!(entry->res->flags & IORESOURCE_WINDOW)) - resource_list_destroy_entry(entry); - } - return status; -} - -/* - * Lookup the bus range for the domain in MCFG, and set up config space - * mapping. - */ -static struct pci_config_window * -pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) -{ - struct device *dev = &root->device->dev; - struct resource *bus_res = &root->secondary; - u16 seg = root->segment; - const struct pci_ecam_ops *ecam_ops; - struct resource cfgres; - struct acpi_device *adev; - struct pci_config_window *cfg; - int ret; - - ret = pci_mcfg_lookup(root, &cfgres, &ecam_ops); - if (ret) { - dev_err(dev, "%04x:%pR ECAM region not found\n", seg, bus_res); - return NULL; - } - - adev = acpi_resource_consumer(&cfgres); - if (adev) - dev_info(dev, "ECAM area %pR reserved by %s\n", &cfgres, - dev_name(&adev->dev)); - else - dev_warn(dev, FW_BUG "ECAM area %pR not reserved in ACPI namespace\n", - &cfgres); - - cfg = pci_ecam_create(dev, &cfgres, bus_res, ecam_ops); - if (IS_ERR(cfg)) { - dev_err(dev, "%04x:%pR error %ld mapping ECAM\n", seg, bus_res, - PTR_ERR(cfg)); - return NULL; - } - - return cfg; -} - -/* release_info: free resources allocated by init_info */ -static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci) -{ - struct acpi_pci_generic_root_info *ri; - - ri = container_of(ci, struct acpi_pci_generic_root_info, common); - pci_ecam_free(ri->cfg); - kfree(ci->ops); - kfree(ri); -} - -/* Interface called from ACPI code to setup PCI host controller */ -struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) -{ - struct acpi_pci_generic_root_info *ri; - struct pci_bus *bus, *child; - struct acpi_pci_root_ops *root_ops; - struct pci_host_bridge *host; - - ri = kzalloc(sizeof(*ri), GFP_KERNEL); - if (!ri) - return NULL; - - root_ops = kzalloc(sizeof(*root_ops), GFP_KERNEL); - if (!root_ops) { - kfree(ri); - return NULL; - } - - ri->cfg = pci_acpi_setup_ecam_mapping(root); - if (!ri->cfg) { - kfree(ri); - kfree(root_ops); - return NULL; - } - - root_ops->release_info = pci_acpi_generic_release_info; - root_ops->prepare_resources = pci_acpi_root_prepare_resources; - root_ops->pci_ops = (struct pci_ops *)&ri->cfg->ops->pci_ops; - bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); - if (!bus) - return NULL; - - /* If we must preserve the resource configuration, claim now */ - host = pci_find_host_bridge(bus); - if (host->preserve_config) - pci_bus_claim_resources(bus); - - /* - * Assign whatever was left unassigned. If we didn't claim above, - * this will reassign everything. - */ - pci_assign_unassigned_root_bus_resources(bus); - - list_for_each_entry(child, &bus->children, node) - pcie_bus_configure_settings(child); - - return bus; -} - -void pcibios_add_bus(struct pci_bus *bus) -{ - acpi_pci_add_bus(bus); -} - -void pcibios_remove_bus(struct pci_bus *bus) -{ - acpi_pci_remove_bus(bus); -} - -#endif diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 6d157f32187b..9b7f26b128b5 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -10,94 +10,12 @@ #include <asm/pointer_auth.h> -struct frame_tail { - struct frame_tail __user *fp; - unsigned long lr; -} __attribute__((packed)); - -/* - * Get the return address for a single stackframe and return a pointer to the - * next frame tail. - */ -static struct frame_tail __user * -user_backtrace(struct frame_tail __user *tail, - struct perf_callchain_entry_ctx *entry) -{ - struct frame_tail buftail; - unsigned long err; - unsigned long lr; - - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(tail, sizeof(buftail))) - return NULL; - - pagefault_disable(); - err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); - pagefault_enable(); - - if (err) - return NULL; - - lr = ptrauth_strip_user_insn_pac(buftail.lr); - - perf_callchain_store(entry, lr); - - /* - * Frame pointers should strictly progress back up the stack - * (towards higher addresses). - */ - if (tail >= buftail.fp) - return NULL; - - return buftail.fp; -} - -#ifdef CONFIG_COMPAT -/* - * The registers we're interested in are at the end of the variable - * length saved register structure. The fp points at the end of this - * structure so the address of this struct is: - * (struct compat_frame_tail *)(xxx->fp)-1 - * - * This code has been adapted from the ARM OProfile support. - */ -struct compat_frame_tail { - compat_uptr_t fp; /* a (struct compat_frame_tail *) in compat mode */ - u32 sp; - u32 lr; -} __attribute__((packed)); - -static struct compat_frame_tail __user * -compat_user_backtrace(struct compat_frame_tail __user *tail, - struct perf_callchain_entry_ctx *entry) +static bool callchain_trace(void *data, unsigned long pc) { - struct compat_frame_tail buftail; - unsigned long err; - - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(tail, sizeof(buftail))) - return NULL; - - pagefault_disable(); - err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); - pagefault_enable(); - - if (err) - return NULL; - - perf_callchain_store(entry, buftail.lr); - - /* - * Frame pointers should strictly progress back up the stack - * (towards higher addresses). - */ - if (tail + 1 >= (struct compat_frame_tail __user *) - compat_ptr(buftail.fp)) - return NULL; + struct perf_callchain_entry_ctx *entry = data; - return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; + return perf_callchain_store(entry, pc) == 0; } -#endif /* CONFIG_COMPAT */ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) @@ -107,35 +25,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, return; } - perf_callchain_store(entry, regs->pc); - - if (!compat_user_mode(regs)) { - /* AARCH64 mode */ - struct frame_tail __user *tail; - - tail = (struct frame_tail __user *)regs->regs[29]; - - while (entry->nr < entry->max_stack && - tail && !((unsigned long)tail & 0x7)) - tail = user_backtrace(tail, entry); - } else { -#ifdef CONFIG_COMPAT - /* AARCH32 compat mode */ - struct compat_frame_tail __user *tail; - - tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - - while ((entry->nr < entry->max_stack) && - tail && !((unsigned long)tail & 0x3)) - tail = compat_user_backtrace(tail, entry); -#endif - } -} - -static bool callchain_trace(void *data, unsigned long pc) -{ - struct perf_callchain_entry_ctx *entry = data; - return perf_callchain_store(entry, pc) == 0; + arch_stack_walk_user(callchain_trace, entry, regs); } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, @@ -148,31 +38,3 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, arch_stack_walk(callchain_trace, entry, current, regs); } - -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - if (perf_guest_state()) - return perf_guest_get_ip(); - - return instruction_pointer(regs); -} - -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - unsigned int guest_state = perf_guest_state(); - int misc = 0; - - if (guest_state) { - if (guest_state & PERF_GUEST_USER) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } - - return misc; -} diff --git a/arch/arm64/kernel/pi/.gitignore b/arch/arm64/kernel/pi/.gitignore new file mode 100644 index 000000000000..efb29b663e85 --- /dev/null +++ b/arch/arm64/kernel/pi/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +relacheck diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index 4c0ea3cd4ea4..be92d73c25b2 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -2,7 +2,8 @@ # Copyright 2022 Google LLC KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ - -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \ + -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_KSTACK_ERASE) \ + $(DISABLE_LATENT_ENTROPY_PLUGIN) \ $(call cc-option,-mbranch-protection=none) \ -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \ -include $(srctree)/include/linux/hidden.h \ @@ -10,25 +11,34 @@ KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ -fno-asynchronous-unwind-tables -fno-unwind-tables \ $(call cc-option,-fno-addrsig) +# this code may run with the MMU off so disable unaligned accesses +CFLAGS_map_range.o += -mstrict-align + # remove SCS flags from all objects in this directory KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) # disable LTO KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS)) -GCOV_PROFILE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n -UBSAN_SANITIZE := n -KCOV_INSTRUMENT := n +hostprogs := relacheck + +quiet_cmd_piobjcopy = $(quiet_cmd_objcopy) + cmd_piobjcopy = $(cmd_objcopy) && $(obj)/relacheck $(@) $(<) $(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \ - --remove-section=.note.gnu.property \ - --prefix-alloc-sections=.init -$(obj)/%.pi.o: $(obj)/%.o FORCE - $(call if_changed,objcopy) + --remove-section=.note.gnu.property +$(obj)/%.pi.o: $(obj)/%.o $(obj)/relacheck FORCE + $(call if_changed,piobjcopy) + +# ensure that all the lib- code ends up as __init code and data +$(obj)/lib-%.pi.o: OBJCOPYFLAGS += --prefix-alloc-sections=.init $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE $(call if_changed_rule,cc_o_c) -obj-y := kaslr_early.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o -extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) +obj-y := idreg-override.pi.o \ + map_kernel.pi.o map_range.pi.o \ + lib-fdt.pi.o lib-fdt_ro.pi.o +obj-$(CONFIG_RELOCATABLE) += relocate.pi.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o +obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.pi.o +targets := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c new file mode 100644 index 000000000000..bc57b290e5e7 --- /dev/null +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Early cpufeature override framework + * + * Copyright (C) 2020 Google LLC + * Author: Marc Zyngier <maz@kernel.org> + */ + +#include <linux/ctype.h> +#include <linux/kernel.h> +#include <linux/libfdt.h> + +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/setup.h> + +#include "pi.h" + +#define FTR_DESC_NAME_LEN 20 +#define FTR_DESC_FIELD_LEN 10 +#define FTR_ALIAS_NAME_LEN 30 +#define FTR_ALIAS_OPTION_LEN 116 + +static u64 __boot_status __initdata; + +typedef bool filter_t(u64 val); + +struct ftr_set_desc { + char name[FTR_DESC_NAME_LEN]; + PREL64(struct arm64_ftr_override, override); + struct { + char name[FTR_DESC_FIELD_LEN]; + u8 shift; + u8 width; + PREL64(filter_t, filter); + } fields[]; +}; + +#define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } + +static const struct ftr_set_desc mmfr0 __prel64_initconst = { + .name = "id_aa64mmfr0", + .override = &id_aa64mmfr0_override, + .fields = { + FIELD("ecv", ID_AA64MMFR0_EL1_ECV_SHIFT, NULL), + {} + }, +}; + +static bool __init mmfr1_vh_filter(u64 val) +{ + /* + * If we ever reach this point while running VHE, we're + * guaranteed to be on one of these funky, VHE-stuck CPUs. If + * the user was trying to force nVHE on us, proceed with + * attitude adjustment. + */ + return !(__boot_status == (BOOT_CPU_FLAG_E2H | BOOT_CPU_MODE_EL2) && + val == 0); +} + +static const struct ftr_set_desc mmfr1 __prel64_initconst = { + .name = "id_aa64mmfr1", + .override = &id_aa64mmfr1_override, + .fields = { + FIELD("vh", ID_AA64MMFR1_EL1_VH_SHIFT, mmfr1_vh_filter), + {} + }, +}; + + +static bool __init mmfr2_varange_filter(u64 val) +{ + int __maybe_unused feat; + + if (val) + return false; + +#ifdef CONFIG_ARM64_LPA2 + feat = cpuid_feature_extract_signed_field(read_sysreg(id_aa64mmfr0_el1), + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + if (feat >= ID_AA64MMFR0_EL1_TGRAN_LPA2) { + id_aa64mmfr0_override.val |= + (ID_AA64MMFR0_EL1_TGRAN_LPA2 - 1) << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + id_aa64mmfr0_override.mask |= 0xfU << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + + /* + * Override PARange to 48 bits - the override will just be + * ignored if the actual PARange is smaller, but this is + * unlikely to be the case for LPA2 capable silicon. + */ + id_aa64mmfr0_override.val |= + ID_AA64MMFR0_EL1_PARANGE_48 << ID_AA64MMFR0_EL1_PARANGE_SHIFT; + id_aa64mmfr0_override.mask |= 0xfU << ID_AA64MMFR0_EL1_PARANGE_SHIFT; + } +#endif + return true; +} + +static const struct ftr_set_desc mmfr2 __prel64_initconst = { + .name = "id_aa64mmfr2", + .override = &id_aa64mmfr2_override, + .fields = { + FIELD("varange", ID_AA64MMFR2_EL1_VARange_SHIFT, mmfr2_varange_filter), + {} + }, +}; + +static bool __init pfr0_sve_filter(u64 val) +{ + /* + * Disabling SVE also means disabling all the features that + * are associated with it. The easiest way to do it is just to + * override id_aa64zfr0_el1 to be 0. + */ + if (!val) { + id_aa64zfr0_override.val = 0; + id_aa64zfr0_override.mask = GENMASK(63, 0); + } + + return true; +} + +static const struct ftr_set_desc pfr0 __prel64_initconst = { + .name = "id_aa64pfr0", + .override = &id_aa64pfr0_override, + .fields = { + FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), + FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL), + FIELD("mpam", ID_AA64PFR0_EL1_MPAM_SHIFT, NULL), + {} + }, +}; + +static bool __init pfr1_sme_filter(u64 val) +{ + /* + * Similarly to SVE, disabling SME also means disabling all + * the features that are associated with it. Just set + * id_aa64smfr0_el1 to 0 and don't look back. + */ + if (!val) { + id_aa64smfr0_override.val = 0; + id_aa64smfr0_override.mask = GENMASK(63, 0); + } + + return true; +} + +static const struct ftr_set_desc pfr1 __prel64_initconst = { + .name = "id_aa64pfr1", + .override = &id_aa64pfr1_override, + .fields = { + FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), + FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), + FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), + FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), + FIELD("mpam_frac", ID_AA64PFR1_EL1_MPAM_frac_SHIFT, NULL), + {} + }, +}; + +static const struct ftr_set_desc isar1 __prel64_initconst = { + .name = "id_aa64isar1", + .override = &id_aa64isar1_override, + .fields = { + FIELD("gpi", ID_AA64ISAR1_EL1_GPI_SHIFT, NULL), + FIELD("gpa", ID_AA64ISAR1_EL1_GPA_SHIFT, NULL), + FIELD("api", ID_AA64ISAR1_EL1_API_SHIFT, NULL), + FIELD("apa", ID_AA64ISAR1_EL1_APA_SHIFT, NULL), + {} + }, +}; + +static const struct ftr_set_desc isar2 __prel64_initconst = { + .name = "id_aa64isar2", + .override = &id_aa64isar2_override, + .fields = { + FIELD("gpa3", ID_AA64ISAR2_EL1_GPA3_SHIFT, NULL), + FIELD("apa3", ID_AA64ISAR2_EL1_APA3_SHIFT, NULL), + FIELD("mops", ID_AA64ISAR2_EL1_MOPS_SHIFT, NULL), + {} + }, +}; + +static const struct ftr_set_desc smfr0 __prel64_initconst = { + .name = "id_aa64smfr0", + .override = &id_aa64smfr0_override, + .fields = { + FIELD("smever", ID_AA64SMFR0_EL1_SMEver_SHIFT, NULL), + /* FA64 is a one bit field... :-/ */ + { "fa64", ID_AA64SMFR0_EL1_FA64_SHIFT, 1, }, + {} + }, +}; + +static bool __init hvhe_filter(u64 val) +{ + u64 mmfr1 = read_sysreg(id_aa64mmfr1_el1); + + return (val == 1 && + lower_32_bits(__boot_status) == BOOT_CPU_MODE_EL2 && + cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_EL1_VH_SHIFT)); +} + +static const struct ftr_set_desc sw_features __prel64_initconst = { + .name = "arm64_sw", + .override = &arm64_sw_feature_override, + .fields = { + FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), + FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), + FIELD("rodataoff", ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF, NULL), + {} + }, +}; + +static const +PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { + { &mmfr0 }, + { &mmfr1 }, + { &mmfr2 }, + { &pfr0 }, + { &pfr1 }, + { &isar1 }, + { &isar2 }, + { &smfr0 }, + { &sw_features }, +}; + +static const struct { + char alias[FTR_ALIAS_NAME_LEN]; + char feature[FTR_ALIAS_OPTION_LEN]; +} aliases[] __initconst = { + { "kvm_arm.mode=nvhe", "arm64_sw.hvhe=0 id_aa64mmfr1.vh=0" }, + { "kvm_arm.mode=protected", "arm64_sw.hvhe=1" }, + { "arm64.nosve", "id_aa64pfr0.sve=0" }, + { "arm64.nosme", "id_aa64pfr1.sme=0" }, + { "arm64.nobti", "id_aa64pfr1.bt=0" }, + { "arm64.nogcs", "id_aa64pfr1.gcs=0" }, + { "arm64.nopauth", + "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " + "id_aa64isar1.api=0 id_aa64isar1.apa=0 " + "id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0" }, + { "arm64.nomops", "id_aa64isar2.mops=0" }, + { "arm64.nomte", "id_aa64pfr1.mte=0" }, + { "nokaslr", "arm64_sw.nokaslr=1" }, + { "rodata=off", "arm64_sw.rodataoff=1" }, + { "arm64.nolva", "id_aa64mmfr2.varange=0" }, + { "arm64.no32bit_el0", "id_aa64pfr0.el0=1" }, + { "arm64.nompam", "id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" }, +}; + +static int __init parse_hexdigit(const char *p, u64 *v) +{ + // skip "0x" if it comes next + if (p[0] == '0' && tolower(p[1]) == 'x') + p += 2; + + // check whether the RHS is a single hex digit + if (!isxdigit(p[0]) || (p[1] && !isspace(p[1]))) + return -EINVAL; + + *v = tolower(*p) - (isdigit(*p) ? '0' : 'a' - 10); + return 0; +} + +static int __init find_field(const char *cmdline, char *opt, int len, + const struct ftr_set_desc *reg, int f, u64 *v) +{ + int flen = strlen(reg->fields[f].name); + + // append '<fieldname>=' to obtain '<name>.<fieldname>=' + memcpy(opt + len, reg->fields[f].name, flen); + len += flen; + opt[len++] = '='; + + if (memcmp(cmdline, opt, len)) + return -1; + + return parse_hexdigit(cmdline + len, v); +} + +static void __init match_options(const char *cmdline) +{ + char opt[FTR_DESC_NAME_LEN + FTR_DESC_FIELD_LEN + 2]; + int i; + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + const struct ftr_set_desc *reg = prel64_pointer(regs[i].reg); + struct arm64_ftr_override *override; + int len = strlen(reg->name); + int f; + + override = prel64_pointer(reg->override); + + // set opt[] to '<name>.' + memcpy(opt, reg->name, len); + opt[len++] = '.'; + + for (f = 0; reg->fields[f].name[0] != '\0'; f++) { + u64 shift = reg->fields[f].shift; + u64 width = reg->fields[f].width ?: 4; + u64 mask = GENMASK_ULL(shift + width - 1, shift); + bool (*filter)(u64 val); + u64 v; + + if (find_field(cmdline, opt, len, reg, f, &v)) + continue; + + /* + * If an override gets filtered out, advertise + * it by setting the value to the all-ones while + * clearing the mask... Yes, this is fragile. + */ + filter = prel64_pointer(reg->fields[f].filter); + if (filter && !filter(v)) { + override->val |= mask; + override->mask &= ~mask; + continue; + } + + override->val &= ~mask; + override->val |= (v << shift) & mask; + override->mask |= mask; + + return; + } + } +} + +static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) +{ + do { + char buf[256]; + size_t len; + int i; + + cmdline = skip_spaces(cmdline); + + /* terminate on "--" appearing on the command line by itself */ + if (cmdline[0] == '-' && cmdline[1] == '-' && isspace(cmdline[2])) + return; + + for (len = 0; cmdline[len] && !isspace(cmdline[len]); len++) { + if (len >= sizeof(buf) - 1) + break; + if (cmdline[len] == '-') + buf[len] = '_'; + else + buf[len] = cmdline[len]; + } + if (!len) + return; + + buf[len] = 0; + + cmdline += len; + + match_options(buf); + + for (i = 0; parse_aliases && i < ARRAY_SIZE(aliases); i++) + if (!memcmp(buf, aliases[i].alias, len + 1)) + __parse_cmdline(aliases[i].feature, false); + } while (1); +} + +static __init const u8 *get_bootargs_cmdline(const void *fdt, int node) +{ + static char const bootargs[] __initconst = "bootargs"; + const u8 *prop; + + if (node < 0) + return NULL; + + prop = fdt_getprop(fdt, node, bootargs, NULL); + if (!prop) + return NULL; + + return strlen(prop) ? prop : NULL; +} + +static __init void parse_cmdline(const void *fdt, int chosen) +{ + static char const cmdline[] __initconst = CONFIG_CMDLINE; + const u8 *prop = get_bootargs_cmdline(fdt, chosen); + + if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) + __parse_cmdline(cmdline, true); + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) + __parse_cmdline(prop, true); +} + +void __init init_feature_override(u64 boot_status, const void *fdt, + int chosen) +{ + struct arm64_ftr_override *override; + const struct ftr_set_desc *reg; + int i; + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + reg = prel64_pointer(regs[i].reg); + override = prel64_pointer(reg->override); + + override->val = 0; + override->mask = 0; + } + + __boot_status = boot_status; + + parse_cmdline(fdt, chosen); + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + reg = prel64_pointer(regs[i].reg); + override = prel64_pointer(reg->override); + dcache_clean_inval_poc((unsigned long)override, + (unsigned long)(override + 1)); + } +} + +char * __init skip_spaces(const char *str) +{ + while (isspace(*str)) + ++str; + return (char *)str; +} diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index 17bff6e399e4..e0e018046a46 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -14,69 +14,21 @@ #include <asm/archrandom.h> #include <asm/memory.h> +#include <asm/pgtable.h> -/* taken from lib/string.c */ -static char *__strstr(const char *s1, const char *s2) -{ - size_t l1, l2; - - l2 = strlen(s2); - if (!l2) - return (char *)s1; - l1 = strlen(s1); - while (l1 >= l2) { - l1--; - if (!memcmp(s1, s2, l2)) - return (char *)s1; - s1++; - } - return NULL; -} -static bool cmdline_contains_nokaslr(const u8 *cmdline) -{ - const u8 *str; - - str = __strstr(cmdline, "nokaslr"); - return str == cmdline || (str > cmdline && *(str - 1) == ' '); -} - -static bool is_kaslr_disabled_cmdline(void *fdt) -{ - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - int node; - const u8 *prop; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; - - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; - - if (cmdline_contains_nokaslr(prop)) - return true; - - if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) - goto out; - - return false; - } -out: - return cmdline_contains_nokaslr(CONFIG_CMDLINE); -} +#include "pi.h" -static u64 get_kaslr_seed(void *fdt) +static u64 __init get_kaslr_seed(void *fdt, int node) { - int node, len; + static char const seed_str[] __initconst = "kaslr-seed"; fdt64_t *prop; u64 ret; + int len; - node = fdt_path_offset(fdt, "/chosen"); if (node < 0) return 0; - prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + prop = fdt_getprop_w(fdt, node, seed_str, &len); if (!prop || len != sizeof(u64)) return 0; @@ -85,14 +37,14 @@ static u64 get_kaslr_seed(void *fdt) return ret; } -asmlinkage u64 kaslr_early_init(void *fdt) +u64 __init kaslr_early_init(void *fdt, int chosen) { - u64 seed; + u64 seed, range; - if (is_kaslr_disabled_cmdline(fdt)) + if (kaslr_disabled_cmdline()) return 0; - seed = get_kaslr_seed(fdt); + seed = get_kaslr_seed(fdt, chosen); if (!seed) { if (!__early_cpu_has_rndr() || !__arm64_rndr((unsigned long *)&seed)) @@ -102,9 +54,9 @@ asmlinkage u64 kaslr_early_init(void *fdt) /* * OK, so we are proceeding with KASLR enabled. Calculate a suitable * kernel image offset from the seed. Let's place the kernel in the - * middle half of the VMALLOC area (VA_BITS_MIN - 2), and stay clear of - * the lower and upper quarters to avoid colliding with other - * allocations. + * 'middle' half of the VMALLOC area, and stay clear of the lower and + * upper quarters to avoid colliding with other allocations. */ - return BIT(VA_BITS_MIN - 3) + (seed & GENMASK(VA_BITS_MIN - 3, 0)); + range = (VMALLOC_END - KIMAGE_VADDR) / 2; + return range / 2 + (((__uint128_t)range * seed) >> 64); } diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c new file mode 100644 index 000000000000..a852264958c3 --- /dev/null +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/init.h> +#include <linux/libfdt.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/sizes.h> +#include <linux/string.h> + +#include <asm/memory.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> + +#include "pi.h" + +extern const u8 __eh_frame_start[], __eh_frame_end[]; + +extern void idmap_cpu_replace_ttbr1(phys_addr_t pgdir); + +static void __init map_segment(pgd_t *pg_dir, phys_addr_t *pgd, u64 va_offset, + void *start, void *end, pgprot_t prot, + bool may_use_cont, int root_level) +{ + map_range(pgd, ((u64)start + va_offset) & ~PAGE_OFFSET, + ((u64)end + va_offset) & ~PAGE_OFFSET, (u64)start, + prot, root_level, (pte_t *)pg_dir, may_use_cont, 0); +} + +static void __init unmap_segment(pgd_t *pg_dir, u64 va_offset, void *start, + void *end, int root_level) +{ + map_segment(pg_dir, NULL, va_offset, start, end, __pgprot(0), + false, root_level); +} + +static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) +{ + bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS); + bool twopass = IS_ENABLED(CONFIG_RELOCATABLE); + phys_addr_t pgdp = (phys_addr_t)init_pg_dir + PAGE_SIZE; + pgprot_t text_prot = PAGE_KERNEL_ROX; + pgprot_t data_prot = PAGE_KERNEL; + pgprot_t prot; + + /* + * External debuggers may need to write directly to the text mapping to + * install SW breakpoints. Allow this (only) when explicitly requested + * with rodata=off. + */ + if (arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF)) + text_prot = PAGE_KERNEL_EXEC; + + /* + * We only enable the shadow call stack dynamically if we are running + * on a system that does not implement PAC or BTI. PAC and SCS provide + * roughly the same level of protection, and BTI relies on the PACIASP + * instructions serving as landing pads, preventing us from patching + * those instructions into something else. + */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && cpu_has_pac()) + enable_scs = false; + + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && cpu_has_bti()) { + enable_scs = false; + + /* + * If we have a CPU that supports BTI and a kernel built for + * BTI then mark the kernel executable text as guarded pages + * now so we don't have to rewrite the page tables later. + */ + text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP); + } + + /* Map all code read-write on the first pass if needed */ + twopass |= enable_scs; + prot = twopass ? data_prot : text_prot; + + /* + * [_stext, _text) isn't executed after boot and contains some + * non-executable, unpredictable data, so map it non-executable. + */ + map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot, + false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, + !twopass, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, + __inittext_begin, data_prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __inittext_begin, + __inittext_end, prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __initdata_begin, + __initdata_end, data_prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, _data, _end, data_prot, + true, root_level); + dsb(ishst); + + idmap_cpu_replace_ttbr1((phys_addr_t)init_pg_dir); + + if (twopass) { + if (IS_ENABLED(CONFIG_RELOCATABLE)) + relocate_kernel(kaslr_offset); + + if (enable_scs) { + scs_patch(__eh_frame_start + va_offset, + __eh_frame_end - __eh_frame_start, false); + asm("ic ialluis"); + + dynamic_scs_is_enabled = true; + } + + /* + * Unmap the text region before remapping it, to avoid + * potential TLB conflicts when creating the contiguous + * descriptors. + */ + unmap_segment(init_pg_dir, va_offset, _stext, _etext, + root_level); + dsb(ishst); + isb(); + __tlbi(vmalle1); + isb(); + + /* + * Remap these segments with different permissions + * No new page table allocations should be needed + */ + map_segment(init_pg_dir, NULL, va_offset, _stext, _etext, + text_prot, true, root_level); + map_segment(init_pg_dir, NULL, va_offset, __inittext_begin, + __inittext_end, text_prot, false, root_level); + } + + /* Copy the root page table to its final location */ + memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE); + dsb(ishst); + idmap_cpu_replace_ttbr1((phys_addr_t)swapper_pg_dir); +} + +static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(phys_addr_t ttbr) +{ + u64 sctlr = read_sysreg(sctlr_el1); + u64 tcr = read_sysreg(tcr_el1) | TCR_EL1_DS; + u64 mmfr0 = read_sysreg(id_aa64mmfr0_el1); + u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + + tcr &= ~TCR_EL1_IPS_MASK; + tcr |= parange << TCR_EL1_IPS_SHIFT; + + asm(" msr sctlr_el1, %0 ;" + " isb ;" + " msr ttbr0_el1, %1 ;" + " msr tcr_el1, %2 ;" + " isb ;" + " tlbi vmalle1 ;" + " dsb nsh ;" + " isb ;" + " msr sctlr_el1, %3 ;" + " isb ;" + :: "r"(sctlr & ~SCTLR_ELx_M), "r"(ttbr), "r"(tcr), "r"(sctlr)); +} + +static void __init remap_idmap_for_lpa2(void) +{ + /* clear the bits that change meaning once LPA2 is turned on */ + ptdesc_t mask = PTE_SHARED; + + /* + * We have to clear bits [9:8] in all block or page descriptors in the + * initial ID map, as otherwise they will be (mis)interpreted as + * physical address bits once we flick the LPA2 switch (TCR.DS). Since + * we cannot manipulate live descriptors in that way without creating + * potential TLB conflicts, let's create another temporary ID map in a + * LPA2 compatible fashion, and update the initial ID map while running + * from that. + */ + create_init_idmap(init_pg_dir, mask); + dsb(ishst); + set_ttbr0_for_lpa2((phys_addr_t)init_pg_dir); + + /* + * Recreate the initial ID map with the same granularity as before. + * Don't bother with the FDT, we no longer need it after this. + */ + memset(init_idmap_pg_dir, 0, + (char *)init_idmap_pg_end - (char *)init_idmap_pg_dir); + + create_init_idmap(init_idmap_pg_dir, mask); + dsb(ishst); + + /* switch back to the updated initial ID map */ + set_ttbr0_for_lpa2((phys_addr_t)init_idmap_pg_dir); + + /* wipe the temporary ID map from memory */ + memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); +} + +static void *__init map_fdt(phys_addr_t fdt) +{ + static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); + phys_addr_t efdt = fdt + MAX_FDT_SIZE; + phys_addr_t ptep = (phys_addr_t)ptes; /* We're idmapped when called */ + + /* + * Map up to MAX_FDT_SIZE bytes, but avoid overlap with + * the kernel image. + */ + map_range(&ptep, fdt, (u64)_text > fdt ? min((u64)_text, efdt) : efdt, + fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL, + (pte_t *)init_idmap_pg_dir, false, 0); + dsb(ishst); + + return (void *)fdt; +} + +/* + * PI version of the Cavium Eratum 27456 detection, which makes it + * impossible to use non-global mappings. + */ +static bool __init ng_mappings_allowed(void) +{ + static const struct midr_range cavium_erratum_27456_cpus[] __initconst = { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), + /* Cavium ThunderX, T81 pass 1.0 */ + MIDR_REV(MIDR_THUNDERX_81XX, 0, 0), + {}, + }; + + for (const struct midr_range *r = cavium_erratum_27456_cpus; r->model; r++) { + if (midr_is_cpu_model_range(read_cpuid_id(), r->model, + r->rv_min, r->rv_max)) + return false; + } + + return true; +} + +asmlinkage void __init early_map_kernel(u64 boot_status, phys_addr_t fdt) +{ + static char const chosen_str[] __initconst = "/chosen"; + u64 va_base, pa_base = (u64)&_text; + u64 kaslr_offset = pa_base % MIN_KIMG_ALIGN; + int root_level = 4 - CONFIG_PGTABLE_LEVELS; + int va_bits = VA_BITS; + int chosen; + void *fdt_mapped = map_fdt(fdt); + + /* Clear BSS and the initial page tables */ + memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); + + /* Parse the command line for CPU feature overrides */ + chosen = fdt_path_offset(fdt_mapped, chosen_str); + init_feature_override(boot_status, fdt_mapped, chosen); + + if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) { + va_bits = VA_BITS_MIN; + } else if (IS_ENABLED(CONFIG_ARM64_LPA2) && !cpu_has_lpa2()) { + va_bits = VA_BITS_MIN; + root_level++; + } + + if (va_bits > VA_BITS_MIN) + sysreg_clear_set(tcr_el1, TCR_EL1_T1SZ_MASK, TCR_T1SZ(va_bits)); + + /* + * The virtual KASLR displacement modulo 2MiB is decided by the + * physical placement of the image, as otherwise, we might not be able + * to create the early kernel mapping using 2 MiB block descriptors. So + * take the low bits of the KASLR offset from the physical address, and + * fill in the high bits from the seed. + */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + u64 kaslr_seed = kaslr_early_init(fdt_mapped, chosen); + + if (kaslr_seed && kaslr_requires_kpti()) + arm64_use_ng_mappings = ng_mappings_allowed(); + + kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1); + } + + if (IS_ENABLED(CONFIG_ARM64_LPA2) && va_bits > VA_BITS_MIN) + remap_idmap_for_lpa2(); + + va_base = KIMAGE_VADDR + kaslr_offset; + map_kernel(kaslr_offset, va_base - pa_base, root_level); +} diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c new file mode 100644 index 000000000000..de52cd85c691 --- /dev/null +++ b/arch/arm64/kernel/pi/map_range.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/types.h> +#include <linux/sizes.h> + +#include <asm/memory.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> + +#include "pi.h" + +/** + * map_range - Map a contiguous range of physical pages into virtual memory + * + * @pte: Address of physical pointer to array of pages to + * allocate page tables from + * @start: Virtual address of the start of the range + * @end: Virtual address of the end of the range (exclusive) + * @pa: Physical address of the start of the range + * @prot: Access permissions of the range + * @level: Translation level for the mapping + * @tbl: The level @level page table to create the mappings in + * @may_use_cont: Whether the use of the contiguous attribute is allowed + * @va_offset: Offset between a physical page and its current mapping + * in the VA space + */ +void __init map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset) +{ + u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; + ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + int lshift = (3 - level) * PTDESC_TABLE_SHIFT; + u64 lmask = (PAGE_SIZE << lshift) - 1; + + start &= PAGE_MASK; + pa &= PAGE_MASK; + + /* Advance tbl to the entry that covers start */ + tbl += (start >> (lshift + PAGE_SHIFT)) % PTRS_PER_PTE; + + /* + * Set the right block/page bits for this level unless we are + * clearing the mapping + */ + if (protval) + protval |= (level == 2) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; + + while (start < end) { + u64 next = min((start | lmask) + 1, PAGE_ALIGN(end)); + + if (level < 2 || (level == 2 && (start | next | pa) & lmask)) { + /* + * This chunk needs a finer grained mapping. Create a + * table mapping if necessary and recurse. + */ + if (pte_none(*tbl)) { + *tbl = __pte(__phys_to_pte_val(*pte) | + PMD_TYPE_TABLE | PMD_TABLE_UXN); + *pte += PTRS_PER_PTE * sizeof(pte_t); + } + map_range(pte, start, next, pa, prot, level + 1, + (pte_t *)(__pte_to_phys(*tbl) + va_offset), + may_use_cont, va_offset); + } else { + /* + * Start a contiguous range if start and pa are + * suitably aligned + */ + if (((start | pa) & cmask) == 0 && may_use_cont) + protval |= PTE_CONT; + + /* + * Clear the contiguous attribute if the remaining + * range does not cover a contiguous block + */ + if ((end & ~cmask) <= start) + protval &= ~PTE_CONT; + + /* Put down a block or page mapping */ + *tbl = __pte(__phys_to_pte_val(pa) | protval); + } + pa += next - start; + start = next; + tbl++; + } +} + +asmlinkage phys_addr_t __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) +{ + phys_addr_t ptep = (phys_addr_t)pg_dir + PAGE_SIZE; /* MMU is off */ + pgprot_t text_prot = PAGE_KERNEL_ROX; + pgprot_t data_prot = PAGE_KERNEL; + + pgprot_val(text_prot) &= ~clrmask; + pgprot_val(data_prot) &= ~clrmask; + + /* MMU is off; pointer casts to phys_addr_t are safe */ + map_range(&ptep, (u64)_stext, (u64)__initdata_begin, + (phys_addr_t)_stext, text_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); + map_range(&ptep, (u64)__initdata_begin, (u64)_end, + (phys_addr_t)__initdata_begin, data_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); + + return ptep; +} diff --git a/arch/arm64/kernel/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index a1fe4b4ff591..bbe7d30ed12b 100644 --- a/arch/arm64/kernel/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -4,16 +4,17 @@ * Author: Ard Biesheuvel <ardb@google.com> */ -#include <linux/bug.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/linkage.h> -#include <linux/printk.h> #include <linux/types.h> -#include <asm/cacheflush.h> #include <asm/scs.h> +#include "pi.h" + +bool dynamic_scs_is_enabled; + // // This minimal DWARF CFI parser is partially based on the code in // arch/arc/kernel/unwind.c, and on the document below: @@ -49,7 +50,9 @@ #define DW_CFA_GNU_negative_offset_extended 0x2f #define DW_CFA_hi_user 0x3f -extern const u8 __eh_frame_start[], __eh_frame_end[]; +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_pcrel 0x10 enum { PACIASP = 0xd503233f, @@ -81,7 +84,11 @@ static void __always_inline scs_patch_loc(u64 loc) */ return; } - dcache_clean_pou(loc, loc + sizeof(u32)); + if (IS_ENABLED(CONFIG_ARM64_WORKAROUND_CLEAN_CACHE)) + asm("dc civac, %0" :: "r"(loc)); + else + asm(ALTERNATIVE("dc cvau, %0", "nop", ARM64_HAS_CACHE_IDC) + :: "r"(loc)); } /* @@ -117,7 +124,12 @@ struct eh_frame { union { struct { // CIE u8 version; - u8 augmentation_string[]; + u8 augmentation_string[3]; + u8 code_alignment_factor; + u8 data_alignment_factor; + u8 return_address_register; + u8 augmentation_data_size; + u8 fde_pointer_format; }; struct { // FDE @@ -125,29 +137,38 @@ struct eh_frame { s32 range; u8 opcodes[]; }; + + struct { // FDE + s64 initial_loc64; + s64 range64; + u8 opcodes64[]; + }; }; }; -static int noinstr scs_handle_fde_frame(const struct eh_frame *frame, - bool fde_has_augmentation_data, - int code_alignment_factor, - bool dry_run) +static int scs_handle_fde_frame(const struct eh_frame *frame, + int code_alignment_factor, + bool use_sdata8, + bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; u64 loc = (u64)offset_to_ptr(&frame->initial_loc); const u8 *opcode = frame->opcodes; + int l; - if (fde_has_augmentation_data) { - int l; + if (use_sdata8) { + loc = (u64)&frame->initial_loc64 + frame->initial_loc64; + opcode = frame->opcodes64; + size -= 8; + } - // assume single byte uleb128_t - if (WARN_ON(*opcode & BIT(7))) - return -ENOEXEC; + // assume single byte uleb128_t for augmentation data size + if (*opcode & BIT(7)) + return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE; - l = *opcode++; - opcode += l; - size -= l + 1; - } + l = *opcode++; + opcode += l; + size -= l + 1; /* * Starting from 'loc', apply the CFA opcodes that advance the location @@ -198,21 +219,20 @@ static int noinstr scs_handle_fde_frame(const struct eh_frame *frame, break; default: - pr_err("unhandled opcode: %02x in FDE frame %lx\n", opcode[-1], (uintptr_t)frame); - return -ENOEXEC; + return EDYNSCS_INVALID_CFA_OPCODE; } } return 0; } -int noinstr scs_patch(const u8 eh_frame[], int size) +int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run) { + int code_alignment_factor = 1; + bool fde_use_sdata8 = false; const u8 *p = eh_frame; while (size > 4) { const struct eh_frame *frame = (const void *)p; - bool fde_has_augmentation_data = true; - int code_alignment_factor = 1; int ret; if (frame->size == 0 || @@ -221,28 +241,49 @@ int noinstr scs_patch(const u8 eh_frame[], int size) break; if (frame->cie_id_or_pointer == 0) { - const u8 *p = frame->augmentation_string; - - /* a 'z' in the augmentation string must come first */ - fde_has_augmentation_data = *p == 'z'; + /* + * Require presence of augmentation data (z) with a + * specifier for the size of the FDE initial_loc and + * range fields (R), and nothing else. + */ + if (strcmp(frame->augmentation_string, "zR")) + return EDYNSCS_INVALID_CIE_HEADER; /* * The code alignment factor is a uleb128 encoded field * but given that the only sensible values are 1 or 4, - * there is no point in decoding the whole thing. + * there is no point in decoding the whole thing. Also + * sanity check the size of the data alignment factor + * field, and the values of the return address register + * and augmentation data size fields. */ - p += strlen(p) + 1; - if (!WARN_ON(*p & BIT(7))) - code_alignment_factor = *p; + if ((frame->code_alignment_factor & BIT(7)) || + (frame->data_alignment_factor & BIT(7)) || + frame->return_address_register != 30 || + frame->augmentation_data_size != 1) + return EDYNSCS_INVALID_CIE_HEADER; + + code_alignment_factor = frame->code_alignment_factor; + + switch (frame->fde_pointer_format) { + case DW_EH_PE_pcrel | DW_EH_PE_sdata4: + fde_use_sdata8 = false; + break; + case DW_EH_PE_pcrel | DW_EH_PE_sdata8: + fde_use_sdata8 = true; + break; + default: + return EDYNSCS_INVALID_CIE_SDATA_SIZE; + } } else { - ret = scs_handle_fde_frame(frame, - fde_has_augmentation_data, - code_alignment_factor, - true); + ret = scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, !skip_dry_run); if (ret) return ret; - scs_handle_fde_frame(frame, fde_has_augmentation_data, - code_alignment_factor, false); + + if (!skip_dry_run) + scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, false); } p += sizeof(frame->size) + frame->size; @@ -250,13 +291,3 @@ int noinstr scs_patch(const u8 eh_frame[], int size) } return 0; } - -asmlinkage void __init scs_patch_vmlinux(void) -{ - if (!should_patch_pac_into_scs()) - return; - - WARN_ON(scs_patch(__eh_frame_start, __eh_frame_end - __eh_frame_start)); - icache_inval_all_pou(); - isb(); -} diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h new file mode 100644 index 000000000000..aec3172d4003 --- /dev/null +++ b/arch/arm64/kernel/pi/pi.h @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/types.h> + +#define __prel64_initconst __section(".init.rodata.prel64") + +#define PREL64(type, name) union { type *name; prel64_t name ## _prel; } + +#define prel64_pointer(__d) (typeof(__d))prel64_to_pointer(&__d##_prel) + +typedef volatile signed long prel64_t; + +static inline void *prel64_to_pointer(const prel64_t *offset) +{ + if (!*offset) + return NULL; + return (void *)offset + *offset; +} + +extern bool dynamic_scs_is_enabled; + +extern pgd_t init_idmap_pg_dir[], init_idmap_pg_end[]; +extern pgd_t init_pg_dir[], init_pg_end[]; + +void init_feature_override(u64 boot_status, const void *fdt, int chosen); +u64 kaslr_early_init(void *fdt, int chosen); +void relocate_kernel(u64 offset); +int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run); + +void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset); + +asmlinkage void early_map_kernel(u64 boot_status, phys_addr_t fdt); + +asmlinkage phys_addr_t create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); diff --git a/arch/arm64/kernel/pi/relacheck.c b/arch/arm64/kernel/pi/relacheck.c new file mode 100644 index 000000000000..b0cd4d0d275b --- /dev/null +++ b/arch/arm64/kernel/pi/relacheck.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 - Google LLC + * Author: Ard Biesheuvel <ardb@google.com> + */ + +#include <elf.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define HOST_ORDER ELFDATA2LSB +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define HOST_ORDER ELFDATA2MSB +#endif + +static Elf64_Ehdr *ehdr; +static Elf64_Shdr *shdr; +static const char *strtab; +static bool swap; + +static uint64_t swab_elfxword(uint64_t val) +{ + return swap ? __builtin_bswap64(val) : val; +} + +static uint32_t swab_elfword(uint32_t val) +{ + return swap ? __builtin_bswap32(val) : val; +} + +static uint16_t swab_elfhword(uint16_t val) +{ + return swap ? __builtin_bswap16(val) : val; +} + +int main(int argc, char *argv[]) +{ + struct stat stat; + int fd, ret; + + if (argc < 3) { + fprintf(stderr, "file arguments missing\n"); + exit(EXIT_FAILURE); + } + + fd = open(argv[1], O_RDWR); + if (fd < 0) { + fprintf(stderr, "failed to open %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ret = fstat(fd, &stat); + if (ret < 0) { + fprintf(stderr, "failed to stat() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ehdr = mmap(0, stat.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ehdr == MAP_FAILED) { + fprintf(stderr, "failed to mmap() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + swap = ehdr->e_ident[EI_DATA] != HOST_ORDER; + shdr = (void *)ehdr + swab_elfxword(ehdr->e_shoff); + strtab = (void *)ehdr + + swab_elfxword(shdr[swab_elfhword(ehdr->e_shstrndx)].sh_offset); + + for (int i = 0; i < swab_elfhword(ehdr->e_shnum); i++) { + unsigned long info, flags; + bool prel64 = false; + Elf64_Rela *rela; + int numrela; + + if (swab_elfword(shdr[i].sh_type) != SHT_RELA) + continue; + + /* only consider RELA sections operating on data */ + info = swab_elfword(shdr[i].sh_info); + flags = swab_elfxword(shdr[info].sh_flags); + if ((flags & (SHF_ALLOC | SHF_EXECINSTR)) != SHF_ALLOC) + continue; + + /* + * We generally don't permit ABS64 relocations in the code that + * runs before relocation processing occurs. If statically + * initialized absolute symbol references are unavoidable, they + * may be emitted into a *.rodata.prel64 section and they will + * be converted to place-relative 64-bit references. This + * requires special handling in the referring code. + */ + if (strstr(strtab + swab_elfword(shdr[info].sh_name), + ".rodata.prel64")) { + prel64 = true; + } + + rela = (void *)ehdr + swab_elfxword(shdr[i].sh_offset); + numrela = swab_elfxword(shdr[i].sh_size) / sizeof(*rela); + + for (int j = 0; j < numrela; j++) { + uint64_t info = swab_elfxword(rela[j].r_info); + + if (ELF64_R_TYPE(info) != R_AARCH64_ABS64) + continue; + + if (prel64) { + /* convert ABS64 into PREL64 */ + info ^= R_AARCH64_ABS64 ^ R_AARCH64_PREL64; + rela[j].r_info = swab_elfxword(info); + } else { + fprintf(stderr, + "Unexpected absolute relocations detected in %s\n", + argv[2]); + close(fd); + unlink(argv[1]); + exit(EXIT_FAILURE); + } + } + } + close(fd); + return 0; +} diff --git a/arch/arm64/kernel/pi/relocate.c b/arch/arm64/kernel/pi/relocate.c new file mode 100644 index 000000000000..2407d2696398 --- /dev/null +++ b/arch/arm64/kernel/pi/relocate.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Authors: Ard Biesheuvel <ardb@google.com> +// Peter Collingbourne <pcc@google.com> + +#include <linux/elf.h> +#include <linux/init.h> +#include <linux/types.h> + +#include "pi.h" + +extern const Elf64_Rela rela_start[], rela_end[]; +extern const u64 relr_start[], relr_end[]; + +void __init relocate_kernel(u64 offset) +{ + u64 *place = NULL; + + for (const Elf64_Rela *rela = rela_start; rela < rela_end; rela++) { + if (ELF64_R_TYPE(rela->r_info) != R_AARCH64_RELATIVE) + continue; + *(u64 *)(rela->r_offset + offset) = rela->r_addend + offset; + } + + if (!IS_ENABLED(CONFIG_RELR) || !offset) + return; + + /* + * Apply RELR relocations. + * + * RELR is a compressed format for storing relative relocations. The + * encoded sequence of entries looks like: + * [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBB1 ... ] + * + * i.e. start with an address, followed by any number of bitmaps. The + * address entry encodes 1 relocation. The subsequent bitmap entries + * encode up to 63 relocations each, at subsequent offsets following + * the last address entry. + * + * The bitmap entries must have 1 in the least significant bit. The + * assumption here is that an address cannot have 1 in lsb. Odd + * addresses are not supported. Any odd addresses are stored in the + * RELA section, which is handled above. + * + * With the exception of the least significant bit, each bit in the + * bitmap corresponds with a machine word that follows the base address + * word, and the bit value indicates whether or not a relocation needs + * to be applied to it. The second least significant bit represents the + * machine word immediately following the initial address, and each bit + * that follows represents the next word, in linear order. As such, a + * single bitmap can encode up to 63 relocations in a 64-bit object. + */ + for (const u64 *relr = relr_start; relr < relr_end; relr++) { + if ((*relr & 1) == 0) { + place = (u64 *)(*relr + offset); + *place++ += offset; + } else { + for (u64 *p = place, r = *relr >> 1; r; p++, r >>= 1) + if (r & 1) + *p += offset; + place += 63; + } + } +} diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 968d5fffe233..4137cc5ef031 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -58,10 +58,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * Instructions which load PC relative literals are not going to work * when executed from an XOL slot. Instructions doing an exclusive * load/store are not going to complete successfully when single-step - * exception handling happens in the middle of the sequence. + * exception handling happens in the middle of the sequence. Memory + * copy/set instructions require that all three instructions be placed + * consecutively in memory. */ if (aarch64_insn_uses_literal(insn) || - aarch64_insn_is_exclusive(insn)) + aarch64_insn_is_exclusive(insn) || + aarch64_insn_is_mops(insn)) return false; return true; @@ -73,9 +76,18 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot. */ enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) { /* + * While 'nop' instruction can execute in the out-of-line slot, + * simulating them in breakpoint handling offers better performance. + */ + if (aarch64_insn_is_nop(insn)) { + api->handler = simulate_nop; + return INSN_GOOD_NO_SLOT; + } + + /* * Instructions reading or modifying the PC won't work from the XOL * slot. */ @@ -96,13 +108,10 @@ arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) aarch64_insn_is_bl(insn)) { api->handler = simulate_b_bl; } else if (aarch64_insn_is_br(insn) || - aarch64_insn_is_blr(insn) || - aarch64_insn_is_ret(insn)) { - api->handler = simulate_br_blr_ret; - } else if (aarch64_insn_is_ldr_lit(insn)) { - api->handler = simulate_ldr_literal; - } else if (aarch64_insn_is_ldrsw_lit(insn)) { - api->handler = simulate_ldrsw_literal; + aarch64_insn_is_blr(insn)) { + api->handler = simulate_br_blr; + } else if (aarch64_insn_is_ret(insn)) { + api->handler = simulate_ret; } else { /* * Instruction cannot be stepped out-of-line and we don't @@ -137,9 +146,20 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { enum probe_insn decoded; - probe_opcode_t insn = le32_to_cpu(*addr); - probe_opcode_t *scan_end = NULL; + u32 insn = le32_to_cpu(*addr); + kprobe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; + struct arch_probe_insn *api = &asi->api; + + if (aarch64_insn_is_ldr_lit(insn)) { + api->handler = simulate_ldr_literal; + decoded = INSN_GOOD_NO_SLOT; + } else if (aarch64_insn_is_ldrsw_lit(insn)) { + api->handler = simulate_ldrsw_literal; + decoded = INSN_GOOD_NO_SLOT; + } else { + decoded = arm_probe_decode_insn(insn, &asi->api); + } /* * If there's a symbol defined in front of and near enough to @@ -157,7 +177,6 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) else scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; } - decoded = arm_probe_decode_insn(insn, &asi->api); if (decoded != INSN_REJECTED && scan_end) if (is_probed_address_atomic(addr - 1, scan_end)) diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index 8b758c5a2062..0e4195de8206 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -28,6 +28,6 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); #endif enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 70b91a8c6bb3..43a0361a8bf0 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) "kprobes: " fmt +#include <linux/execmem.h> #include <linux/extable.h> #include <linux/kasan.h> #include <linux/kernel.h> @@ -27,7 +28,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> #include <asm/irq.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/ptrace.h> #include <asm/sections.h> #include <asm/system_misc.h> @@ -41,9 +42,23 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); static void __kprobes post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); +void *alloc_insn_page(void) +{ + void *addr; + + addr = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); + if (!addr) + return NULL; + if (set_memory_rox((unsigned long)addr, 1)) { + execmem_free(addr); + return NULL; + } + return addr; +} + static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { - kprobe_opcode_t *addr = p->ainsn.api.insn; + kprobe_opcode_t *addr = p->ainsn.xol_insn; /* * Prepare insn slot, Mark Rutland points out it depends on a coupe of @@ -64,20 +79,20 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p) * the BRK exception handler, so it is unnecessary to generate * Contex-Synchronization-Event via ISB again. */ - aarch64_insn_patch_text_nosync(addr, p->opcode); + aarch64_insn_patch_text_nosync(addr, le32_to_cpu(p->opcode)); aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS); /* * Needs restoring of return address after stepping xol. */ - p->ainsn.api.restore = (unsigned long) p->addr + + p->ainsn.xol_restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol. No need to adjust the PC */ - p->ainsn.api.restore = 0; + p->ainsn.xol_restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) @@ -85,7 +100,7 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); if (p->ainsn.api.handler) - p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); + p->ainsn.api.handler(le32_to_cpu(p->opcode), (long)p->addr, regs); /* single step simulated, now go for post processing */ post_kprobe_handler(p, kcb, regs); @@ -99,7 +114,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; /* copy instruction */ - p->opcode = le32_to_cpu(*p->addr); + p->opcode = *p->addr; if (search_exception_tables(probe_addr)) return -EINVAL; @@ -110,18 +125,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.api.insn = NULL; + p->ainsn.xol_insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.api.insn = get_insn_slot(); - if (!p->ainsn.api.insn) + p->ainsn.xol_insn = get_insn_slot(); + if (!p->ainsn.xol_insn) return -ENOMEM; break; } /* prepare the instruction */ - if (p->ainsn.api.insn) + if (p->ainsn.xol_insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -129,13 +144,6 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } -void *alloc_insn_page(void) -{ - return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - /* arm kprobe: install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { @@ -149,15 +157,16 @@ void __kprobes arch_arm_kprobe(struct kprobe *p) void __kprobes arch_disarm_kprobe(struct kprobe *p) { void *addr = p->addr; + u32 insn = le32_to_cpu(p->opcode); - aarch64_insn_patch_text(&addr, &p->opcode, 1); + aarch64_insn_patch_text(&addr, &insn, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.api.insn) { - free_insn_slot(p->ainsn.api.insn, 0); - p->ainsn.api.insn = NULL; + if (p->ainsn.xol_insn) { + free_insn_slot(p->ainsn.xol_insn, 0); + p->ainsn.xol_insn = NULL; } } @@ -212,9 +221,9 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.api.insn) { + if (p->ainsn.xol_insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.api.insn; + slot = (unsigned long)p->ainsn.xol_insn; kprobes_save_local_irqflag(kcb, regs); instruction_pointer_set(regs, slot); @@ -252,8 +261,8 @@ static void __kprobes post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs) { /* return addr restore if non-branching insn */ - if (cur->ainsn.api.restore != 0) - instruction_pointer_set(regs, cur->ainsn.api.restore); + if (cur->ainsn.xol_restore != 0) + instruction_pointer_set(regs, cur->ainsn.xol_restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -298,8 +307,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) return 0; } -static int __kprobes -kprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) +int __kprobes +kprobe_brk_handler(struct pt_regs *regs, unsigned long esr) { struct kprobe *p, *cur_kprobe; struct kprobe_ctlblk *kcb; @@ -342,20 +351,15 @@ kprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_HANDLED; } -static struct break_hook kprobes_break_hook = { - .imm = KPROBES_BRK_IMM, - .fn = kprobe_breakpoint_handler, -}; - -static int __kprobes -kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) +int __kprobes +kprobe_ss_brk_handler(struct pt_regs *regs, unsigned long esr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); unsigned long addr = instruction_pointer(regs); struct kprobe *cur = kprobe_running(); if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) && - ((unsigned long)&cur->ainsn.api.insn[1] == addr)) { + ((unsigned long)&cur->ainsn.xol_insn[1] == addr)) { kprobes_restore_local_irqflag(kcb, regs); post_kprobe_handler(cur, kcb, regs); @@ -366,10 +370,15 @@ kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_ERROR; } -static struct break_hook kprobes_break_ss_hook = { - .imm = KPROBES_BRK_SS_IMM, - .fn = kprobe_breakpoint_ss_handler, -}; +int __kprobes +kretprobe_brk_handler(struct pt_regs *regs, unsigned long esr) +{ + if (regs->pc != (unsigned long)__kretprobe_trampoline) + return DBG_HOOK_ERROR; + + regs->pc = kretprobe_trampoline_handler(regs, (void *)regs->regs[29]); + return DBG_HOOK_HANDLED; +} /* * Provide a blacklist of symbols identifying ranges which cannot be kprobed. @@ -396,11 +405,6 @@ int __init arch_populate_kprobe_blacklist(void) return ret; } -void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) -{ - return (void *)kretprobe_trampoline_handler(regs, (void *)regs->regs[29]); -} - void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -418,8 +422,5 @@ int __kprobes arch_trampoline_kprobe(struct kprobe *p) int __init arch_init_kprobes(void) { - register_kernel_break_hook(&kprobes_break_hook); - register_kernel_break_hook(&kprobes_break_ss_hook); - return 0; } diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 9a6499bed58b..b60739d3983f 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -4,83 +4,17 @@ */ #include <linux/linkage.h> -#include <asm/asm-offsets.h> +#include <asm/asm-bug.h> #include <asm/assembler.h> .text - .macro save_all_base_regs - stp x0, x1, [sp, #S_X0] - stp x2, x3, [sp, #S_X2] - stp x4, x5, [sp, #S_X4] - stp x6, x7, [sp, #S_X6] - stp x8, x9, [sp, #S_X8] - stp x10, x11, [sp, #S_X10] - stp x12, x13, [sp, #S_X12] - stp x14, x15, [sp, #S_X14] - stp x16, x17, [sp, #S_X16] - stp x18, x19, [sp, #S_X18] - stp x20, x21, [sp, #S_X20] - stp x22, x23, [sp, #S_X22] - stp x24, x25, [sp, #S_X24] - stp x26, x27, [sp, #S_X26] - stp x28, x29, [sp, #S_X28] - add x0, sp, #PT_REGS_SIZE - stp lr, x0, [sp, #S_LR] - /* - * Construct a useful saved PSTATE - */ - mrs x0, nzcv - mrs x1, daif - orr x0, x0, x1 - mrs x1, CurrentEL - orr x0, x0, x1 - mrs x1, SPSel - orr x0, x0, x1 - stp xzr, x0, [sp, #S_PC] - .endm - - .macro restore_all_base_regs - ldr x0, [sp, #S_PSTATE] - and x0, x0, #(PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT) - msr nzcv, x0 - ldp x0, x1, [sp, #S_X0] - ldp x2, x3, [sp, #S_X2] - ldp x4, x5, [sp, #S_X4] - ldp x6, x7, [sp, #S_X6] - ldp x8, x9, [sp, #S_X8] - ldp x10, x11, [sp, #S_X10] - ldp x12, x13, [sp, #S_X12] - ldp x14, x15, [sp, #S_X14] - ldp x16, x17, [sp, #S_X16] - ldp x18, x19, [sp, #S_X18] - ldp x20, x21, [sp, #S_X20] - ldp x22, x23, [sp, #S_X22] - ldp x24, x25, [sp, #S_X24] - ldp x26, x27, [sp, #S_X26] - ldp x28, x29, [sp, #S_X28] - .endm - SYM_CODE_START(__kretprobe_trampoline) - sub sp, sp, #PT_REGS_SIZE - - save_all_base_regs - - /* Setup a frame pointer. */ - add x29, sp, #S_FP - - mov x0, sp - bl trampoline_probe_handler /* - * Replace trampoline address in lr with actual orig_ret_addr return - * address. + * Trigger a breakpoint exception. The PC will be adjusted by + * kretprobe_brk_handler(), and no subsequent instructions will + * be executed from the trampoline. */ - mov lr, x0 - - /* The frame pointer (x29) is restored with other registers. */ - restore_all_base_regs - - add sp, sp, #PT_REGS_SIZE - ret - + brk #KRETPROBES_BRK_IMM + ASM_BUG() SYM_CODE_END(__kretprobe_trampoline) diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 22d0b3252476..89fbeb32107e 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -13,6 +13,7 @@ #include <asm/traps.h> #include "simulate-insn.h" +#include "asm/gcs.h" #define bbl_displacement(insn) \ sign_extend32(((insn) & 0x3ffffff) << 2, 27) @@ -49,6 +50,21 @@ static inline u32 get_w_reg(struct pt_regs *regs, int reg) return lower_32_bits(pt_regs_read_reg(regs, reg)); } +static inline int update_lr(struct pt_regs *regs, long addr) +{ + int err = 0; + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + push_user_gcs(addr, &err); + if (err) { + force_sig(SIGSEGV); + return err; + } + } + procedure_link_pointer_set(regs, addr); + return err; +} + static bool __kprobes check_cbz(u32 opcode, struct pt_regs *regs) { int xn = opcode & 0x1f; @@ -107,9 +123,9 @@ simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs) { int disp = bbl_displacement(opcode); - /* Link register is x30 */ if (opcode & (1 << 31)) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; instruction_pointer_set(regs, addr + disp); } @@ -126,16 +142,34 @@ simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs) } void __kprobes -simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs) +simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) { int xn = (opcode >> 5) & 0x1f; + u64 b_target = get_x_reg(regs, xn); - /* update pc first in case we're doing a "blr lr" */ - instruction_pointer_set(regs, get_x_reg(regs, xn)); - - /* Link register is x30 */ if (((opcode >> 21) & 0x3) == 1) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; + + instruction_pointer_set(regs, b_target); +} + +void __kprobes +simulate_ret(u32 opcode, long addr, struct pt_regs *regs) +{ + u64 ret_addr; + int err = 0; + int xn = (opcode >> 5) & 0x1f; + u64 r_target = get_x_reg(regs, xn); + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + ret_addr = pop_user_gcs(&err); + if (err || ret_addr != r_target) { + force_sig(SIGSEGV); + return; + } + } + instruction_pointer_set(regs, r_target); } void __kprobes @@ -171,17 +205,15 @@ simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs) void __kprobes simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs) { - u64 *load_addr; + unsigned long load_addr; int xn = opcode & 0x1f; - int disp; - disp = ldr_displacement(opcode); - load_addr = (u64 *) (addr + disp); + load_addr = addr + ldr_displacement(opcode); if (opcode & (1 << 30)) /* x0-x30 */ - set_x_reg(regs, xn, *load_addr); + set_x_reg(regs, xn, READ_ONCE(*(u64 *)load_addr)); else /* w0-w30 */ - set_w_reg(regs, xn, *load_addr); + set_w_reg(regs, xn, READ_ONCE(*(u32 *)load_addr)); instruction_pointer_set(regs, instruction_pointer(regs) + 4); } @@ -189,14 +221,18 @@ simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs) void __kprobes simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs) { - s32 *load_addr; + unsigned long load_addr; int xn = opcode & 0x1f; - int disp; - disp = ldr_displacement(opcode); - load_addr = (s32 *) (addr + disp); + load_addr = addr + ldr_displacement(opcode); - set_x_reg(regs, xn, *load_addr); + set_x_reg(regs, xn, READ_ONCE(*(s32 *)load_addr)); instruction_pointer_set(regs, instruction_pointer(regs) + 4); } + +void __kprobes +simulate_nop(u32 opcode, long addr, struct pt_regs *regs) +{ + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); +} diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index e065dc92218e..9e772a292d56 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -11,10 +11,12 @@ void simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs); -void simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs); +void simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs); +void simulate_ret(u32 opcode, long addr, struct pt_regs *regs); void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs); +void simulate_nop(u32 opcode, long addr, struct pt_regs *regs); #endif /* _ARM_KERNEL_KPROBES_SIMULATE_INSN_H */ diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index d49aef2657cd..941668800aea 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -6,6 +6,7 @@ #include <linux/ptrace.h> #include <linux/uprobes.h> #include <asm/cacheflush.h> +#include <asm/gcs.h> #include "decode-insn.h" @@ -17,12 +18,20 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *xol_page_kaddr = kmap_atomic(page); void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + /* + * Initial cache maintenance of the xol page done via set_pte_at(). + * Subsequent CMOs only needed if the xol slot changes. + */ + if (!memcmp(dst, src, len)) + goto done; + /* Initialize the slot */ memcpy(dst, src, len); /* flush caches (dcache/icache) */ sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); +done: kunmap_atomic(xol_page_kaddr); } @@ -34,7 +43,7 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - probe_opcode_t insn; + u32 insn; /* TODO: Currently we do not support AARCH32 instruction probing */ if (mm->context.flags & MMCF_AARCH32) @@ -42,7 +51,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; - insn = *(probe_opcode_t *)(&auprobe->insn[0]); + insn = le32_to_cpu(auprobe->insn); switch (arm_probe_decode_insn(insn, &auprobe->api)) { case INSN_REJECTED: @@ -102,13 +111,13 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - probe_opcode_t insn; + u32 insn; unsigned long addr; if (!auprobe->simulate) return false; - insn = *(probe_opcode_t *)(&auprobe->insn[0]); + insn = le32_to_cpu(auprobe->insn); addr = instruction_pointer(regs); if (auprobe->api.handler) @@ -122,7 +131,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; /* - * Task has received a fatal signal, so reset back to probbed + * Task has received a fatal signal, so reset back to probed * address. */ instruction_pointer_set(regs, utask->vaddr); @@ -151,11 +160,43 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { unsigned long orig_ret_vaddr; + unsigned long gcs_ret_vaddr; + int err = 0; + u64 gcspr; orig_ret_vaddr = procedure_link_pointer(regs); + + if (task_gcs_el0_enabled(current)) { + gcspr = read_sysreg_s(SYS_GCSPR_EL0); + gcs_ret_vaddr = get_user_gcs((__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + + /* + * If the LR and GCS return addr don't match, then some kind of PAC + * signing or control flow occurred since entering the probed function. + * Likely because the user is attempting to retprobe on an instruction + * that isn't a function boundary or inside a leaf function. Explicitly + * abort this retprobe because it will generate a GCS exception. + */ + if (gcs_ret_vaddr != orig_ret_vaddr) { + orig_ret_vaddr = -1; + goto out; + } + + put_user_gcs(trampoline_vaddr, (__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + } + /* Replace the return addr with trampoline addr */ procedure_link_pointer_set(regs, trampoline_vaddr); +out: return orig_ret_vaddr; } @@ -165,7 +206,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, return NOTIFY_DONE; } -static int uprobe_breakpoint_handler(struct pt_regs *regs, +int uprobe_brk_handler(struct pt_regs *regs, unsigned long esr) { if (uprobe_pre_sstep_notifier(regs)) @@ -174,7 +215,7 @@ static int uprobe_breakpoint_handler(struct pt_regs *regs, return DBG_HOOK_ERROR; } -static int uprobe_single_step_handler(struct pt_regs *regs, +int uprobe_single_step_handler(struct pt_regs *regs, unsigned long esr) { struct uprobe_task *utask = current->utask; @@ -186,23 +227,3 @@ static int uprobe_single_step_handler(struct pt_regs *regs, return DBG_HOOK_ERROR; } -/* uprobe breakpoint handler hook */ -static struct break_hook uprobes_break_hook = { - .imm = UPROBES_BRK_IMM, - .fn = uprobe_breakpoint_handler, -}; - -/* uprobe single step handler hook */ -static struct step_hook uprobes_step_hook = { - .fn = uprobe_single_step_handler, -}; - -static int __init arch_init_uprobes(void) -{ - register_user_break_hook(&uprobes_break_hook); - register_user_step_hook(&uprobes_step_hook); - - return 0; -} - -device_initcall(arch_init_uprobes); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0fcc4eb1a7ab..fba7ca102a8c 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -43,11 +43,13 @@ #include <linux/stacktrace.h> #include <asm/alternative.h> +#include <asm/arch_timer.h> #include <asm/compat.h> #include <asm/cpufeature.h> #include <asm/cacheflush.h> #include <asm/exec.h> #include <asm/fpsimd.h> +#include <asm/gcs.h> #include <asm/mmu_context.h> #include <asm/mte.h> #include <asm/processor.h> @@ -226,7 +228,7 @@ void __show_regs(struct pt_regs *regs) printk("sp : %016llx\n", sp); if (system_uses_irq_prio_masking()) - printk("pmr_save: %08llx\n", regs->pmr_save); + printk("pmr: %08x\n", regs->pmr); i = top_reg; @@ -271,12 +273,69 @@ static void flush_tagged_addr_state(void) clear_thread_flag(TIF_TAGGED_ADDR); } +static void flush_poe(void) +{ + if (!system_supports_poe()) + return; + + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + +#ifdef CONFIG_ARM64_GCS + +static void flush_gcs(void) +{ + if (!system_supports_gcs()) + return; + + current->thread.gcspr_el0 = 0; + current->thread.gcs_base = 0; + current->thread.gcs_size = 0; + current->thread.gcs_el0_mode = 0; + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); + write_sysreg_s(0, SYS_GCSPR_EL0); +} + +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + unsigned long gcs; + + if (!system_supports_gcs()) + return 0; + + p->thread.gcs_base = 0; + p->thread.gcs_size = 0; + + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; + + gcs = gcs_alloc_thread_stack(p, args); + if (IS_ERR_VALUE(gcs)) + return PTR_ERR((void *)gcs); + + return 0; +} + +#else + +static void flush_gcs(void) { } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + return 0; +} + +#endif + void flush_thread(void) { fpsimd_flush_thread(); tls_thread_flush(); flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); + flush_poe(); + flush_gcs(); } void arch_release_task_struct(struct task_struct *tsk) @@ -286,53 +345,34 @@ void arch_release_task_struct(struct task_struct *tsk) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - if (current->mm) - fpsimd_preserve_current_state(); - *dst = *src; + /* + * The current/src task's FPSIMD state may or may not be live, and may + * have been altered by ptrace after entry to the kernel. Save the + * effective FPSIMD state so that this will be copied into dst. + */ + fpsimd_save_and_flush_current_state(); + fpsimd_sync_from_effective_state(src); - /* We rely on the above assignment to initialize dst's thread_flags: */ - BUILD_BUG_ON(!IS_ENABLED(CONFIG_THREAD_INFO_IN_TASK)); + *dst = *src; /* - * Detach src's sve_state (if any) from dst so that it does not - * get erroneously used or freed prematurely. dst's copies - * will be allocated on demand later on if dst uses SVE. - * For consistency, also clear TIF_SVE here: this could be done - * later in copy_process(), but to avoid tripping up future - * maintainers it is best not to leave TIF flags and buffers in - * an inconsistent state, even temporarily. + * Drop stale reference to src's sve_state and convert dst to + * non-streaming FPSIMD mode. */ + dst->thread.fp_type = FP_STATE_FPSIMD; dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + task_smstop_sm(dst); /* - * In the unlikely event that we create a new thread with ZA - * enabled we should retain the ZA and ZT state so duplicate - * it here. This may be shortly freed if we exec() or if - * CLONE_SETTLS but it's simpler to do it here. To avoid - * confusing the rest of the code ensure that we have a - * sve_state allocated whenever sme_state is allocated. + * Drop stale reference to src's sme_state and ensure dst has ZA + * disabled. + * + * When necessary, ZA will be inherited later in copy_thread_za(). */ - if (thread_za_enabled(&src->thread)) { - dst->thread.sve_state = kzalloc(sve_state_size(src), - GFP_KERNEL); - if (!dst->thread.sve_state) - return -ENOMEM; - - dst->thread.sme_state = kmemdup(src->thread.sme_state, - sme_state_size(src), - GFP_KERNEL); - if (!dst->thread.sme_state) { - kfree(dst->thread.sve_state); - dst->thread.sve_state = NULL; - return -ENOMEM; - } - } else { - dst->thread.sme_state = NULL; - clear_tsk_thread_flag(dst, TIF_SME); - } - - dst->thread.fp_type = FP_STATE_FPSIMD; + dst->thread.sme_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr &= ~SVCR_ZA_MASK; /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); @@ -340,14 +380,40 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } +static int copy_thread_za(struct task_struct *dst, struct task_struct *src) +{ + if (!thread_za_enabled(&src->thread)) + return 0; + + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + + dst->thread.sme_state = kmemdup(src->thread.sme_state, + sme_state_size(src), + GFP_KERNEL); + if (!dst->thread.sme_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + + set_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr |= SVCR_ZA_MASK; + + return 0; +} + asmlinkage void ret_from_fork(void) asm("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + int ret; memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); @@ -371,8 +437,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); - if (system_supports_tpidr2()) - p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + + if (system_supports_poe()) + p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); if (stack_start) { if (is_compat_thread(task_thread_info(p))) @@ -382,13 +449,43 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) } /* + * Due to the AAPCS64 "ZA lazy saving scheme", PSTATE.ZA and + * TPIDR2 need to be manipulated as a pair, and either both + * need to be inherited or both need to be reset. + * + * Within a process, child threads must not inherit their + * parent's TPIDR2 value or they may clobber their parent's + * stack at some later point. + * + * When a process is fork()'d, the child must inherit ZA and + * TPIDR2 from its parent in case there was dormant ZA state. + * + * Use CLONE_VM to determine when the child will share the + * address space with the parent, and cannot safely inherit the + * state. + */ + if (system_supports_sme()) { + if (!(clone_flags & CLONE_VM)) { + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + ret = copy_thread_za(p, current); + if (ret) + return ret; + } else { + p->thread.tpidr2_el0 = 0; + WARN_ON_ONCE(p->thread.svcr & SVCR_ZA_MASK); + } + } + + /* * If a TLS pointer was passed to clone, use it for the new - * thread. We also reset TPIDR2 if it's in use. + * thread. */ - if (clone_flags & CLONE_SETTLS) { + if (clone_flags & CLONE_SETTLS) p->thread.uw.tp_value = tls; - p->thread.tpidr2_el0 = 0; - } + + ret = copy_thread_gcs(p, args); + if (ret != 0) + return ret; } else { /* * A kthread has no context to ERET to, so ensure any buggy @@ -399,9 +496,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; + childregs->stackframe.type = FRAME_META_TYPE_FINAL; p->thread.cpu_context.x19 = (unsigned long)args->fn; p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; + + if (system_supports_poe()) + p->thread.por_el0 = POR_EL0_INIT; } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; @@ -409,7 +510,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * For the benefit of the unwinder, set up childregs->stackframe * as the final frame for the new task. */ - p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; + p->thread.cpu_context.fp = (unsigned long)&childregs->stackframe; ptrace_hw_copy_thread(p); @@ -429,7 +530,7 @@ static void tls_thread_switch(struct task_struct *next) if (is_compat_thread(task_thread_info(next))) write_sysreg(next->thread.uw.tp_value, tpidrro_el0); - else if (!arm64_kernel_unmapped_at_el0()) + else write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0); @@ -454,7 +555,7 @@ static void ssbs_thread_switch(struct task_struct *next) * If all CPUs implement the SSBS extension, then we just need to * context-switch the PSTATE field. */ - if (cpus_have_const_cap(ARM64_SSBS)) + if (alternative_has_cap_unlikely(ARM64_SSBS)) return; spectre_v4_enable_task_mitigation(next); @@ -474,28 +575,109 @@ static void entry_task_switch(struct task_struct *next) __this_cpu_write(__entry_task, next); } +#ifdef CONFIG_ARM64_GCS + +void gcs_preserve_current_state(void) +{ + current->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); +} + +static void gcs_thread_switch(struct task_struct *next) +{ + if (!system_supports_gcs()) + return; + + /* GCSPR_EL0 is always readable */ + gcs_preserve_current_state(); + write_sysreg_s(next->thread.gcspr_el0, SYS_GCSPR_EL0); + + if (current->thread.gcs_el0_mode != next->thread.gcs_el0_mode) + gcs_set_el0_mode(next); + + /* + * Ensure that GCS memory effects of the 'prev' thread are + * ordered before other memory accesses with release semantics + * (or preceded by a DMB) on the current PE. In addition, any + * memory accesses with acquire semantics (or succeeded by a + * DMB) are ordered before GCS memory effects of the 'next' + * thread. This will ensure that the GCS memory effects are + * visible to other PEs in case of migration. + */ + if (task_gcs_el0_enabled(current) || task_gcs_el0_enabled(next)) + gcsb_dsync(); +} + +#else + +static void gcs_thread_switch(struct task_struct *next) +{ +} + +#endif + /* - * ARM erratum 1418040 handling, affecting the 32bit view of CNTVCT. - * Ensure access is disabled when switching to a 32bit task, ensure - * access is enabled when switching to a 64bit task. + * Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of + * CNTVCT, various other errata which require trapping all CNTVCT{,_EL0} + * accesses and prctl(PR_SET_TSC). Ensure access is disabled iff a workaround is + * required or PR_TSC_SIGSEGV is set. */ -static void erratum_1418040_thread_switch(struct task_struct *next) +static void update_cntkctl_el1(struct task_struct *next) { - if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) || - !this_cpu_has_cap(ARM64_WORKAROUND_1418040)) - return; + struct thread_info *ti = task_thread_info(next); - if (is_compat_thread(task_thread_info(next))) + if (test_ti_thread_flag(ti, TIF_TSC_SIGSEGV) || + has_erratum_handler(read_cntvct_el0) || + (IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) && + this_cpu_has_cap(ARM64_WORKAROUND_1418040) && + is_compat_thread(ti))) sysreg_clear_set(cntkctl_el1, ARCH_TIMER_USR_VCT_ACCESS_EN, 0); else sysreg_clear_set(cntkctl_el1, 0, ARCH_TIMER_USR_VCT_ACCESS_EN); } -static void erratum_1418040_new_exec(void) +static void cntkctl_thread_switch(struct task_struct *prev, + struct task_struct *next) { + if ((read_ti_thread_flags(task_thread_info(prev)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV)) != + (read_ti_thread_flags(task_thread_info(next)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV))) + update_cntkctl_el1(next); +} + +static int do_set_tsc_mode(unsigned int val) +{ + bool tsc_sigsegv; + + if (val == PR_TSC_SIGSEGV) + tsc_sigsegv = true; + else if (val == PR_TSC_ENABLE) + tsc_sigsegv = false; + else + return -EINVAL; + preempt_disable(); - erratum_1418040_thread_switch(current); + update_thread_flag(TIF_TSC_SIGSEGV, tsc_sigsegv); + update_cntkctl_el1(current); preempt_enable(); + + return 0; +} + +static void permission_overlay_switch(struct task_struct *next) +{ + if (!system_supports_poe()) + return; + + current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (current->thread.por_el0 != next->thread.por_el0) { + write_sysreg_s(next->thread.por_el0, SYS_POR_EL0); + /* + * No ISB required as we can tolerate spurious Overlay faults - + * the fault handler will check again based on the new value + * of POR_EL0. + */ + } } /* @@ -531,14 +713,17 @@ struct task_struct *__switch_to(struct task_struct *prev, contextidr_thread_switch(next); entry_task_switch(next); ssbs_thread_switch(next); - erratum_1418040_thread_switch(next); + cntkctl_thread_switch(prev, next); ptrauth_thread_switch_user(next); + permission_overlay_switch(next); + gcs_thread_switch(next); /* - * Complete any pending TLB or cache maintenance on this CPU in case - * the thread migrates to a different CPU. - * This full barrier is also required by the membarrier system - * call. + * Complete any pending TLB or cache maintenance on this CPU in case the + * thread migrates to a different CPU. This full barrier is also + * required by the membarrier system call. Additionally it makes any + * in-progress pgtable writes visible to the table walker; See + * emit_pte_barriers(). */ dsb(ish); @@ -648,7 +833,7 @@ void arch_setup_new_exec(void) current->mm->context.flags = mmflags; ptrauth_thread_init_user(); mte_thread_init_user(); - erratum_1418040_new_exec(); + do_set_tsc_mode(PR_TSC_ENABLE); if (task_spec_ssb_noexec(current)) { arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS, @@ -670,10 +855,14 @@ long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) if (is_compat_thread(ti)) return -EINVAL; - if (system_supports_mte()) + if (system_supports_mte()) { valid_mask |= PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC \ | PR_MTE_TAG_MASK; + if (cpus_have_cap(ARM64_MTE_STORE_ONLY)) + valid_mask |= PR_MTE_STORE_ONLY; + } + if (arg & ~valid_mask) return -EINVAL; @@ -714,7 +903,7 @@ long get_tagged_addr_ctrl(struct task_struct *task) * disable it for tasks that already opted in to the relaxed ABI. */ -static struct ctl_table tagged_addr_sysctl_table[] = { +static const struct ctl_table tagged_addr_sysctl_table[] = { { .procname = "tagged_addr_disabled", .mode = 0644, @@ -724,7 +913,6 @@ static struct ctl_table tagged_addr_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; static int __init tagged_addr_init(void) @@ -758,3 +946,26 @@ int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, return prot; } #endif + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (is_compat_task()) + return -EINVAL; + + if (test_thread_flag(TIF_TSC_SIGSEGV)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (is_compat_task()) + return -EINVAL; + + return do_set_tsc_mode(val); +} diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 05f40c4e18fd..80a580e019c5 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -91,12 +91,7 @@ early_param("nospectre_v2", parse_spectre_v2_param); static bool spectre_v2_mitigations_off(void) { - bool ret = __nospectre_v2 || cpu_mitigations_off(); - - if (ret) - pr_info_once("spectre-v2 mitigation disabled by command line option\n"); - - return ret; + return __nospectre_v2 || cpu_mitigations_off(); } static const char *get_bhb_affected_string(enum mitigation_state bhb_state) @@ -172,7 +167,7 @@ static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void) return SPECTRE_UNAFFECTED; /* Alternatively, we have a list of unaffected CPUs */ - if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) + if (is_midr_in_range_list(spectre_v2_safe_list)) return SPECTRE_UNAFFECTED; return SPECTRE_VULNERABLE; @@ -331,7 +326,7 @@ bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope) }; WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range_list(read_cpuid_id(), spectre_v3a_unsafe_list); + return is_midr_in_range_list(spectre_v3a_unsafe_list); } void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused) @@ -421,13 +416,8 @@ early_param("ssbd", parse_spectre_v4_param); */ static bool spectre_v4_mitigations_off(void) { - bool ret = cpu_mitigations_off() || - __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; - - if (ret) - pr_info_once("spectre-v4 mitigation disabled by command-line option\n"); - - return ret; + return cpu_mitigations_off() || + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; } /* Do we need to toggle the mitigation state on entry to/exit from the kernel? */ @@ -475,7 +465,7 @@ static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void) { /* sentinel */ }, }; - if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list)) + if (is_midr_in_range_list(spectre_v4_safe_list)) return SPECTRE_UNAFFECTED; /* CPU features are detected first */ @@ -558,6 +548,18 @@ static enum mitigation_state spectre_v4_enable_hw_mitigation(void) /* SCTLR_EL1.DSSBS was initialised to 0 during boot */ set_pstate_ssbs(0); + + /* + * SSBS is self-synchronizing and is intended to affect subsequent + * speculative instructions, but some CPUs can speculate with a stale + * value of SSBS. + * + * Mitigate this with an unconditional speculation barrier, as CPUs + * could mis-speculate branches and bypass a conditional barrier. + */ + if (IS_ENABLED(CONFIG_ARM64_ERRATUM_3194386)) + spec_bar(); + return SPECTRE_MITIGATED; } @@ -833,52 +835,91 @@ static unsigned long system_bhb_mitigations; * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any * SCOPE_SYSTEM call will give the right answer. */ -u8 spectre_bhb_loop_affected(int scope) +static bool is_spectre_bhb_safe(int scope) +{ + static const struct midr_range spectre_bhb_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A520), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + {}, + }; + static bool all_safe = true; + + if (scope != SCOPE_LOCAL_CPU) + return all_safe; + + if (is_midr_in_range_list(spectre_bhb_safe_list)) + return true; + + all_safe = false; + + return false; +} + +static u8 spectre_bhb_loop_affected(void) { u8 k = 0; - static u8 max_bhb_k; - - if (scope == SCOPE_LOCAL_CPU) { - static const struct midr_range spectre_bhb_k32_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), - MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), - MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), - {}, - }; - static const struct midr_range spectre_bhb_k24_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), - {}, - }; - static const struct midr_range spectre_bhb_k11_list[] = { - MIDR_ALL_VERSIONS(MIDR_AMPERE1), - {}, - }; - static const struct midr_range spectre_bhb_k8_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - {}, - }; - - if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) - k = 32; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) - k = 24; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k11_list)) - k = 11; - else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) - k = 8; - - max_bhb_k = max(max_bhb_k, k); - } else { - k = max_bhb_k; - } + + static const struct midr_range spectre_bhb_k132_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + {}, + }; + static const struct midr_range spectre_bhb_k38_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), + {}, + }; + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD), + MIDR_ALL_VERSIONS(MIDR_HISI_HIP09), + {}, + }; + static const struct midr_range spectre_bhb_k11_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(spectre_bhb_k132_list)) + k = 132; + else if (is_midr_in_range_list(spectre_bhb_k38_list)) + k = 38; + else if (is_midr_in_range_list(spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(spectre_bhb_k11_list)) + k = 11; + else if (is_midr_in_range_list(spectre_bhb_k8_list)) + k = 8; return k; } @@ -904,29 +945,13 @@ static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) } } -static bool is_spectre_bhb_fw_affected(int scope) +static bool has_spectre_bhb_fw_mitigation(void) { - static bool system_affected; enum mitigation_state fw_state; bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE; - static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), - {}, - }; - bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), - spectre_bhb_firmware_mitigated_list); - - if (scope != SCOPE_LOCAL_CPU) - return system_affected; fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); - if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { - system_affected = true; - return true; - } - - return false; + return has_smccc && fw_state == SPECTRE_MITIGATED; } static bool supports_ecbhb(int scope) @@ -942,6 +967,8 @@ static bool supports_ecbhb(int scope) ID_AA64MMFR1_EL1_ECBHB_SHIFT); } +static u8 max_bhb_k; + bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope) { @@ -950,16 +977,23 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, if (supports_csv2p3(scope)) return false; - if (supports_clearbhb(scope)) - return true; + if (is_spectre_bhb_safe(scope)) + return false; - if (spectre_bhb_loop_affected(scope)) - return true; + /* + * At this point the core isn't known to be "safe" so we're going to + * assume it's vulnerable. We still need to update `max_bhb_k` though, + * but only if we aren't mitigating with clearbhb though. + */ + if (scope == SCOPE_LOCAL_CPU && !supports_clearbhb(SCOPE_LOCAL_CPU)) + max_bhb_k = max(max_bhb_k, spectre_bhb_loop_affected()); - if (is_spectre_bhb_fw_affected(scope)) - return true; + return true; +} - return false; +u8 get_spectre_bhb_loop_value(void) +{ + return max_bhb_k; } static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) @@ -972,14 +1006,14 @@ static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) * When KPTI is in use, the vectors are switched when exiting to * user-space. */ - if (arm64_kernel_unmapped_at_el0()) + if (cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0)) return; write_sysreg(v, vbar_el1); isb(); } -static bool __read_mostly __nospectre_bhb; +bool __read_mostly __nospectre_bhb; static int __init parse_spectre_bhb_param(char *str) { __nospectre_bhb = true; @@ -990,7 +1024,7 @@ early_param("nospectre_bhb", parse_spectre_bhb_param); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) { bp_hardening_cb_t cpu_cb; - enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + enum mitigation_state state = SPECTRE_VULNERABLE; struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) @@ -999,9 +1033,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) { /* No point mitigating Spectre-BHB alone. */ } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { - pr_info_once("spectre-bhb mitigation disabled by compile time option\n"); - } else if (cpu_mitigations_off() || __nospectre_bhb) { - pr_info_once("spectre-bhb mitigation disabled by command line option\n"); + /* Do nothing */ } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { state = SPECTRE_MITIGATED; set_bit(BHB_HW, &system_bhb_mitigations); @@ -1016,7 +1048,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); state = SPECTRE_MITIGATED; set_bit(BHB_INSN, &system_bhb_mitigations); - } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + } else if (spectre_bhb_loop_affected()) { /* * Ensure KVM uses the indirect vector which will have the * branchy-loop added. A57/A72-r0 will already have selected @@ -1029,37 +1061,39 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); state = SPECTRE_MITIGATED; set_bit(BHB_LOOP, &system_bhb_mitigations); - } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { - fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); - if (fw_state == SPECTRE_MITIGATED) { - /* - * Ensure KVM uses one of the spectre bp_hardening - * vectors. The indirect vector doesn't include the EL3 - * call, so needs upgrading to - * HYP_VECTOR_SPECTRE_INDIRECT. - */ - if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) - data->slot += 1; - - this_cpu_set_vectors(EL1_VECTOR_BHB_FW); - - /* - * The WA3 call in the vectors supersedes the WA1 call - * made during context-switch. Uninstall any firmware - * bp_hardening callback. - */ - cpu_cb = spectre_v2_get_sw_mitigation_cb(); - if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) - __this_cpu_write(bp_hardening_data.fn, NULL); - - state = SPECTRE_MITIGATED; - set_bit(BHB_FW, &system_bhb_mitigations); - } + } else if (has_spectre_bhb_fw_mitigation()) { + /* + * Ensure KVM uses one of the spectre bp_hardening + * vectors. The indirect vector doesn't include the EL3 + * call, so needs upgrading to + * HYP_VECTOR_SPECTRE_INDIRECT. + */ + if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) + data->slot += 1; + + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * The WA3 call in the vectors supersedes the WA1 call + * made during context-switch. Uninstall any firmware + * bp_hardening callback. + */ + cpu_cb = spectre_v2_get_sw_mitigation_cb(); + if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + set_bit(BHB_FW, &system_bhb_mitigations); } update_mitigation_state(&spectre_bhb_state, state); } +bool is_spectre_bhb_fw_mitigated(void) +{ + return test_bit(BHB_FW, &system_bhb_mitigations); +} + /* Patched to NOP when enabled */ void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, __le32 *origptr, @@ -1088,7 +1122,6 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, { u8 rd; u32 insn; - u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); BUG_ON(nr_inst != 1); /* MOV -> MOV */ @@ -1097,7 +1130,7 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, insn = le32_to_cpu(*origptr); rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); - insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + insn = aarch64_insn_gen_movewide(rd, max_bhb_k, 0, AARCH64_INSN_VARIANT_64BIT, AARCH64_INSN_MOVEWIDE_ZERO); *updptr++ = cpu_to_le32(insn); @@ -1154,3 +1187,18 @@ void unpriv_ebpf_notify(int new_state) pr_err("WARNING: %s", EBPF_WARN); } #endif + +void spectre_print_disabled_mitigations(void) +{ + /* Keep a single copy of the common message suffix to avoid duplication. */ + const char *spectre_disabled_suffix = "mitigation disabled by command-line option\n"; + + if (spectre_v2_mitigations_off()) + pr_info("spectre-v2 %s", spectre_disabled_suffix); + + if (spectre_v4_mitigations_off()) + pr_info("spectre-v4 %s", spectre_disabled_suffix); + + if (__nospectre_bhb || cpu_mitigations_off()) + pr_info("spectre-bhb %s", spectre_disabled_suffix); +} diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c index 29a8e444db83..fabd732d0a2d 100644 --- a/arch/arm64/kernel/psci.c +++ b/arch/arm64/kernel/psci.c @@ -40,7 +40,7 @@ static int cpu_psci_cpu_boot(unsigned int cpu) { phys_addr_t pa_secondary_entry = __pa_symbol(secondary_entry); int err = psci_ops.cpu_on(cpu_logical_map(cpu), pa_secondary_entry); - if (err) + if (err && err != -EPERM) pr_err("failed to boot CPU%d (%d)\n", cpu, err); return err; diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 20d7ef82de90..b9bdd83fbbca 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -28,11 +28,13 @@ #include <linux/hw_breakpoint.h> #include <linux/regset.h> #include <linux/elf.h> +#include <linux/rseq.h> #include <asm/compat.h> #include <asm/cpufeature.h> #include <asm/debug-monitors.h> #include <asm/fpsimd.h> +#include <asm/gcs.h> #include <asm/mte.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> @@ -139,7 +141,7 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) - return *addr; + return READ_ONCE_NOCHECK(*addr); else return 0; } @@ -173,7 +175,6 @@ static void ptrace_hbptriggered(struct perf_event *bp, struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); const char *desc = "Hardware breakpoint trap (ptrace)"; -#ifdef CONFIG_COMPAT if (is_compat_task()) { int si_errno = 0; int i; @@ -195,7 +196,7 @@ static void ptrace_hbptriggered(struct perf_event *bp, desc); return; } -#endif + arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); } @@ -593,7 +594,7 @@ static int __fpr_get(struct task_struct *target, { struct user_fpsimd_state *uregs; - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); uregs = &target->thread.uw.fpsimd_state; @@ -625,7 +626,7 @@ static int __fpr_set(struct task_struct *target, * Ensure target->thread.uw.fpsimd_state is up to date, so that a * short copyin can't resurrect stale data. */ - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); newstate = target->thread.uw.fpsimd_state; @@ -652,7 +653,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - sve_sync_from_fpsimd_zeropad(target); + fpsimd_sync_to_effective_state_zeropad(target); fpsimd_flush_task_state(target); return ret; @@ -697,6 +698,41 @@ static int tls_set(struct task_struct *target, const struct user_regset *regset, return ret; } +static int fpmr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_fpmr()) + return -EINVAL; + + if (target == current) + fpsimd_preserve_current_state(); + + return membuf_store(&to, target->thread.uw.fpmr); +} + +static int fpmr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + unsigned long fpmr; + + if (!system_supports_fpmr()) + return -EINVAL; + + fpmr = target->thread.uw.fpmr; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpmr, 0, count); + if (ret) + return ret; + + target->thread.uw.fpmr = fpmr; + + fpsimd_flush_task_state(target); + + return 0; +} + static int system_call_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) @@ -728,7 +764,6 @@ static void sve_init_header_from_task(struct user_sve_header *header, { unsigned int vq; bool active; - bool fpsimd_only; enum vec_type task_type; memset(header, 0, sizeof(*header)); @@ -740,35 +775,33 @@ static void sve_init_header_from_task(struct user_sve_header *header, task_type = ARM64_VEC_SVE; active = (task_type == type); + if (active && target->thread.fp_type == FP_STATE_SVE) + header->flags = SVE_PT_REGS_SVE; + else + header->flags = SVE_PT_REGS_FPSIMD; + switch (type) { case ARM64_VEC_SVE: if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; - fpsimd_only = !test_tsk_thread_flag(target, TIF_SVE); break; case ARM64_VEC_SME: if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; - fpsimd_only = false; break; default: WARN_ON_ONCE(1); return; } - if (active) { - if (fpsimd_only) { - header->flags |= SVE_PT_REGS_FPSIMD; - } else { - header->flags |= SVE_PT_REGS_SVE; - } - } - header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); header->max_vl = vec_max_vl(type); - header->size = SVE_PT_SIZE(vq, header->flags); + if (active) + header->size = SVE_PT_SIZE(vq, header->flags); + else + header->size = sizeof(header); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); } @@ -787,18 +820,25 @@ static int sve_get_common(struct task_struct *target, unsigned int vq; unsigned long start, end; + if (target == current) + fpsimd_preserve_current_state(); + /* Header */ sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); membuf_write(&to, &header, sizeof(header)); - if (target == current) - fpsimd_preserve_current_state(); - BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + /* + * When the requested vector type is not active, do not present data + * from the other mode to userspace. + */ + if (header.size == sizeof(header)) + return 0; + switch ((header.flags & SVE_PT_REGS_MASK)) { case SVE_PT_REGS_FPSIMD: return __fpr_get(target, regset, to); @@ -826,7 +866,7 @@ static int sve_get_common(struct task_struct *target, return membuf_zero(&to, end - start); default: - return 0; + BUILD_BUG(); } } @@ -850,6 +890,9 @@ static int sve_set_common(struct task_struct *target, struct user_sve_header header; unsigned int vq; unsigned long start, end; + bool fpsimd; + + fpsimd_flush_task_state(target); /* Header */ if (count < sizeof(header)) @@ -857,97 +900,116 @@ static int sve_set_common(struct task_struct *target, ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, 0, sizeof(header)); if (ret) - goto out; + return ret; /* - * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by - * vec_set_vector_length(), which will also validate them for us: + * Streaming SVE data is always stored and presented in SVE format. + * Require the user to provide SVE formatted data for consistency, and + * to avoid the risk that we configure the task into an invalid state. */ - ret = vec_set_vector_length(target, type, header.vl, - ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); - if (ret) - goto out; + fpsimd = (header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD; + if (fpsimd && type == ARM64_VEC_SME) + return -EINVAL; + + /* + * On systems without SVE we accept FPSIMD format writes with + * a VL of 0 to allow exiting streaming mode, otherwise a VL + * is required. + */ + if (header.vl) { + /* + * If the system does not support SVE we can't + * configure a SVE VL. + */ + if (!system_supports_sve() && type == ARM64_VEC_SVE) + return -EINVAL; + + /* + * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are + * consumed by vec_set_vector_length(), which will + * also validate them for us: + */ + ret = vec_set_vector_length(target, type, header.vl, + ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); + if (ret) + return ret; + } else { + /* If the system supports SVE we require a VL. */ + if (system_supports_sve()) + return -EINVAL; - /* Actual VL set may be less than the user asked for: */ + /* + * Only FPSIMD formatted data with no flags set is + * supported. + */ + if (header.flags != SVE_PT_REGS_FPSIMD) + return -EINVAL; + } + + /* Allocate SME storage if necessary, preserving any existing ZA/ZT state */ + if (type == ARM64_VEC_SME) { + sme_alloc(target, false); + if (!target->thread.sme_state) + return -ENOMEM; + } + + /* Allocate SVE storage if necessary, zeroing any existing SVE state */ + if (!fpsimd) { + sve_alloc(target, true); + if (!target->thread.sve_state) + return -ENOMEM; + } + + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC VL not the + * current VL: + */ vq = sve_vq_from_vl(task_get_vl(target, type)); /* Enter/exit streaming mode */ if (system_supports_sme()) { - u64 old_svcr = target->thread.svcr; - switch (type) { case ARM64_VEC_SVE: target->thread.svcr &= ~SVCR_SM_MASK; + set_tsk_thread_flag(target, TIF_SVE); break; case ARM64_VEC_SME: target->thread.svcr |= SVCR_SM_MASK; - - /* - * Disable traps and ensure there is SME storage but - * preserve any currently set values in ZA/ZT. - */ - sme_alloc(target, false); set_tsk_thread_flag(target, TIF_SME); break; default: WARN_ON_ONCE(1); - ret = -EINVAL; - goto out; + return -EINVAL; } - - /* - * If we switched then invalidate any existing SVE - * state and ensure there's storage. - */ - if (target->thread.svcr != old_svcr) - sve_alloc(target, true); } + /* Always zero V regs, FPSR, and FPCR */ + memset(¤t->thread.uw.fpsimd_state, 0, + sizeof(current->thread.uw.fpsimd_state)); + /* Registers: FPSIMD-only case */ BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); - if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) { - ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, - SVE_PT_FPSIMD_OFFSET); + if (fpsimd) { clear_tsk_thread_flag(target, TIF_SVE); target->thread.fp_type = FP_STATE_FPSIMD; - goto out; + ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, + SVE_PT_FPSIMD_OFFSET); + return ret; } - /* - * Otherwise: no registers or full SVE case. For backwards - * compatibility reasons we treat empty flags as SVE registers. - */ + /* Otherwise: no registers or full SVE case. */ + + target->thread.fp_type = FP_STATE_SVE; /* * If setting a different VL from the requested VL and there is * register data, the data layout will be wrong: don't even * try to set the registers in this case. */ - if (count && vq != sve_vq_from_vl(header.vl)) { - ret = -EIO; - goto out; - } - - sve_alloc(target, true); - if (!target->thread.sve_state) { - ret = -ENOMEM; - clear_tsk_thread_flag(target, TIF_SVE); - target->thread.fp_type = FP_STATE_FPSIMD; - goto out; - } - - /* - * Ensure target->thread.sve_state is up to date with target's - * FPSIMD regs, so that a short copyin leaves trailing - * registers unmodified. Only enable SVE if we are - * configuring normal SVE, a system with streaming SVE may not - * have normal SVE. - */ - fpsimd_sync_to_sve(target); - if (type == ARM64_VEC_SVE) - set_tsk_thread_flag(target, TIF_SVE); - target->thread.fp_type = FP_STATE_SVE; + if (count && vq != sve_vq_from_vl(header.vl)) + return -EIO; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; @@ -956,7 +1018,7 @@ static int sve_set_common(struct task_struct *target, target->thread.sve_state, start, end); if (ret) - goto out; + return ret; start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); @@ -972,8 +1034,6 @@ static int sve_set_common(struct task_struct *target, &target->thread.uw.fpsimd_state.fpsr, start, end); -out: - fpsimd_flush_task_state(target); return ret; } @@ -982,7 +1042,7 @@ static int sve_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) return -EINVAL; return sve_set_common(target, regset, pos, count, kbuf, ubuf, @@ -1095,7 +1155,11 @@ static int za_set(struct task_struct *target, if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC rather than + * current VL: + */ vq = sve_vq_from_vl(task_get_sme_vl(target)); /* Ensure there is some SVE storage for streaming mode */ @@ -1107,12 +1171,13 @@ static int za_set(struct task_struct *target, } } - /* Allocate/reinit ZA storage */ - sme_alloc(target, true); - if (!target->thread.sme_state) { - ret = -ENOMEM; - goto out; - } + /* + * Only flush the storage if PSTATE.ZA was not already set, + * otherwise preserve any existing data. + */ + sme_alloc(target, !thread_za_enabled(&target->thread)); + if (!target->thread.sme_state) + return -ENOMEM; /* If there is no data then disable ZA */ if (!count) { @@ -1387,7 +1452,7 @@ static int tagged_addr_ctrl_get(struct task_struct *target, { long ctrl = get_tagged_addr_ctrl(target); - if (IS_ERR_VALUE(ctrl)) + if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) return ctrl; return membuf_write(&to, &ctrl, sizeof(ctrl)); @@ -1401,6 +1466,10 @@ static int tagged_addr_ctrl_set(struct task_struct *target, const struct int ret; long ctrl; + ctrl = get_tagged_addr_ctrl(target); + if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) + return ctrl; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); if (ret) return ret; @@ -1409,6 +1478,101 @@ static int tagged_addr_ctrl_set(struct task_struct *target, const struct } #endif +#ifdef CONFIG_ARM64_POE +static int poe_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_poe()) + return -EINVAL; + + return membuf_write(&to, &target->thread.por_el0, + sizeof(target->thread.por_el0)); +} + +static int poe_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + if (!system_supports_poe()) + return -EINVAL; + + ctrl = target->thread.por_el0; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + target->thread.por_el0 = ctrl; + + return 0; +} +#endif + +#ifdef CONFIG_ARM64_GCS +static void task_gcs_to_user(struct user_gcs *user_gcs, + const struct task_struct *target) +{ + user_gcs->features_enabled = target->thread.gcs_el0_mode; + user_gcs->features_locked = target->thread.gcs_el0_locked; + user_gcs->gcspr_el0 = target->thread.gcspr_el0; +} + +static void task_gcs_from_user(struct task_struct *target, + const struct user_gcs *user_gcs) +{ + target->thread.gcs_el0_mode = user_gcs->features_enabled; + target->thread.gcs_el0_locked = user_gcs->features_locked; + target->thread.gcspr_el0 = user_gcs->gcspr_el0; +} + +static int gcs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + if (target == current) + gcs_preserve_current_state(); + + task_gcs_to_user(&user_gcs, target); + + return membuf_write(&to, &user_gcs, sizeof(user_gcs)); +} + +static int gcs_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + task_gcs_to_user(&user_gcs, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1); + if (ret) + return ret; + + if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + task_gcs_from_user(target, &user_gcs); + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1417,6 +1581,7 @@ enum aarch64_regset { REGSET_HW_BREAK, REGSET_HW_WATCH, #endif + REGSET_FPMR, REGSET_SYSTEM_CALL, #ifdef CONFIG_ARM64_SVE REGSET_SVE, @@ -1437,11 +1602,17 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI REGSET_TAGGED_ADDR_CTRL, #endif +#ifdef CONFIG_ARM64_POE + REGSET_POE, +#endif +#ifdef CONFIG_ARM64_GCS + REGSET_GCS, +#endif }; static const struct user_regset aarch64_regsets[] = { [REGSET_GPR] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(struct user_pt_regs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), @@ -1449,7 +1620,7 @@ static const struct user_regset aarch64_regsets[] = { .set = gpr_set }, [REGSET_FPR] = { - .core_note_type = NT_PRFPREG, + USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(struct user_fpsimd_state) / sizeof(u32), /* * We pretend we have 32-bit registers because the fpsr and @@ -1462,7 +1633,7 @@ static const struct user_regset aarch64_regsets[] = { .set = fpr_set }, [REGSET_TLS] = { - .core_note_type = NT_ARM_TLS, + USER_REGSET_NOTE_TYPE(ARM_TLS), .n = 2, .size = sizeof(void *), .align = sizeof(void *), @@ -1471,7 +1642,7 @@ static const struct user_regset aarch64_regsets[] = { }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { - .core_note_type = NT_ARM_HW_BREAK, + USER_REGSET_NOTE_TYPE(ARM_HW_BREAK), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1479,7 +1650,7 @@ static const struct user_regset aarch64_regsets[] = { .set = hw_break_set, }, [REGSET_HW_WATCH] = { - .core_note_type = NT_ARM_HW_WATCH, + USER_REGSET_NOTE_TYPE(ARM_HW_WATCH), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1488,17 +1659,26 @@ static const struct user_regset aarch64_regsets[] = { }, #endif [REGSET_SYSTEM_CALL] = { - .core_note_type = NT_ARM_SYSTEM_CALL, + USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL), .n = 1, .size = sizeof(int), .align = sizeof(int), .regset_get = system_call_get, .set = system_call_set, }, + [REGSET_FPMR] = { + USER_REGSET_NOTE_TYPE(ARM_FPMR), + .n = 1, + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = fpmr_get, + .set = fpmr_set, + }, #ifdef CONFIG_ARM64_SVE [REGSET_SVE] = { /* Scalable Vector Extension */ - .core_note_type = NT_ARM_SVE, - .n = DIV_ROUND_UP(SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE), + USER_REGSET_NOTE_TYPE(ARM_SVE), + .n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX, + SVE_PT_REGS_SVE), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, .align = SVE_VQ_BYTES, @@ -1508,7 +1688,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_SME [REGSET_SSVE] = { /* Streaming mode SVE */ - .core_note_type = NT_ARM_SSVE, + USER_REGSET_NOTE_TYPE(ARM_SSVE), .n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, @@ -1517,7 +1697,7 @@ static const struct user_regset aarch64_regsets[] = { .set = ssve_set, }, [REGSET_ZA] = { /* SME ZA */ - .core_note_type = NT_ARM_ZA, + USER_REGSET_NOTE_TYPE(ARM_ZA), /* * ZA is a single register but it's variably sized and * the ptrace core requires that the size of any data @@ -1533,7 +1713,7 @@ static const struct user_regset aarch64_regsets[] = { .set = za_set, }, [REGSET_ZT] = { /* SME ZT */ - .core_note_type = NT_ARM_ZT, + USER_REGSET_NOTE_TYPE(ARM_ZT), .n = 1, .size = ZT_SIG_REG_BYTES, .align = sizeof(u64), @@ -1543,7 +1723,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_PTR_AUTH [REGSET_PAC_MASK] = { - .core_note_type = NT_ARM_PAC_MASK, + USER_REGSET_NOTE_TYPE(ARM_PAC_MASK), .n = sizeof(struct user_pac_mask) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), @@ -1551,7 +1731,7 @@ static const struct user_regset aarch64_regsets[] = { /* this cannot be set dynamically */ }, [REGSET_PAC_ENABLED_KEYS] = { - .core_note_type = NT_ARM_PAC_ENABLED_KEYS, + USER_REGSET_NOTE_TYPE(ARM_PAC_ENABLED_KEYS), .n = 1, .size = sizeof(long), .align = sizeof(long), @@ -1560,7 +1740,7 @@ static const struct user_regset aarch64_regsets[] = { }, #ifdef CONFIG_CHECKPOINT_RESTORE [REGSET_PACA_KEYS] = { - .core_note_type = NT_ARM_PACA_KEYS, + USER_REGSET_NOTE_TYPE(ARM_PACA_KEYS), .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), @@ -1568,7 +1748,7 @@ static const struct user_regset aarch64_regsets[] = { .set = pac_address_keys_set, }, [REGSET_PACG_KEYS] = { - .core_note_type = NT_ARM_PACG_KEYS, + USER_REGSET_NOTE_TYPE(ARM_PACG_KEYS), .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), @@ -1579,7 +1759,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI [REGSET_TAGGED_ADDR_CTRL] = { - .core_note_type = NT_ARM_TAGGED_ADDR_CTRL, + USER_REGSET_NOTE_TYPE(ARM_TAGGED_ADDR_CTRL), .n = 1, .size = sizeof(long), .align = sizeof(long), @@ -1587,6 +1767,26 @@ static const struct user_regset aarch64_regsets[] = { .set = tagged_addr_ctrl_set, }, #endif +#ifdef CONFIG_ARM64_POE + [REGSET_POE] = { + USER_REGSET_NOTE_TYPE(ARM_POE), + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = poe_get, + .set = poe_set, + }, +#endif +#ifdef CONFIG_ARM64_GCS + [REGSET_GCS] = { + USER_REGSET_NOTE_TYPE(ARM_GCS), + .n = sizeof(struct user_gcs) / sizeof(u64), + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = gcs_get, + .set = gcs_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { @@ -1594,7 +1794,6 @@ static const struct user_regset_view user_aarch64_view = { .regsets = aarch64_regsets, .n = ARRAY_SIZE(aarch64_regsets) }; -#ifdef CONFIG_COMPAT enum compat_regset { REGSET_COMPAT_GPR, REGSET_COMPAT_VFP, @@ -1770,7 +1969,7 @@ static int compat_tls_set(struct task_struct *target, static const struct user_regset aarch32_regsets[] = { [REGSET_COMPAT_GPR] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), @@ -1778,7 +1977,7 @@ static const struct user_regset aarch32_regsets[] = { .set = compat_gpr_set }, [REGSET_COMPAT_VFP] = { - .core_note_type = NT_ARM_VFP, + USER_REGSET_NOTE_TYPE(ARM_VFP), .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), @@ -1795,7 +1994,7 @@ static const struct user_regset_view user_aarch32_view = { static const struct user_regset aarch32_ptrace_regsets[] = { [REGSET_GPR] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), @@ -1803,7 +2002,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .set = compat_gpr_set }, [REGSET_FPR] = { - .core_note_type = NT_ARM_VFP, + USER_REGSET_NOTE_TYPE(ARM_VFP), .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), @@ -1811,7 +2010,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .set = compat_vfp_set }, [REGSET_TLS] = { - .core_note_type = NT_ARM_TLS, + USER_REGSET_NOTE_TYPE(ARM_TLS), .n = 1, .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), @@ -1820,7 +2019,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { - .core_note_type = NT_ARM_HW_BREAK, + USER_REGSET_NOTE_TYPE(ARM_HW_BREAK), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1828,7 +2027,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .set = hw_break_set, }, [REGSET_HW_WATCH] = { - .core_note_type = NT_ARM_HW_WATCH, + USER_REGSET_NOTE_TYPE(ARM_HW_WATCH), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1837,7 +2036,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { }, #endif [REGSET_SYSTEM_CALL] = { - .core_note_type = NT_ARM_SYSTEM_CALL, + USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL), .n = 1, .size = sizeof(int), .align = sizeof(int), @@ -1851,6 +2050,7 @@ static const struct user_regset_view user_aarch32_ptrace_view = { .regsets = aarch32_ptrace_regsets, .n = ARRAY_SIZE(aarch32_ptrace_regsets) }; +#ifdef CONFIG_COMPAT static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t __user *ret) { @@ -2112,7 +2312,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, const struct user_regset_view *task_user_regset_view(struct task_struct *task) { -#ifdef CONFIG_COMPAT /* * Core dumping of 32-bit tasks or compat ptrace requests must use the * user_aarch32_view compatible with arm32. Native ptrace requests on @@ -2123,7 +2322,7 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) return &user_aarch32_view; else if (is_compat_thread(task_thread_info(task))) return &user_aarch32_ptrace_view; -#endif + return &user_aarch64_view; } diff --git a/arch/arm64/kernel/reloc_test_core.c b/arch/arm64/kernel/reloc_test_core.c index 99f2ffe9fc05..5b0891146054 100644 --- a/arch/arm64/kernel/reloc_test_core.c +++ b/arch/arm64/kernel/reloc_test_core.c @@ -74,4 +74,5 @@ static void __exit reloc_test_exit(void) module_init(reloc_test_init); module_exit(reloc_test_exit); +MODULE_DESCRIPTION("Relocation testing module"); MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c new file mode 100644 index 000000000000..c64a06f58c0b --- /dev/null +++ b/arch/arm64/kernel/rsi.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include <linux/jump_label.h> +#include <linux/memblock.h> +#include <linux/psci.h> +#include <linux/swiotlb.h> +#include <linux/cc_platform.h> +#include <linux/platform_device.h> + +#include <asm/io.h> +#include <asm/mem_encrypt.h> +#include <asm/rsi.h> + +static struct realm_config config; + +unsigned long prot_ns_shared; +EXPORT_SYMBOL(prot_ns_shared); + +DEFINE_STATIC_KEY_FALSE_RO(rsi_present); +EXPORT_SYMBOL(rsi_present); + +bool cc_platform_has(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return is_realm_world(); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(cc_platform_has); + +static bool rsi_version_matches(void) +{ + unsigned long ver_lower, ver_higher; + unsigned long ret = rsi_request_version(RSI_ABI_VERSION, + &ver_lower, + &ver_higher); + + if (ret == SMCCC_RET_NOT_SUPPORTED) + return false; + + if (ret != RSI_SUCCESS) { + pr_err("RME: RMM doesn't support RSI version %lu.%lu. Supported range: %lu.%lu-%lu.%lu\n", + RSI_ABI_VERSION_MAJOR, RSI_ABI_VERSION_MINOR, + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower), + RSI_ABI_VERSION_GET_MAJOR(ver_higher), + RSI_ABI_VERSION_GET_MINOR(ver_higher)); + return false; + } + + pr_info("RME: Using RSI version %lu.%lu\n", + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower)); + + return true; +} + +static void __init arm64_rsi_setup_memory(void) +{ + u64 i; + phys_addr_t start, end; + + /* + * Iterate over the available memory ranges and convert the state to + * protected memory. We should take extra care to ensure that we DO NOT + * permit any "DESTROYED" pages to be converted to "RAM". + * + * panic() is used because if the attempt to switch the memory to + * protected has failed here, then future accesses to the memory are + * simply going to be reflected as a SEA (Synchronous External Abort) + * which we can't handle. Bailing out early prevents the guest limping + * on and dying later. + */ + for_each_mem_range(i, &start, &end) { + if (rsi_set_memory_range_protected_safe(start, end)) { + panic("Failed to set memory range to protected: %pa-%pa", + &start, &end); + } + } +} + +/* + * Check if a given PA range is Trusted (e.g., Protected memory, a Trusted Device + * mapping, or an MMIO emulated in the Realm world). + * + * We can rely on the RIPAS value of the region to detect if a given region is + * protected. + * + * RIPAS_DEV - A trusted device memory or a trusted emulated MMIO (in the Realm + * world + * RIPAS_RAM - Memory (RAM), protected by the RMM guarantees. (e.g., Firmware + * reserved regions for data sharing). + * + * RIPAS_DESTROYED is a special case of one of the above, where the host did + * something without our permission and as such we can't do anything about it. + * + * The only case where something is emulated by the untrusted hypervisor or is + * backed by shared memory is indicated by RSI_RIPAS_EMPTY. + */ +bool arm64_rsi_is_protected(phys_addr_t base, size_t size) +{ + enum ripas ripas; + phys_addr_t end, top; + + /* Overflow ? */ + if (WARN_ON(base + size <= base)) + return false; + + end = ALIGN(base + size, RSI_GRANULE_SIZE); + base = ALIGN_DOWN(base, RSI_GRANULE_SIZE); + + while (base < end) { + if (WARN_ON(rsi_ipa_state_get(base, end, &ripas, &top))) + break; + if (WARN_ON(top <= base)) + break; + if (ripas == RSI_RIPAS_EMPTY) + break; + base = top; + } + + return base >= end; +} +EXPORT_SYMBOL(arm64_rsi_is_protected); + +static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) +{ + if (arm64_rsi_is_protected(phys, size)) + *prot = pgprot_encrypted(*prot); + else + *prot = pgprot_decrypted(*prot); + + return 0; +} + +void __init arm64_rsi_init(void) +{ + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) + return; + if (!rsi_version_matches()) + return; + if (WARN_ON(rsi_get_realm_config(&config))) + return; + prot_ns_shared = BIT(config.ipa_bits - 1); + + if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) + return; + + if (realm_register_memory_enc_ops()) + return; + + arm64_rsi_setup_memory(); + + static_branch_enable(&rsi_present); +} + +static struct platform_device rsi_dev = { + .name = RSI_PDEV_NAME, + .id = PLATFORM_DEVID_NONE +}; + +static int __init arm64_create_dummy_rsi_dev(void) +{ + if (is_realm_world() && + platform_device_register(&rsi_dev)) + pr_err("failed to register rsi platform device\n"); + return 0; +} + +arch_initcall(arm64_create_dummy_rsi_dev) diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 255d12f881c2..778f2a1faac8 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -34,10 +34,8 @@ unsigned long sdei_exit_mode; DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); -#ifdef CONFIG_VMAP_STACK DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); -#endif DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); @@ -65,9 +63,6 @@ static void free_sdei_stacks(void) { int cpu; - if (!IS_ENABLED(CONFIG_VMAP_STACK)) - return; - for_each_possible_cpu(cpu) { _free_sdei_stack(&sdei_stack_normal_ptr, cpu); _free_sdei_stack(&sdei_stack_critical_ptr, cpu); @@ -91,9 +86,6 @@ static int init_sdei_stacks(void) int cpu; int err = 0; - if (!IS_ENABLED(CONFIG_VMAP_STACK)) - return 0; - for_each_possible_cpu(cpu) { err = _init_sdei_stack(&sdei_stack_normal_ptr, cpu); if (err) @@ -206,7 +198,7 @@ out_err: /* * do_sdei_event() returns one of: * SDEI_EV_HANDLED - success, return to the interrupted context. - * SDEI_EV_FAILED - failure, return this error code to firmare. + * SDEI_EV_FAILED - failure, return this error code to firmware. * virtual-address - success, return to this address. */ unsigned long __kprobes do_sdei_event(struct pt_regs *regs, @@ -247,7 +239,7 @@ unsigned long __kprobes do_sdei_event(struct pt_regs *regs, * If we interrupted the kernel with interrupts masked, we always go * back to wherever we came from. */ - if (mode == kernel_mode && !interrupts_enabled(regs)) + if (mode == kernel_mode && regs_irqs_disabled(regs)) return SDEI_EV_HANDLED; /* diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 417a8a86b2db..23c05dc7a8f2 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -43,6 +43,7 @@ #include <asm/cpu_ops.h> #include <asm/kasan.h> #include <asm/numa.h> +#include <asm/rsi.h> #include <asm/scs.h> #include <asm/sections.h> #include <asm/setup.h> @@ -166,36 +167,25 @@ static void __init smp_build_mpidr_hash(void) pr_warn("Large number of MPIDR hash buckets detected\n"); } -static void *early_fdt_ptr __initdata; - -void __init *get_early_fdt_ptr(void) -{ - return early_fdt_ptr; -} - -asmlinkage void __init early_fdt_map(u64 dt_phys) -{ - int fdt_size; - - early_fixmap_init(); - early_fdt_ptr = fixmap_remap_fdt(dt_phys, &fdt_size, PAGE_KERNEL); -} - static void __init setup_machine_fdt(phys_addr_t dt_phys) { - int size; + int size = 0; void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); const char *name; if (dt_virt) memblock_reserve(dt_phys, size); - if (!dt_virt || !early_init_dt_scan(dt_virt)) { + /* + * dt_virt is a fixmap address, hence __pa(dt_virt) can't be used. + * Pass dt_phys directly. + */ + if (!early_init_dt_scan(dt_virt, dt_phys)) { pr_crit("\n" - "Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n" - "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n" - "\nPlease check your bootloader.", - &dt_phys, dt_virt); + "Error: invalid device tree blob: PA=%pa, VA=%px, size=%d bytes\n" + "The dtb must be 8-byte aligned and must not exceed 2 MB in size.\n" + "\nPlease check your bootloader.\n", + &dt_phys, dt_virt, size); /* * Note that in this _really_ early stage we cannot even BUG() @@ -224,7 +214,7 @@ static void __init request_standard_resources(void) unsigned long i = 0; size_t res_size; - kernel_code.start = __pa_symbol(_stext); + kernel_code.start = __pa_symbol(_text); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); @@ -233,9 +223,7 @@ static void __init request_standard_resources(void) num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); - standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); - if (!standard_resources) - panic("%s: Failed to allocate %zu bytes\n", __func__, res_size); + standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; @@ -292,19 +280,12 @@ u64 cpu_logical_map(unsigned int cpu) void __init __no_sanitize_address setup_arch(char **cmdline_p) { - setup_initial_init_mm(_stext, _etext, _edata, _end); + setup_initial_init_mm(_text, _etext, _edata, _end); *cmdline_p = boot_command_line; kaslr_init(); - /* - * If know now we are going to need KPTI then use non-global - * mappings from the start, avoiding the cost of rewriting - * everything later. - */ - arm64_use_ng_mappings = kaslr_requires_kpti(); - early_fixmap_init(); early_ioremap_init(); @@ -320,9 +301,15 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) dynamic_scs_init(); /* - * Unmask asynchronous aborts and fiq after bringing up possible - * earlycon. (Report possible System Errors once we can report this - * occurred). + * The primary CPU enters the kernel with all DAIF exceptions masked. + * + * We must unmask Debug and SError before preemption or scheduling is + * possible to ensure that these are consistently unmasked across + * threads, and we want to unmask SError as soon as possible after + * initializing earlycon so that we can report any SErrors immediately. + * + * IRQ and FIQ will be unmasked after the root irqchip has been + * detected and initialized. */ local_daif_restore(DAIF_PROCCTX_NOIRQ); @@ -367,13 +354,12 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) else psci_acpi_init(); + arm64_rsi_init(); + init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); - /* Init percpu seeds for random tags after cpus are set up. */ - kasan_init_sw_tags(); - #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Make sure init_thread_info.ttbr0 always generates translation @@ -402,19 +388,10 @@ static inline bool cpu_can_disable(unsigned int cpu) return false; } -static int __init topology_init(void) +bool arch_cpu_is_hotpluggable(int num) { - int i; - - for_each_possible_cpu(i) { - struct cpu *cpu = &per_cpu(cpu_data.cpu, i); - cpu->hotpluggable = cpu_can_disable(i); - register_cpu(cpu, i); - } - - return 0; + return cpu_can_disable(num); } -subsys_initcall(topology_init); static void dump_kernel_offset(void) { diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index c7ebe744c64e..1110eeb21f57 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -9,6 +9,7 @@ #include <linux/cache.h> #include <linux/compat.h> #include <linux/errno.h> +#include <linux/irq-entry-common.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/freezer.h> @@ -16,15 +17,17 @@ #include <linux/uaccess.h> #include <linux/sizes.h> #include <linux/string.h> -#include <linux/resume_user_mode.h> #include <linux/ratelimit.h> +#include <linux/rseq.h> #include <linux/syscalls.h> +#include <linux/pkeys.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/elf.h> #include <asm/exception.h> #include <asm/cacheflush.h> +#include <asm/gcs.h> #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> @@ -34,6 +37,8 @@ #include <asm/traps.h> #include <asm/vdso.h> +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. */ @@ -42,11 +47,6 @@ struct rt_sigframe { struct ucontext uc; }; -struct frame_record { - u64 fp; - u64 lr; -}; - struct rt_sigframe_user_layout { struct rt_sigframe __user *sigframe; struct frame_record __user *next_frame; @@ -56,18 +56,76 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; + unsigned long gcs_offset; unsigned long sve_offset; unsigned long tpidr2_offset; unsigned long za_offset; unsigned long zt_offset; + unsigned long fpmr_offset; + unsigned long poe_offset; unsigned long extra_offset; unsigned long end_offset; }; -#define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16) +/* + * Holds any EL0-controlled state that influences unprivileged memory accesses. + * This includes both accesses done in userspace and uaccess done in the kernel. + * + * This state needs to be carefully managed to ensure that it doesn't cause + * uaccess to fail when setting up the signal frame, and the signal handler + * itself also expects a well-defined state when entered. + */ +struct user_access_state { + u64 por_el0; +}; + #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) +/* + * Save the user access state into ua_state and reset it to disable any + * restrictions. + */ +static void save_reset_user_access_state(struct user_access_state *ua_state) +{ + if (system_supports_poe()) { + u64 por_enable_all = 0; + + for (int pkey = 0; pkey < arch_max_pkey(); pkey++) + por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RWX); + + ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); + write_sysreg_s(por_enable_all, SYS_POR_EL0); + /* + * No ISB required as we can tolerate spurious Overlay faults - + * the fault handler will check again based on the new value + * of POR_EL0. + */ + } +} + +/* + * Set the user access state for invoking the signal handler. + * + * No uaccess should be done after that function is called. + */ +static void set_handler_user_access_state(void) +{ + if (system_supports_poe()) + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + +/* + * Restore the user access state to the values saved in ua_state. + * + * No uaccess should be done after that function is called. + */ +static void restore_user_access_state(const struct user_access_state *ua_state) +{ + if (system_supports_poe()) + write_sysreg_s(ua_state->por_el0, SYS_POR_EL0); +} + static void init_user_layout(struct rt_sigframe_user_layout *user) { const size_t reserved_size = @@ -182,6 +240,12 @@ struct user_ctxs { u32 za_size; struct zt_context __user *zt; u32 zt_size; + struct fpmr_context __user *fpmr; + u32 fpmr_size; + struct poe_context __user *poe; + u32 poe_size; + struct gcs_context __user *gcs; + u32 gcs_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -190,6 +254,8 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) ¤t->thread.uw.fpsimd_state; int err; + fpsimd_sync_from_effective_state(current); + /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); __put_user_error(fpsimd->fpsr, &ctx->fpsr, err); @@ -202,31 +268,95 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) return err ? -EFAULT : 0; } -static int restore_fpsimd_context(struct user_ctxs *user) +static int read_fpsimd_context(struct user_fpsimd_state *fpsimd, + struct user_ctxs *user) { - struct user_fpsimd_state fpsimd; - int err = 0; + int err; /* check the size information */ if (user->fpsimd_size != sizeof(struct fpsimd_context)) return -EINVAL; /* copy the FP and status/control registers */ - err = __copy_from_user(fpsimd.vregs, &(user->fpsimd->vregs), - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &(user->fpsimd->fpsr), err); - __get_user_error(fpsimd.fpcr, &(user->fpsimd->fpcr), err); + err = __copy_from_user(fpsimd->vregs, &(user->fpsimd->vregs), + sizeof(fpsimd->vregs)); + __get_user_error(fpsimd->fpsr, &(user->fpsimd->fpsr), err); + __get_user_error(fpsimd->fpcr, &(user->fpsimd->fpcr), err); + + return err ? -EFAULT : 0; +} + +static int restore_fpsimd_context(struct user_ctxs *user) +{ + struct user_fpsimd_state fpsimd; + int err; + + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; clear_thread_flag(TIF_SVE); + current->thread.svcr &= ~SVCR_SM_MASK; current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ + fpsimd_update_current_state(&fpsimd); + return 0; +} + +static int preserve_fpmr_context(struct fpmr_context __user *ctx) +{ + int err = 0; + + __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); + + return err; +} + +static int restore_fpmr_context(struct user_ctxs *user) +{ + u64 fpmr; + int err = 0; + + if (user->fpmr_size != sizeof(*user->fpmr)) + return -EINVAL; + + __get_user_error(fpmr, &user->fpmr->fpmr, err); if (!err) - fpsimd_update_current_state(&fpsimd); + current->thread.uw.fpmr = fpmr; - return err ? -EFAULT : 0; + return err; +} + +static int preserve_poe_context(struct poe_context __user *ctx, + const struct user_access_state *ua_state) +{ + int err = 0; + + __put_user_error(POE_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(ua_state->por_el0, &ctx->por_el0, err); + + return err; } +static int restore_poe_context(struct user_ctxs *user, + struct user_access_state *ua_state) +{ + u64 por_el0; + int err = 0; + + if (user->poe_size != sizeof(*user->poe)) + return -EINVAL; + + __get_user_error(por_el0, &(user->poe->por_el0), err); + if (!err) + ua_state->por_el0 = por_el0; + + return err; +} #ifdef CONFIG_ARM64_SVE @@ -242,7 +372,7 @@ static int preserve_sve_context(struct sve_context __user *ctx) vl = task_get_sme_vl(current); vq = sve_vq_from_vl(vl); flags |= SVE_SIG_FLAG_SM; - } else if (test_thread_flag(TIF_SVE)) { + } else if (current->thread.fp_type == FP_STATE_SVE) { vq = sve_vq_from_vl(vl); } @@ -257,11 +387,6 @@ static int preserve_sve_context(struct sve_context __user *ctx) err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the SVE state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, SVE_SIG_REGS_SIZE(vq)); @@ -276,6 +401,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) unsigned int vl, vq; struct user_fpsimd_state fpsimd; u16 user_vl, flags; + bool sm; if (user->sve_size < sizeof(*user->sve)) return -EINVAL; @@ -285,7 +411,8 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (err) return err; - if (flags & SVE_SIG_FLAG_SM) { + sm = flags & SVE_SIG_FLAG_SM; + if (sm) { if (!system_supports_sme()) return -EINVAL; @@ -305,28 +432,23 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (user_vl != vl) return -EINVAL; - if (user->sve_size == sizeof(*user->sve)) { - clear_thread_flag(TIF_SVE); - current->thread.svcr &= ~SVCR_SM_MASK; - current->thread.fp_type = FP_STATE_FPSIMD; - goto fpsimd_only; - } + /* + * Non-streaming SVE state may be preserved without an SVE payload, in + * which case the SVE context only has a header with VL==0, and all + * state can be restored from the FPSIMD context. + * + * Streaming SVE state is always preserved with an SVE payload. For + * consistency and robustness, reject restoring streaming SVE state + * without an SVE payload. + */ + if (!sm && user->sve_size == sizeof(*user->sve)) + return restore_fpsimd_context(user); vq = sve_vq_from_vl(vl); if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq)) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.sve_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ - sve_alloc(current, true); if (!current->thread.sve_state) { clear_thread_flag(TIF_SVE); @@ -346,19 +468,14 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) set_thread_flag(TIF_SVE); current->thread.fp_type = FP_STATE_SVE; -fpsimd_only: - /* copy the FP and status/control registers */ - /* restore_sigframe() already checked that user->fpsimd != NULL. */ - err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs, - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err); - __get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err); + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; - /* load the hardware registers from the fpsimd_state structure */ - if (!err) - fpsimd_update_current_state(&fpsimd); + /* Merge the FPSIMD registers into the SVE state */ + fpsimd_update_current_state(&fpsimd); - return err ? -EFAULT : 0; + return 0; } #else /* ! CONFIG_ARM64_SVE */ @@ -378,13 +495,12 @@ extern int preserve_sve_context(void __user *ctx); static int preserve_tpidr2_context(struct tpidr2_context __user *ctx) { + u64 tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); int err = 0; - current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); - __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); - __put_user_error(current->thread.tpidr2_el0, &ctx->tpidr2, err); + __put_user_error(tpidr2_el0, &ctx->tpidr2, err); return err; } @@ -426,11 +542,6 @@ static int preserve_za_context(struct za_context __user *ctx) err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the ZA state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, current->thread.sme_state, ZA_SIG_REGS_SIZE(vq)); @@ -465,16 +576,6 @@ static int restore_za_context(struct user_ctxs *user) if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq)) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.sme_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ - sme_alloc(current, true); if (!current->thread.sme_state) { current->thread.svcr &= ~SVCR_ZA_MASK; @@ -512,11 +613,6 @@ static int preserve_zt_context(struct zt_context __user *ctx) BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); - /* - * This assumes that the ZT state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET, thread_zt_state(¤t->thread), ZT_SIG_REGS_SIZE(1)); @@ -542,16 +638,6 @@ static int restore_zt_context(struct user_ctxs *user) if (nregs != 1) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.zt_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch ZT in thread state */ - err = __copy_from_user(thread_zt_state(¤t->thread), (char __user const *)user->zt + ZT_SIG_REGS_OFFSET, @@ -574,6 +660,82 @@ extern int restore_zt_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SME */ +#ifdef CONFIG_ARM64_GCS + +static int preserve_gcs_context(struct gcs_context __user *ctx) +{ + int err = 0; + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * If GCS is enabled we will add a cap token to the frame, + * include it in the GCSPR_EL0 we report to support stack + * switching via sigreturn if GCS is enabled. We do not allow + * enabling via sigreturn so the token is only relevant for + * threads with GCS enabled. + */ + if (task_gcs_el0_enabled(current)) + gcspr -= 8; + + __put_user_error(GCS_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(gcspr, &ctx->gcspr, err); + __put_user_error(0, &ctx->reserved, err); + __put_user_error(current->thread.gcs_el0_mode, + &ctx->features_enabled, err); + + return err; +} + +static int restore_gcs_context(struct user_ctxs *user) +{ + u64 gcspr, enabled; + int err = 0; + + if (user->gcs_size != sizeof(*user->gcs)) + return -EINVAL; + + __get_user_error(gcspr, &user->gcs->gcspr, err); + __get_user_error(enabled, &user->gcs->features_enabled, err); + if (err) + return err; + + /* Don't allow unknown modes */ + if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + err = gcs_check_locked(current, enabled); + if (err != 0) + return err; + + /* Don't allow enabling */ + if (!task_gcs_el0_enabled(current) && + (enabled & PR_SHADOW_STACK_ENABLE)) + return -EINVAL; + + /* If we are disabling disable everything */ + if (!(enabled & PR_SHADOW_STACK_ENABLE)) + enabled = 0; + + current->thread.gcs_el0_mode = enabled; + + /* + * We let userspace set GCSPR_EL0 to anything here, we will + * validate later in gcs_restore_signal(). + */ + write_sysreg_s(gcspr, SYS_GCSPR_EL0); + + return 0; +} + +#else /* ! CONFIG_ARM64_GCS */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_gcs_context(void __user *ctx); +extern int restore_gcs_context(struct user_ctxs *user); + +#endif /* ! CONFIG_ARM64_GCS */ + static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { @@ -590,6 +752,9 @@ static int parse_user_sigframe(struct user_ctxs *user, user->tpidr2 = NULL; user->za = NULL; user->zt = NULL; + user->fpmr = NULL; + user->poe = NULL; + user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -640,6 +805,17 @@ static int parse_user_sigframe(struct user_ctxs *user, /* ignore */ break; + case POE_MAGIC: + if (!system_supports_poe()) + goto invalid; + + if (user->poe) + goto invalid; + + user->poe = (struct poe_context __user *)head; + user->poe_size = size; + break; + case SVE_MAGIC: if (!system_supports_sve() && !system_supports_sme()) goto invalid; @@ -684,6 +860,28 @@ static int parse_user_sigframe(struct user_ctxs *user, user->zt_size = size; break; + case FPMR_MAGIC: + if (!system_supports_fpmr()) + goto invalid; + + if (user->fpmr) + goto invalid; + + user->fpmr = (struct fpmr_context __user *)head; + user->fpmr_size = size; + break; + + case GCS_MAGIC: + if (!system_supports_gcs()) + goto invalid; + + if (user->gcs) + goto invalid; + + user->gcs = (struct gcs_context __user *)head; + user->gcs_size = size; + break; + case EXTRA_MAGIC: if (have_extra_context) goto invalid; @@ -767,7 +965,8 @@ invalid: } static int restore_sigframe(struct pt_regs *regs, - struct rt_sigframe __user *sf) + struct rt_sigframe __user *sf, + struct user_access_state *ua_state) { sigset_t set; int i, err; @@ -789,6 +988,8 @@ static int restore_sigframe(struct pt_regs *regs, */ forget_syscall(regs); + fpsimd_save_and_flush_current_state(); + err |= !valid_user_regs(®s->user_regs, current); if (err == 0) err = parse_user_sigframe(&user, sf); @@ -803,22 +1004,84 @@ static int restore_sigframe(struct pt_regs *regs, err = restore_fpsimd_context(&user); } + if (err == 0 && system_supports_gcs() && user.gcs) + err = restore_gcs_context(&user); + if (err == 0 && system_supports_tpidr2() && user.tpidr2) err = restore_tpidr2_context(&user); + if (err == 0 && system_supports_fpmr() && user.fpmr) + err = restore_fpmr_context(&user); + if (err == 0 && system_supports_sme() && user.za) err = restore_za_context(&user); if (err == 0 && system_supports_sme2() && user.zt) err = restore_zt_context(&user); + if (err == 0 && system_supports_poe() && user.poe) + err = restore_poe_context(&user, ua_state); + return err; } +#ifdef CONFIG_ARM64_GCS +static int gcs_restore_signal(void) +{ + u64 gcspr_el0, cap; + int ret; + + if (!system_supports_gcs()) + return 0; + + if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) + return 0; + + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Ensure that any changes to the GCS done via GCS operations + * are visible to the normal reads we do to validate the + * token. + */ + gcsb_dsync(); + + /* + * GCSPR_EL0 should be pointing at a capped GCS, read the cap. + * We don't enforce that this is in a GCS page, if it is not + * then faults will be generated on GCS operations - the main + * concern is to protect GCS pages. + */ + ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0, + sizeof(cap)); + if (ret) + return -EFAULT; + + /* + * Check that the cap is the actual GCS before replacing it. + */ + if (cap != GCS_SIGNAL_CAP(gcspr_el0)) + return -EINVAL; + + /* Invalidate the token to prevent reuse */ + put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret); + if (ret != 0) + return -EFAULT; + + write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0); + + return 0; +} + +#else +static int gcs_restore_signal(void) { return 0; } +#endif + SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; + struct user_access_state ua_state; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; @@ -835,12 +1098,17 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(frame, sizeof (*frame))) goto badframe; - if (restore_sigframe(regs, frame)) + if (restore_sigframe(regs, frame, &ua_state)) + goto badframe; + + if (gcs_restore_signal()) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; + restore_user_access_state(&ua_state); + return regs->regs[0]; badframe: @@ -875,10 +1143,19 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } +#ifdef CONFIG_ARM64_GCS + if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) { + err = sigframe_alloc(user, &user->gcs_offset, + sizeof(struct gcs_context)); + if (err) + return err; + } +#endif + if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; - if (add_all || test_thread_flag(TIF_SVE) || + if (add_all || current->thread.fp_type == FP_STATE_SVE || thread_sm_enabled(¤t->thread)) { int vl = max(sve_max_vl(), sme_max_vl()); @@ -928,11 +1205,26 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, } } + if (system_supports_fpmr()) { + err = sigframe_alloc(user, &user->fpmr_offset, + sizeof(struct fpmr_context)); + if (err) + return err; + } + + if (system_supports_poe()) { + err = sigframe_alloc(user, &user->poe_offset, + sizeof(struct poe_context)); + if (err) + return err; + } + return sigframe_alloc_end(user); } static int setup_sigframe(struct rt_sigframe_user_layout *user, - struct pt_regs *regs, sigset_t *set) + struct pt_regs *regs, sigset_t *set, + const struct user_access_state *ua_state) { int i, err = 0; struct rt_sigframe __user *sf = user->sigframe; @@ -968,6 +1260,12 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } + if (system_supports_gcs() && err == 0 && user->gcs_offset) { + struct gcs_context __user *gcs_ctx = + apply_user_offset(user, user->gcs_offset); + err |= preserve_gcs_context(gcs_ctx); + } + /* Scalable Vector Extension state (including streaming), if present */ if ((system_supports_sve() || system_supports_sme()) && err == 0 && user->sve_offset) { @@ -983,6 +1281,20 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= preserve_tpidr2_context(tpidr2_ctx); } + /* FPMR if supported */ + if (system_supports_fpmr() && err == 0) { + struct fpmr_context __user *fpmr_ctx = + apply_user_offset(user, user->fpmr_offset); + err |= preserve_fpmr_context(fpmr_ctx); + } + + if (system_supports_poe() && err == 0) { + struct poe_context __user *poe_ctx = + apply_user_offset(user, user->poe_offset); + + err |= preserve_poe_context(poe_ctx, ua_state); + } + /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = @@ -1071,15 +1383,81 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +#ifdef CONFIG_ARM64_GCS + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + u64 gcspr_el0; + int ret = 0; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(current)) + return 0; + + /* + * We are entering a signal handler, current register state is + * active. + */ + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Push a cap and the GCS entry for the trampoline onto the GCS. + */ + put_user_gcs((unsigned long)sigtramp, + (unsigned long __user *)(gcspr_el0 - 16), &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 8), + (unsigned long __user *)(gcspr_el0 - 8), &ret); + if (ret != 0) + return ret; + + gcspr_el0 -= 16; + write_sysreg_s(gcspr_el0, SYS_GCSPR_EL0); + + return 0; +} +#else + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + return 0; +} + +#endif + +static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; + int err; + + if (ksig->ka.sa.sa_flags & SA_RESTORER) + sigtramp = ksig->ka.sa.sa_restorer; + else + sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); + + err = gcs_signal_entry(sigtramp, ksig); + if (err) + return err; + + /* + * We must not fail from this point onwards. We are going to update + * registers, including SP, in order to invoke the signal handler. If + * we failed and attempted to deliver a nested SIGSEGV to a handler + * after that point, the subsequent sigreturn would end up restoring + * the (partial) state for the original signal handler. + */ regs->regs[0] = usig; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + regs->regs[1] = (unsigned long)&user->sigframe->info; + regs->regs[2] = (unsigned long)&user->sigframe->uc; + } regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; - regs->pc = (unsigned long)ka->sa.sa_handler; + regs->regs[30] = (unsigned long)sigtramp; + regs->pc = (unsigned long)ksig->ka.sa.sa_handler; /* * Signal delivery is a (wacky) indirect function call in @@ -1102,29 +1480,12 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, /* Signal handlers are invoked with ZA and streaming mode disabled */ if (system_supports_sme()) { - /* - * If we were in streaming mode the saved register - * state was SVE but we will exit SM and use the - * FPSIMD register state - flush the saved FPSIMD - * register state in case it gets loaded. - */ - if (current->thread.svcr & SVCR_SM_MASK) { - memset(¤t->thread.uw.fpsimd_state, 0, - sizeof(current->thread.uw.fpsimd_state)); - current->thread.fp_type = FP_STATE_FPSIMD; - } - - current->thread.svcr &= ~(SVCR_ZA_MASK | - SVCR_SM_MASK); - sme_smstop(); + task_smstop_sm(current); + current->thread.svcr &= ~SVCR_ZA_MASK; + write_sysreg_s(0, SYS_TPIDR2_EL0); } - if (ka->sa.sa_flags & SA_RESTORER) - sigtramp = ka->sa.sa_restorer; - else - sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); - - regs->regs[30] = (unsigned long)sigtramp; + return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -1132,28 +1493,37 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, { struct rt_sigframe_user_layout user; struct rt_sigframe __user *frame; + struct user_access_state ua_state; int err = 0; - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); if (get_sigframe(&user, ksig, regs)) return 1; + save_reset_user_access_state(&ua_state); frame = user.sigframe; __put_user_error(0, &frame->uc.uc_flags, err); __put_user_error(NULL, &frame->uc.uc_link, err); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); - err |= setup_sigframe(&user, regs, set); - if (err == 0) { - setup_return(regs, &ksig->ka, &user, usig); - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - regs->regs[1] = (unsigned long)&frame->info; - regs->regs[2] = (unsigned long)&frame->uc; - } - } + err |= setup_sigframe(&user, regs, set, &ua_state); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + err |= copy_siginfo_to_user(&frame->info, &ksig->info); + + if (err == 0) + err = setup_return(regs, ksig, &user, usig); + + /* + * We must not fail if setup_return() succeeded - see comment at the + * beginning of setup_return(). + */ + + if (err == 0) + set_handler_user_access_state(); + else + restore_user_access_state(&ua_state); return err; } @@ -1207,7 +1577,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -static void do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; @@ -1278,41 +1648,6 @@ static void do_signal(struct pt_regs *regs) restore_saved_sigmask(); } -void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) -{ - do { - if (thread_flags & _TIF_NEED_RESCHED) { - /* Unmask Debug and SError for the next task */ - local_daif_restore(DAIF_PROCCTX_NOIRQ); - - schedule(); - } else { - local_daif_restore(DAIF_PROCCTX); - - if (thread_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (thread_flags & _TIF_MTE_ASYNC_FAULT) { - clear_thread_flag(TIF_MTE_ASYNC_FAULT); - send_sig_fault(SIGSEGV, SEGV_MTEAERR, - (void __user *)NULL, current); - } - - if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - do_signal(regs); - - if (thread_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); - - if (thread_flags & _TIF_FOREIGN_FPSTATE) - fpsimd_restore_current_state(); - } - - local_daif_mask(); - thread_flags = read_thread_flags(); - } while (thread_flags & _TIF_WORK_MASK); -} - unsigned long __ro_after_init signal_minsigstksz; /* @@ -1344,7 +1679,7 @@ void __init minsigstksz_setup(void) */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 4700f8522d27..bb3b526ff43f 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -17,7 +17,7 @@ #include <asm/signal32.h> #include <asm/traps.h> #include <linux/uaccess.h> -#include <asm/unistd.h> +#include <asm/unistd_compat_32.h> #include <asm/vdso.h> struct compat_vfp_sigframe { @@ -103,7 +103,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) * Note that this also saves V16-31, which aren't visible * in AArch32. */ - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); /* Place structure header on the stack */ __put_user_error(magic, &frame->magic, err); @@ -169,14 +169,17 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) fpsimd.fpsr = fpscr & VFP_FPSCR_STAT_MASK; fpsimd.fpcr = fpscr & VFP_FPSCR_CTRL_MASK; + if (err) + return -EFAULT; + /* * We don't need to touch the exception register, so * reload the hardware state. */ - if (!err) - fpsimd_update_current_state(&fpsimd); + fpsimd_save_and_flush_current_state(); + current->thread.uw.fpsimd_state = fpsimd; - return err ? -EFAULT : 0; + return 0; } static int compat_restore_sigframe(struct pt_regs *regs, @@ -451,7 +454,7 @@ int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, void compat_setup_restart_syscall(struct pt_regs *regs) { - regs->regs[7] = __NR_compat_restart_syscall; + regs->regs[7] = __NR_compat32_restart_syscall; } /* @@ -460,7 +463,7 @@ void compat_setup_restart_syscall(struct pt_regs *regs) */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/arm64/kernel/sigreturn32.S b/arch/arm64/kernel/sigreturn32.S index ccbd4aab4ba4..6f486b95b413 100644 --- a/arch/arm64/kernel/sigreturn32.S +++ b/arch/arm64/kernel/sigreturn32.S @@ -13,7 +13,7 @@ * need two 16-bit instructions. */ -#include <asm/unistd.h> +#include <asm/unistd_compat_32.h> .section .rodata .globl __aarch32_sigret_code_start @@ -22,26 +22,26 @@ __aarch32_sigret_code_start: /* * ARM Code */ - .byte __NR_compat_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_sigreturn + .byte __NR_compat32_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat32_sigreturn + .byte __NR_compat32_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat32_sigreturn /* * Thumb code */ - .byte __NR_compat_sigreturn, 0x27 // svc #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0xdf // mov r7, #__NR_compat_sigreturn + .byte __NR_compat32_sigreturn, 0x27 // svc #__NR_compat32_sigreturn + .byte __NR_compat32_sigreturn, 0xdf // mov r7, #__NR_compat32_sigreturn /* * ARM code */ - .byte __NR_compat_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_rt_sigreturn + .byte __NR_compat32_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat32_rt_sigreturn + .byte __NR_compat32_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat32_rt_sigreturn /* * Thumb code */ - .byte __NR_compat_rt_sigreturn, 0x27 // svc #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0xdf // mov r7, #__NR_compat_rt_sigreturn + .byte __NR_compat32_rt_sigreturn, 0x27 // svc #__NR_compat32_rt_sigreturn + .byte __NR_compat32_rt_sigreturn, 0xdf // mov r7, #__NR_compat32_rt_sigreturn .globl __aarch32_sigret_code_end __aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 2aa5129d8253..f093cdf71be1 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -102,9 +102,6 @@ SYM_CODE_START(cpu_resume) mov x0, xzr bl init_kernel_el mov x19, x0 // preserve boot mode -#if VA_BITS > 48 - ldr_l x0, vabits_actual -#endif bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S index 487381164ff6..2def9d0dd3dd 100644 --- a/arch/arm64/kernel/smccc-call.S +++ b/arch/arm64/kernel/smccc-call.S @@ -7,48 +7,19 @@ #include <asm/asm-offsets.h> #include <asm/assembler.h> -#include <asm/thread_info.h> - -/* - * If we have SMCCC v1.3 and (as is likely) no SVE state in - * the registers then set the SMCCC hint bit to say there's no - * need to preserve it. Do this by directly adjusting the SMCCC - * function value which is already stored in x0 ready to be called. - */ -SYM_FUNC_START(__arm_smccc_sve_check) - - ldr_l x16, smccc_has_sve_hint - cbz x16, 2f - - get_current_task x16 - ldr x16, [x16, #TSK_TI_FLAGS] - tbnz x16, #TIF_FOREIGN_FPSTATE, 1f // Any live FP state? - tbnz x16, #TIF_SVE, 2f // Does that state include SVE? - -1: orr x0, x0, ARM_SMCCC_1_3_SVE_HINT - -2: ret -SYM_FUNC_END(__arm_smccc_sve_check) -EXPORT_SYMBOL(__arm_smccc_sve_check) .macro SMCCC instr - stp x29, x30, [sp, #-16]! - mov x29, sp -alternative_if ARM64_SVE - bl __arm_smccc_sve_check -alternative_else_nop_endif \instr #0 - ldr x4, [sp, #16] + ldr x4, [sp] stp x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS] stp x2, x3, [x4, #ARM_SMCCC_RES_X2_OFFS] - ldr x4, [sp, #24] + ldr x4, [sp, #8] cbz x4, 1f /* no quirk structure */ ldr x9, [x4, #ARM_SMCCC_QUIRK_ID_OFFS] cmp x9, #ARM_SMCCC_QUIRK_QCOM_A6 b.ne 1f str x6, [x4, ARM_SMCCC_QUIRK_STATE_OFFS] -1: ldp x29, x30, [sp], #16 - ret +1: ret .endm /* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 960b98b43506..1aa324104afb 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -32,7 +32,9 @@ #include <linux/irq_work.h> #include <linux/kernel_stat.h> #include <linux/kexec.h> +#include <linux/kgdb.h> #include <linux/kvm_host.h> +#include <linux/nmi.h> #include <asm/alternative.h> #include <asm/atomic.h> @@ -53,9 +55,6 @@ #include <trace/events/ipi.h> -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); - /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -65,20 +64,20 @@ struct secondary_data secondary_data; /* Number of CPUs which aren't online, but looping in kernel text. */ static int cpus_stuck_in_kernel; -enum ipi_msg_type { - IPI_RESCHEDULE, - IPI_CALL_FUNC, - IPI_CPU_STOP, - IPI_CPU_CRASH_STOP, - IPI_TIMER, - IPI_IRQ_WORK, - IPI_WAKEUP, - NR_IPI +static int ipi_irq_base __ro_after_init; +static int nr_ipi __ro_after_init = NR_IPI; + +struct ipi_descs { + struct irq_desc *descs[MAX_IPI]; }; -static int ipi_irq_base __read_mostly; -static int nr_ipi __read_mostly = NR_IPI; -static struct irq_desc *ipi_desc[NR_IPI] __read_mostly; +static DEFINE_PER_CPU_READ_MOSTLY(struct ipi_descs, pcpu_ipi_desc); + +#define get_ipi_desc(__cpu, __ipi) (per_cpu_ptr(&pcpu_ipi_desc, __cpu)->descs[__ipi]) + +static bool percpu_ipi_descs __ro_after_init; + +static bool crash_stop; static void ipi_setup(int cpu); @@ -124,7 +123,8 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) /* Now bring the CPU into our world */ ret = boot_secondary(cpu, idle); if (ret) { - pr_err("CPU%u: failed to boot: %d\n", cpu, ret); + if (ret != -EPERM) + pr_err("CPU%u: failed to boot: %d\n", cpu, ret); return ret; } @@ -215,7 +215,7 @@ asmlinkage notrace void secondary_start_kernel(void) if (system_uses_irq_prio_masking()) init_gic_priority_masking(); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); trace_hardirqs_off(); /* @@ -256,6 +256,13 @@ asmlinkage notrace void secondary_start_kernel(void) set_cpu_online(cpu, true); complete(&cpu_running); + /* + * Secondary CPUs enter the kernel with all DAIF exceptions masked. + * + * As with setup_arch() we must unmask Debug and SError exceptions, and + * as the root irqchip has already been detected and initialized we can + * unmask IRQ and FIQ at the same time. + */ local_daif_restore(DAIF_PROCCTX); /* @@ -343,7 +350,7 @@ void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) /* * Now that the dying CPU is beyond the point of no return w.r.t. - * in-kernel synchronisation, try to get the firwmare to help us to + * in-kernel synchronisation, try to get the firmware to help us to * verify that it has really left the kernel before we consider * clobbering anything it might still be using. */ @@ -401,7 +408,7 @@ void __noreturn cpu_die_early(void) /* Mark this CPU absent */ set_cpu_present(cpu, 0); - rcu_report_dead(cpu); + rcutree_report_cpu_dead(); if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) { update_cpu_boot_status(CPU_KILL_ME); @@ -431,9 +438,9 @@ static void __init hyp_mode_check(void) void __init smp_cpus_done(unsigned int max_cpus) { pr_info("SMP: Total of %d processors activated.\n", num_online_cpus()); - setup_cpu_features(); hyp_mode_check(); - apply_alternatives_all(); + setup_system_features(); + setup_user_features(); mark_linear_text_alias_ro(); } @@ -445,20 +452,17 @@ void __init smp_prepare_boot_cpu(void) * freed shortly, so we must move over to the runtime per-cpu area. */ set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - cpuinfo_store_boot_cpu(); - /* - * We now know enough about the boot CPU to apply the - * alternatives that cannot wait until interrupt handling - * and/or scheduling is enabled. - */ - apply_boot_alternatives(); + cpuinfo_store_boot_cpu(); + setup_boot_cpu_features(); /* Conditionally switch to GIC PMR for interrupt masking */ if (system_uses_irq_prio_masking()) init_gic_priority_masking(); kasan_init_hw_tags(); + /* Init percpu seeds for random tags after cpus are set up. */ + kasan_init_sw_tags(); } /* @@ -500,6 +504,59 @@ static int __init smp_cpu_setup(int cpu) static bool bootcpu_valid __initdata; static unsigned int cpu_count = 1; +int arch_register_cpu(int cpu) +{ + acpi_handle acpi_handle = acpi_get_processor_handle(cpu); + struct cpu *c = &per_cpu(cpu_devices, cpu); + + if (!acpi_disabled && !acpi_handle && + IS_ENABLED(CONFIG_ACPI_HOTPLUG_CPU)) + return -EPROBE_DEFER; + +#ifdef CONFIG_ACPI_HOTPLUG_CPU + /* For now block anything that looks like physical CPU Hotplug */ + if (invalid_logical_cpuid(cpu) || !cpu_present(cpu)) { + pr_err_once("Changing CPU present bit is not supported\n"); + return -ENODEV; + } +#endif + + /* + * Availability of the acpi handle is sufficient to establish + * that _STA has already been checked. No need to recheck here. + */ + c->hotpluggable = arch_cpu_is_hotpluggable(cpu); + + return register_cpu(c, cpu); +} + +#ifdef CONFIG_ACPI_HOTPLUG_CPU +void arch_unregister_cpu(int cpu) +{ + acpi_handle acpi_handle = acpi_get_processor_handle(cpu); + struct cpu *c = &per_cpu(cpu_devices, cpu); + acpi_status status; + unsigned long long sta; + + if (!acpi_handle) { + pr_err_once("Removing a CPU without associated ACPI handle\n"); + return; + } + + status = acpi_evaluate_integer(acpi_handle, "_STA", NULL, &sta); + if (ACPI_FAILURE(status)) + return; + + /* For now do not allow anything that looks like physical CPU HP */ + if (cpu_present(cpu) && !(sta & ACPI_STA_DEVICE_PRESENT)) { + pr_err_once("Changing CPU present bit is not supported\n"); + return; + } + + unregister_cpu(c); +} +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + #ifdef CONFIG_ACPI static struct acpi_madt_generic_interrupt cpu_madt_gicc[NR_CPUS]; @@ -520,7 +577,8 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) { u64 hwid = processor->arm_mpidr; - if (!(processor->flags & ACPI_MADT_ENABLED)) { + if (!(processor->flags & + (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE))) { pr_debug("skipping disabled CPU entry with 0x%llx MPIDR\n", hwid); return; } @@ -739,8 +797,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) */ for_each_possible_cpu(cpu) { - per_cpu(cpu_number, cpu) = cpu; - if (cpu == smp_processor_id()) continue; @@ -757,14 +813,15 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } } -static const char *ipi_types[NR_IPI] __tracepoint_string = { +static const char *ipi_types[MAX_IPI] __tracepoint_string = { [IPI_RESCHEDULE] = "Rescheduling interrupts", [IPI_CALL_FUNC] = "Function call interrupts", [IPI_CPU_STOP] = "CPU stop interrupts", - [IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts", + [IPI_CPU_STOP_NMI] = "CPU stop NMIs", [IPI_TIMER] = "Timer broadcast interrupts", [IPI_IRQ_WORK] = "IRQ work interrupts", - [IPI_WAKEUP] = "CPU wake-up interrupts", + [IPI_CPU_BACKTRACE] = "CPU backtrace interrupts", + [IPI_KGDB_ROUNDUP] = "KGDB roundup interrupts", }; static void smp_cross_call(const struct cpumask *target, unsigned int ipinr); @@ -775,11 +832,11 @@ int arch_show_interrupts(struct seq_file *p, int prec) { unsigned int cpu, i; - for (i = 0; i < NR_IPI; i++) { + for (i = 0; i < MAX_IPI; i++) { seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i, prec >= 4 ? " " : ""); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(get_ipi_desc(cpu, i), cpu)); seq_printf(p, " %s\n", ipi_types[i]); } @@ -797,13 +854,6 @@ void arch_send_call_function_single_ipi(int cpu) smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC); } -#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL -void arch_send_wakeup_ipi_mask(const struct cpumask *mask) -{ - smp_cross_call(mask, IPI_WAKEUP); -} -#endif - #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { @@ -811,9 +861,9 @@ void arch_irq_work_raise(void) } #endif -static void __noreturn local_cpu_stop(void) +static void __noreturn local_cpu_stop(unsigned int cpu) { - set_cpu_online(smp_processor_id(), false); + set_cpu_online(cpu, false); local_daif_mask(); sdei_mask_local_cpu(); @@ -827,21 +877,26 @@ static void __noreturn local_cpu_stop(void) */ void __noreturn panic_smp_self_stop(void) { - local_cpu_stop(); + local_cpu_stop(smp_processor_id()); } -#ifdef CONFIG_KEXEC_CORE -static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0); -#endif - static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) { #ifdef CONFIG_KEXEC_CORE + /* + * Use local_daif_mask() instead of local_irq_disable() to make sure + * that pseudo-NMIs are disabled. The "crash stop" code starts with + * an IRQ and falls back to NMI (which might be pseudo). If the IRQ + * finally goes through right as we're timing out then the NMI could + * interrupt us. It's better to prevent the NMI and let the IRQ + * finish since the pt_regs will be better. + */ + local_daif_mask(); + crash_save_cpu(regs, cpu); - atomic_dec(&waiting_for_crash_ipi); + set_cpu_online(cpu, false); - local_irq_disable(); sdei_mask_local_cpu(); if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -854,6 +909,49 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs #endif } +static void arm64_send_ipi(const cpumask_t *mask, unsigned int nr) +{ + unsigned int cpu; + + if (!percpu_ipi_descs) + __ipi_send_mask(get_ipi_desc(0, nr), mask); + else + for_each_cpu(cpu, mask) + __ipi_send_single(get_ipi_desc(cpu, nr), cpu); +} + +static void arm64_backtrace_ipi(cpumask_t *mask) +{ + arm64_send_ipi(mask, IPI_CPU_BACKTRACE); +} + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu) +{ + /* + * NOTE: though nmi_trigger_cpumask_backtrace() has "nmi_" in the name, + * nothing about it truly needs to be implemented using an NMI, it's + * just that it's _allowed_ to work with NMIs. If ipi_should_be_nmi() + * returned false our backtrace attempt will just use a regular IPI. + */ + nmi_trigger_cpumask_backtrace(mask, exclude_cpu, arm64_backtrace_ipi); +} + +#ifdef CONFIG_KGDB +void kgdb_roundup_cpus(void) +{ + int this_cpu = raw_smp_processor_id(); + int cpu; + + for_each_online_cpu(cpu) { + /* No need to roundup ourselves */ + if (cpu == this_cpu) + continue; + + __ipi_send_single(get_ipi_desc(cpu, IPI_KGDB_ROUNDUP), cpu); + } +} +#endif + /* * Main handler for inter-processor interrupts */ @@ -874,14 +972,12 @@ static void do_handle_IPI(int ipinr) break; case IPI_CPU_STOP: - local_cpu_stop(); - break; - - case IPI_CPU_CRASH_STOP: - if (IS_ENABLED(CONFIG_KEXEC_CORE)) { + case IPI_CPU_STOP_NMI: + if (IS_ENABLED(CONFIG_KEXEC_CORE) && crash_stop) { ipi_cpu_crash_stop(cpu, get_irq_regs()); - unreachable(); + } else { + local_cpu_stop(cpu); } break; @@ -897,13 +993,17 @@ static void do_handle_IPI(int ipinr) break; #endif -#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL - case IPI_WAKEUP: - WARN_ONCE(!acpi_parking_protocol_valid(cpu), - "CPU%u: Wake-up IPI outside the ACPI parking protocol\n", - cpu); + case IPI_CPU_BACKTRACE: + /* + * NOTE: in some cases this _won't_ be NMI context. See the + * comment in arch_trigger_cpumask_backtrace(). + */ + nmi_cpu_backtrace(get_irq_regs()); + break; + + case IPI_KGDB_ROUNDUP: + kgdb_nmicallback(cpu, get_irq_regs()); break; -#endif default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); @@ -916,14 +1016,31 @@ static void do_handle_IPI(int ipinr) static irqreturn_t ipi_handler(int irq, void *data) { - do_handle_IPI(irq - ipi_irq_base); + unsigned int ipi = (irq - ipi_irq_base) % nr_ipi; + + do_handle_IPI(ipi); return IRQ_HANDLED; } static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) { trace_ipi_raise(target, ipi_types[ipinr]); - __ipi_send_mask(ipi_desc[ipinr], target); + arm64_send_ipi(target, ipinr); +} + +static bool ipi_should_be_nmi(enum ipi_msg_type ipi) +{ + if (!system_uses_irq_prio_masking()) + return false; + + switch (ipi) { + case IPI_CPU_STOP_NMI: + case IPI_CPU_BACKTRACE: + case IPI_KGDB_ROUNDUP: + return true; + default: + return false; + } } static void ipi_setup(int cpu) @@ -933,8 +1050,18 @@ static void ipi_setup(int cpu) if (WARN_ON_ONCE(!ipi_irq_base)) return; - for (i = 0; i < nr_ipi; i++) - enable_percpu_irq(ipi_irq_base + i, 0); + for (i = 0; i < nr_ipi; i++) { + if (!percpu_ipi_descs) { + if (ipi_should_be_nmi(i)) { + prepare_percpu_nmi(ipi_irq_base + i); + enable_percpu_nmi(ipi_irq_base + i, 0); + } else { + enable_percpu_irq(ipi_irq_base + i, 0); + } + } else { + enable_irq(irq_desc_get_irq(get_ipi_desc(cpu, i))); + } + } } #ifdef CONFIG_HOTPLUG_CPU @@ -945,31 +1072,78 @@ static void ipi_teardown(int cpu) if (WARN_ON_ONCE(!ipi_irq_base)) return; - for (i = 0; i < nr_ipi; i++) - disable_percpu_irq(ipi_irq_base + i); + for (i = 0; i < nr_ipi; i++) { + if (!percpu_ipi_descs) { + if (ipi_should_be_nmi(i)) { + disable_percpu_nmi(ipi_irq_base + i); + teardown_percpu_nmi(ipi_irq_base + i); + } else { + disable_percpu_irq(ipi_irq_base + i); + } + } else { + disable_irq(irq_desc_get_irq(get_ipi_desc(cpu, i))); + } + } } #endif -void __init set_smp_ipi_range(int ipi_base, int n) +static void ipi_setup_sgi(int ipi) { - int i; + int err, irq, cpu; - WARN_ON(n < NR_IPI); - nr_ipi = min(n, NR_IPI); + irq = ipi_irq_base + ipi; - for (i = 0; i < nr_ipi; i++) { - int err; + if (ipi_should_be_nmi(ipi)) { + err = request_percpu_nmi(irq, ipi_handler, "IPI", NULL, &irq_stat); + WARN(err, "Could not request IRQ %d as NMI, err=%d\n", irq, err); + } else { + err = request_percpu_irq(irq, ipi_handler, "IPI", &irq_stat); + WARN(err, "Could not request IRQ %d as IRQ, err=%d\n", irq, err); + } + + for_each_possible_cpu(cpu) + get_ipi_desc(cpu, ipi) = irq_to_desc(irq); - err = request_percpu_irq(ipi_base + i, ipi_handler, - "IPI", &cpu_number); - WARN_ON(err); + irq_set_status_flags(irq, IRQ_HIDDEN); +} - ipi_desc[i] = irq_to_desc(ipi_base + i); - irq_set_status_flags(ipi_base + i, IRQ_HIDDEN); +static void ipi_setup_lpi(int ipi, int ncpus) +{ + for (int cpu = 0; cpu < ncpus; cpu++) { + int err, irq; + + irq = ipi_irq_base + (cpu * nr_ipi) + ipi; + + err = irq_force_affinity(irq, cpumask_of(cpu)); + WARN(err, "Could not force affinity IRQ %d, err=%d\n", irq, err); + + err = request_irq(irq, ipi_handler, IRQF_NO_AUTOEN, "IPI", + NULL); + WARN(err, "Could not request IRQ %d, err=%d\n", irq, err); + + irq_set_status_flags(irq, (IRQ_HIDDEN | IRQ_NO_BALANCING_MASK)); + + get_ipi_desc(cpu, ipi) = irq_to_desc(irq); } +} + +void __init set_smp_ipi_range_percpu(int ipi_base, int n, int ncpus) +{ + int i; + + WARN_ON(n < MAX_IPI); + nr_ipi = min(n, MAX_IPI); + percpu_ipi_descs = !!ncpus; ipi_irq_base = ipi_base; + for (i = 0; i < nr_ipi; i++) { + if (!percpu_ipi_descs) + ipi_setup_sgi(i); + else + ipi_setup_lpi(i, ncpus); + } + /* Setup the boot CPU immediately */ ipi_setup(smp_processor_id()); } @@ -979,6 +1153,17 @@ void arch_smp_send_reschedule(int cpu) smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE); } +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +void arch_send_wakeup_ipi(unsigned int cpu) +{ + /* + * We use a scheduler IPI to wake the CPU as this avoids the need for a + * dedicated IPI and we can safely handle spurious scheduler IPIs. + */ + smp_send_reschedule(cpu); +} +#endif + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST void tick_broadcast(const struct cpumask *mask) { @@ -999,79 +1184,109 @@ static inline unsigned int num_other_online_cpus(void) void smp_send_stop(void) { + static unsigned long stop_in_progress; + static cpumask_t mask; unsigned long timeout; - if (num_other_online_cpus()) { - cpumask_t mask; + /* + * If this cpu is the only one alive at this point in time, online or + * not, there are no stop messages to be sent around, so just back out. + */ + if (num_other_online_cpus() == 0) + goto skip_ipi; - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); + /* Only proceed if this is the first CPU to reach this code */ + if (test_and_set_bit(0, &stop_in_progress)) + return; - if (system_state <= SYSTEM_RUNNING) - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_STOP); - } + /* + * Send an IPI to all currently online CPUs except the CPU running + * this code. + * + * NOTE: we don't do anything here to prevent other CPUs from coming + * online after we snapshot `cpu_online_mask`. Ideally, the calling code + * should do something to prevent other CPUs from coming up. This code + * can be called in the panic path and thus it doesn't seem wise to + * grab the CPU hotplug mutex ourselves. Worst case: + * - If a CPU comes online as we're running, we'll likely notice it + * during the 1 second wait below and then we'll catch it when we try + * with an NMI (assuming NMIs are enabled) since we re-snapshot the + * mask before sending an NMI. + * - If we leave the function and see that CPUs are still online we'll + * at least print a warning. Especially without NMIs this function + * isn't foolproof anyway so calling code will just have to accept + * the fact that there could be cases where a CPU can't be stopped. + */ + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); - /* Wait up to one second for other CPUs to stop */ + if (system_state <= SYSTEM_RUNNING) + pr_crit("SMP: stopping secondary CPUs\n"); + + /* + * Start with a normal IPI and wait up to one second for other CPUs to + * stop. We do this first because it gives other processors a chance + * to exit critical sections / drop locks and makes the rest of the + * stop process (especially console flush) more robust. + */ + smp_cross_call(&mask, IPI_CPU_STOP); timeout = USEC_PER_SEC; while (num_other_online_cpus() && timeout--) udelay(1); - if (num_other_online_cpus()) + /* + * If CPUs are still online, try an NMI. There's no excuse for this to + * be slow, so we only give them an extra 10 ms to respond. + */ + if (num_other_online_cpus() && ipi_should_be_nmi(IPI_CPU_STOP_NMI)) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + pr_info("SMP: retry stop with NMI for CPUs %*pbl\n", + cpumask_pr_args(&mask)); + + smp_cross_call(&mask, IPI_CPU_STOP_NMI); + timeout = USEC_PER_MSEC * 10; + while (num_other_online_cpus() && timeout--) + udelay(1); + } + + if (num_other_online_cpus()) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(cpu_online_mask)); + cpumask_pr_args(&mask)); + } +skip_ipi: sdei_mask_local_cpu(); } #ifdef CONFIG_KEXEC_CORE void crash_smp_send_stop(void) { - static int cpus_stopped; - cpumask_t mask; - unsigned long timeout; - /* * This function can be called twice in panic path, but obviously * we execute this only once. + * + * We use this same boolean to tell whether the IPI we send was a + * stop or a "crash stop". */ - if (cpus_stopped) + if (crash_stop) return; + crash_stop = 1; - cpus_stopped = 1; - - /* - * If this cpu is the only one alive at this point in time, online or - * not, there are no stop messages to be sent around, so just back out. - */ - if (num_other_online_cpus() == 0) - goto skip_ipi; - - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); - - atomic_set(&waiting_for_crash_ipi, num_other_online_cpus()); - - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_CRASH_STOP); - - /* Wait up to one second for other CPUs to stop */ - timeout = USEC_PER_SEC; - while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--) - udelay(1); + smp_send_stop(); - if (atomic_read(&waiting_for_crash_ipi) > 0) - pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(&mask)); - -skip_ipi: - sdei_mask_local_cpu(); sdei_handler_abort(); } bool smp_crash_stop_failed(void) { - return (atomic_read(&waiting_for_crash_ipi) > 0); + return num_other_online_cpus() != 0; } #endif diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 17f66a74c745..3ebcf8c53fb0 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -7,7 +7,9 @@ #include <linux/kernel.h> #include <linux/efi.h> #include <linux/export.h> +#include <linux/filter.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> @@ -18,6 +20,55 @@ #include <asm/stack_pointer.h> #include <asm/stacktrace.h> +enum kunwind_source { + KUNWIND_SOURCE_UNKNOWN, + KUNWIND_SOURCE_FRAME, + KUNWIND_SOURCE_CALLER, + KUNWIND_SOURCE_TASK, + KUNWIND_SOURCE_REGS_PC, +}; + +union unwind_flags { + unsigned long all; + struct { + unsigned long fgraph : 1, + kretprobe : 1; + }; +}; + +/* + * Kernel unwind state + * + * @common: Common unwind state. + * @task: The task being unwound. + * @graph_idx: Used by ftrace_graph_ret_addr() for optimized stack unwinding. + * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance + * associated with the most recently encountered replacement lr + * value. + */ +struct kunwind_state { + struct unwind_state common; + struct task_struct *task; + int graph_idx; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif + enum kunwind_source source; + union unwind_flags flags; + struct pt_regs *regs; +}; + +static __always_inline void +kunwind_init(struct kunwind_state *state, + struct task_struct *task) +{ + unwind_init_common(&state->common); + state->task = task; + state->source = KUNWIND_SOURCE_UNKNOWN; + state->flags.all = 0; + state->regs = NULL; +} + /* * Start an unwind from a pt_regs. * @@ -26,13 +77,15 @@ * The regs must be on a stack currently owned by the calling task. */ static __always_inline void -unwind_init_from_regs(struct unwind_state *state, - struct pt_regs *regs) +kunwind_init_from_regs(struct kunwind_state *state, + struct pt_regs *regs) { - unwind_init_common(state, current); + kunwind_init(state, current); - state->fp = regs->regs[29]; - state->pc = regs->pc; + state->regs = regs; + state->common.fp = regs->regs[29]; + state->common.pc = regs->pc; + state->source = KUNWIND_SOURCE_REGS_PC; } /* @@ -44,12 +97,13 @@ unwind_init_from_regs(struct unwind_state *state, * The function which invokes this must be noinline. */ static __always_inline void -unwind_init_from_caller(struct unwind_state *state) +kunwind_init_from_caller(struct kunwind_state *state) { - unwind_init_common(state, current); + kunwind_init(state, current); - state->fp = (unsigned long)__builtin_frame_address(1); - state->pc = (unsigned long)__builtin_return_address(0); + state->common.fp = (unsigned long)__builtin_frame_address(1); + state->common.pc = (unsigned long)__builtin_return_address(0); + state->source = KUNWIND_SOURCE_CALLER; } /* @@ -63,41 +117,133 @@ unwind_init_from_caller(struct unwind_state *state) * call this for the current task. */ static __always_inline void -unwind_init_from_task(struct unwind_state *state, - struct task_struct *task) +kunwind_init_from_task(struct kunwind_state *state, + struct task_struct *task) { - unwind_init_common(state, task); + kunwind_init(state, task); - state->fp = thread_saved_fp(task); - state->pc = thread_saved_pc(task); + state->common.fp = thread_saved_fp(task); + state->common.pc = thread_saved_pc(task); + state->source = KUNWIND_SOURCE_TASK; } static __always_inline int -unwind_recover_return_address(struct unwind_state *state) +kunwind_recover_return_address(struct kunwind_state *state) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (state->task->ret_stack && - (state->pc == (unsigned long)return_to_handler)) { + (state->common.pc == (unsigned long)return_to_handler)) { unsigned long orig_pc; - orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc, - (void *)state->fp); - if (WARN_ON_ONCE(state->pc == orig_pc)) + orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, + state->common.pc, + (void *)state->common.fp); + if (state->common.pc == orig_pc) { + WARN_ON_ONCE(state->task == current); return -EINVAL; - state->pc = orig_pc; + } + state->common.pc = orig_pc; + state->flags.fgraph = 1; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #ifdef CONFIG_KRETPROBES - if (is_kretprobe_trampoline(state->pc)) { - state->pc = kretprobe_find_ret_addr(state->task, - (void *)state->fp, - &state->kr_cur); + if (is_kretprobe_trampoline(state->common.pc)) { + unsigned long orig_pc; + orig_pc = kretprobe_find_ret_addr(state->task, + (void *)state->common.fp, + &state->kr_cur); + if (!orig_pc) + return -EINVAL; + state->common.pc = orig_pc; + state->flags.kretprobe = 1; } #endif /* CONFIG_KRETPROBES */ return 0; } +static __always_inline +int kunwind_next_regs_pc(struct kunwind_state *state) +{ + struct stack_info *info; + unsigned long fp = state->common.fp; + struct pt_regs *regs; + + regs = container_of((u64 *)fp, struct pt_regs, stackframe.record.fp); + + info = unwind_find_stack(&state->common, (unsigned long)regs, sizeof(*regs)); + if (!info) + return -EINVAL; + + unwind_consume_stack(&state->common, info, (unsigned long)regs, + sizeof(*regs)); + + state->regs = regs; + state->common.pc = regs->pc; + state->common.fp = regs->regs[29]; + state->regs = NULL; + state->source = KUNWIND_SOURCE_REGS_PC; + return 0; +} + +static __always_inline int +kunwind_next_frame_record_meta(struct kunwind_state *state) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->common.fp; + struct frame_record_meta *meta; + struct stack_info *info; + + info = unwind_find_stack(&state->common, fp, sizeof(*meta)); + if (!info) + return -EINVAL; + + meta = (struct frame_record_meta *)fp; + switch (READ_ONCE(meta->type)) { + case FRAME_META_TYPE_FINAL: + if (meta == &task_pt_regs(tsk)->stackframe) + return -ENOENT; + WARN_ON_ONCE(tsk == current); + return -EINVAL; + case FRAME_META_TYPE_PT_REGS: + return kunwind_next_regs_pc(state); + default: + WARN_ON_ONCE(tsk == current); + return -EINVAL; + } +} + +static __always_inline int +kunwind_next_frame_record(struct kunwind_state *state) +{ + unsigned long fp = state->common.fp; + struct frame_record *record; + struct stack_info *info; + unsigned long new_fp, new_pc; + + if (fp & 0x7) + return -EINVAL; + + info = unwind_find_stack(&state->common, fp, sizeof(*record)); + if (!info) + return -EINVAL; + + record = (struct frame_record *)fp; + new_fp = READ_ONCE(record->fp); + new_pc = READ_ONCE(record->lr); + + if (!new_fp && !new_pc) + return kunwind_next_frame_record_meta(state); + + unwind_consume_stack(&state->common, info, fp, sizeof(*record)); + + state->common.fp = new_fp; + state->common.pc = new_pc; + state->source = KUNWIND_SOURCE_FRAME; + + return 0; +} + /* * Unwind from one frame record (A) to the next frame record (B). * @@ -106,40 +252,51 @@ unwind_recover_return_address(struct unwind_state *state) * and the location (but not the fp value) of B. */ static __always_inline int -unwind_next(struct unwind_state *state) +kunwind_next(struct kunwind_state *state) { - struct task_struct *tsk = state->task; - unsigned long fp = state->fp; int err; - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) - return -ENOENT; + state->flags.all = 0; + + switch (state->source) { + case KUNWIND_SOURCE_FRAME: + case KUNWIND_SOURCE_CALLER: + case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_PC: + err = kunwind_next_frame_record(state); + break; + default: + err = -EINVAL; + } - err = unwind_next_frame_record(state); if (err) return err; - state->pc = ptrauth_strip_kernel_insn_pac(state->pc); + state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc); - return unwind_recover_return_address(state); + return kunwind_recover_return_address(state); } -static __always_inline void -unwind(struct unwind_state *state, stack_trace_consume_fn consume_entry, - void *cookie) +typedef bool (*kunwind_consume_fn)(const struct kunwind_state *state, void *cookie); + +static __always_inline int +do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state, + void *cookie) { - if (unwind_recover_return_address(state)) - return; + int ret; - while (1) { - int ret; + ret = kunwind_recover_return_address(state); + if (ret) + return ret; - if (!consume_entry(cookie, state->pc)) - break; - ret = unwind_next(state); + while (1) { + if (!consume_state(state, cookie)) + return -EINVAL; + ret = kunwind_next(state); + if (ret == -ENOENT) + return 0; if (ret < 0) - break; + return ret; } } @@ -172,17 +329,16 @@ unwind(struct unwind_state *state, stack_trace_consume_fn consume_entry, : stackinfo_get_unknown(); \ }) -noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, - void *cookie, struct task_struct *task, - struct pt_regs *regs) +static __always_inline int +kunwind_stack_walk(kunwind_consume_fn consume_state, + void *cookie, struct task_struct *task, + struct pt_regs *regs) { struct stack_info stacks[] = { stackinfo_get_task(task), STACKINFO_CPU(irq), -#if defined(CONFIG_VMAP_STACK) STACKINFO_CPU(overflow), -#endif -#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_ARM_SDE_INTERFACE) +#if defined(CONFIG_ARM_SDE_INTERFACE) STACKINFO_SDEI(normal), STACKINFO_SDEI(critical), #endif @@ -190,28 +346,131 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, STACKINFO_EFI, #endif }; - struct unwind_state state = { - .stacks = stacks, - .nr_stacks = ARRAY_SIZE(stacks), + struct kunwind_state state = { + .common = { + .stacks = stacks, + .nr_stacks = ARRAY_SIZE(stacks), + }, }; if (regs) { if (task != current) - return; - unwind_init_from_regs(&state, regs); + return -EINVAL; + kunwind_init_from_regs(&state, regs); } else if (task == current) { - unwind_init_from_caller(&state); + kunwind_init_from_caller(&state); } else { - unwind_init_from_task(&state, task); + kunwind_init_from_task(&state, task); } - unwind(&state, consume_entry, cookie); + return do_kunwind(&state, consume_state, cookie); +} + +struct kunwind_consume_entry_data { + stack_trace_consume_fn consume_entry; + void *cookie; +}; + +static __always_inline bool +arch_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + struct kunwind_consume_entry_data *data = cookie; + return data->consume_entry(data->cookie, state->common.pc); } -static bool dump_backtrace_entry(void *arg, unsigned long where) +noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) { + struct kunwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + kunwind_stack_walk(arch_kunwind_consume_entry, &data, task, regs); +} + +static __always_inline bool +arch_reliable_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + /* + * At an exception boundary we can reliably consume the saved PC. We do + * not know whether the LR was live when the exception was taken, and + * so we cannot perform the next unwind step reliably. + * + * All that matters is whether the *entire* unwind is reliable, so give + * up as soon as we hit an exception boundary. + */ + if (state->source == KUNWIND_SOURCE_REGS_PC) + return false; + + return arch_kunwind_consume_entry(state, cookie); +} + +noinline noinstr int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, + struct task_struct *task) +{ + struct kunwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + return kunwind_stack_walk(arch_reliable_kunwind_consume_entry, &data, + task, NULL); +} + +struct bpf_unwind_consume_entry_data { + bool (*consume_entry)(void *cookie, u64 ip, u64 sp, u64 fp); + void *cookie; +}; + +static bool +arch_bpf_unwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + struct bpf_unwind_consume_entry_data *data = cookie; + + return data->consume_entry(data->cookie, state->common.pc, 0, + state->common.fp); +} + +noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u64 ip, u64 sp, + u64 fp), void *cookie) +{ + struct bpf_unwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); +} + +static const char *state_source_string(const struct kunwind_state *state) +{ + switch (state->source) { + case KUNWIND_SOURCE_FRAME: return NULL; + case KUNWIND_SOURCE_CALLER: return "C"; + case KUNWIND_SOURCE_TASK: return "T"; + case KUNWIND_SOURCE_REGS_PC: return "P"; + default: return "U"; + } +} + +static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) +{ + const char *source = state_source_string(state); + union unwind_flags flags = state->flags; + bool has_info = source || flags.all; char *loglvl = arg; - printk("%s %pSb\n", loglvl, (void *)where); + + printk("%s %pSb%s%s%s%s%s\n", loglvl, + (void *)state->common.pc, + has_info ? " (" : "", + source ? source : "", + flags.fgraph ? "F" : "", + flags.kretprobe ? "K" : "", + has_info ? ")" : ""); + return true; } @@ -230,7 +489,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, return; printk("%sCall trace:\n", loglvl); - arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); + kunwind_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); put_task_stack(tsk); } @@ -240,3 +499,123 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) dump_backtrace(NULL, tsk, loglvl); barrier(); } + +/* + * The struct defined for userspace stack frame in AARCH64 mode. + */ +struct frame_tail { + struct frame_tail __user *fp; + unsigned long lr; +} __attribute__((packed)); + +/* + * Get the return address for a single stackframe and return a pointer to the + * next frame tail. + */ +static struct frame_tail __user * +unwind_user_frame(struct frame_tail __user *tail, void *cookie, + stack_trace_consume_fn consume_entry) +{ + struct frame_tail buftail; + unsigned long err; + unsigned long lr; + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(tail, sizeof(buftail))) + return NULL; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); + pagefault_enable(); + + if (err) + return NULL; + + lr = ptrauth_strip_user_insn_pac(buftail.lr); + + if (!consume_entry(cookie, lr)) + return NULL; + + /* + * Frame pointers should strictly progress back up the stack + * (towards higher addresses). + */ + if (tail >= buftail.fp) + return NULL; + + return buftail.fp; +} + +#ifdef CONFIG_COMPAT +/* + * The registers we're interested in are at the end of the variable + * length saved register structure. The fp points at the end of this + * structure so the address of this struct is: + * (struct compat_frame_tail *)(xxx->fp)-1 + * + * This code has been adapted from the ARM OProfile support. + */ +struct compat_frame_tail { + compat_uptr_t fp; /* a (struct compat_frame_tail *) in compat mode */ + u32 sp; + u32 lr; +} __attribute__((packed)); + +static struct compat_frame_tail __user * +unwind_compat_user_frame(struct compat_frame_tail __user *tail, void *cookie, + stack_trace_consume_fn consume_entry) +{ + struct compat_frame_tail buftail; + unsigned long err; + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(tail, sizeof(buftail))) + return NULL; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); + pagefault_enable(); + + if (err) + return NULL; + + if (!consume_entry(cookie, buftail.lr)) + return NULL; + + /* + * Frame pointers should strictly progress back up the stack + * (towards higher addresses). + */ + if (tail + 1 >= (struct compat_frame_tail __user *) + compat_ptr(buftail.fp)) + return NULL; + + return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; +} +#endif /* CONFIG_COMPAT */ + + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + if (!consume_entry(cookie, regs->pc)) + return; + + if (!compat_user_mode(regs)) { + /* AARCH64 mode */ + struct frame_tail __user *tail; + + tail = (struct frame_tail __user *)regs->regs[29]; + while (tail && !((unsigned long)tail & 0x7)) + tail = unwind_user_frame(tail, cookie, consume_entry); + } else { +#ifdef CONFIG_COMPAT + /* AARCH32 compat mode */ + struct compat_frame_tail __user *tail; + + tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; + while (tail && !((unsigned long)tail & 0x3)) + tail = unwind_compat_user_frame(tail, cookie, consume_entry); +#endif + } +} diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 0fbdf5fe64d8..eaaff94329cd 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -12,6 +12,7 @@ #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/exec.h> +#include <asm/fpsimd.h> #include <asm/mte.h> #include <asm/memory.h> #include <asm/mmu_context.h> @@ -55,13 +56,13 @@ void notrace __cpu_suspend_exit(void) /* Restore CnP bit in TTBR1_EL1 */ if (system_supports_cnp()) - cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); + cpu_enable_swapper_cnp(); /* * PSTATE was not saved over suspend/resume, re-enable any detected * features that might not have been set correctly. */ - if (cpus_have_const_cap(ARM64_HAS_DIT)) + if (alternative_has_cap_unlikely(ARM64_HAS_DIT)) set_pstate_dit(1); __uaccess_enable_hw_pan(); @@ -80,6 +81,8 @@ void notrace __cpu_suspend_exit(void) */ spectre_v4_enable_mitigation(NULL); + sme_suspend_exit(); + /* Restore additional feature-specific configuration */ ptrauth_suspend_exit(); } @@ -98,6 +101,15 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) struct sleep_stack_data state; struct arm_cpuidle_irq_context context; + /* + * Some portions of CPU state (e.g. PSTATE.{PAN,DIT}) are initialized + * before alternatives are patched, but are only restored by + * __cpu_suspend_exit() after alternatives are patched. To avoid + * accidentally losing these bits we must not attempt to suspend until + * after alternatives have been patched. + */ + WARN_ON(!system_capabilities_finalized()); + /* Report any MTE async fault before going to suspend */ mte_suspend_enter(); diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c index d5ffaaab31a7..f08408b6e826 100644 --- a/arch/arm64/kernel/sys.c +++ b/arch/arm64/kernel/sys.c @@ -48,14 +48,16 @@ asmlinkage long __arm64_sys_ni_syscall(const struct pt_regs *__unused) */ #define __arm64_sys_personality __arm64_sys_arm64_personality +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) + #undef __SYSCALL #define __SYSCALL(nr, sym) asmlinkage long __arm64_##sym(const struct pt_regs *); -#include <asm/unistd.h> +#include <asm/syscall_table_64.h> #undef __SYSCALL #define __SYSCALL(nr, sym) [nr] = __arm64_##sym, const syscall_fn_t sys_call_table[__NR_syscalls] = { [0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall, -#include <asm/unistd.h> +#include <asm/syscall_table_64.h> }; diff --git a/arch/arm64/kernel/sys32.c b/arch/arm64/kernel/sys32.c index fc40386afb1b..96bcfb907443 100644 --- a/arch/arm64/kernel/sys32.c +++ b/arch/arm64/kernel/sys32.c @@ -5,17 +5,12 @@ * Copyright (C) 2015 ARM Ltd. */ -/* - * Needed to avoid conflicting __NR_* macros between uapi/asm/unistd.h and - * asm/unistd32.h. - */ -#define __COMPAT_SYSCALL_NR - #include <linux/compat.h> #include <linux/compiler.h> #include <linux/syscalls.h> #include <asm/syscall.h> +#include <asm/unistd_compat_32.h> asmlinkage long compat_sys_sigreturn(void); asmlinkage long compat_sys_rt_sigreturn(void); @@ -122,14 +117,16 @@ COMPAT_SYSCALL_DEFINE6(aarch32_fallocate, int, fd, int, mode, return ksys_fallocate(fd, mode, arg_u64(offset), arg_u64(len)); } +#define __SYSCALL_WITH_COMPAT(nr, sym, compat) __SYSCALL(nr, compat) + #undef __SYSCALL #define __SYSCALL(nr, sym) asmlinkage long __arm64_##sym(const struct pt_regs *); -#include <asm/unistd32.h> +#include <asm/syscall_table_32.h> #undef __SYSCALL #define __SYSCALL(nr, sym) [nr] = __arm64_##sym, -const syscall_fn_t compat_sys_call_table[__NR_compat_syscalls] = { - [0 ... __NR_compat_syscalls - 1] = __arm64_sys_ni_syscall, -#include <asm/unistd32.h> +const syscall_fn_t compat_sys_call_table[__NR_compat32_syscalls] = { + [0 ... __NR_compat32_syscalls - 1] = __arm64_sys_ni_syscall, +#include <asm/syscall_table_32.h> }; diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index df14336c3a29..4a609e9b65de 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -31,7 +31,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end) if (fatal_signal_pending(current)) return 0; - if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) { /* * The workaround requires an inner-shareable tlbi. * We pick the reserved-ASID to minimise the impact. diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 9a70d9746b66..c062badd1a56 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -14,20 +14,18 @@ #include <asm/syscall.h> #include <asm/thread_info.h> #include <asm/unistd.h> +#include <asm/unistd_compat_32.h> long compat_arm_syscall(struct pt_regs *regs, int scno); long sys_ni_syscall(void); static long do_ni_syscall(struct pt_regs *regs, int scno) { -#ifdef CONFIG_COMPAT - long ret; if (is_compat_task()) { - ret = compat_arm_syscall(regs, scno); + long ret = compat_arm_syscall(regs, scno); if (ret != -ENOSYS) return ret; } -#endif return sys_ni_syscall(); } @@ -45,7 +43,7 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, add_random_kstack_offset(); - if (scno < sc_nr) { + if (likely(scno < sc_nr)) { syscall_fn_t syscall_fn; syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; ret = __invoke_syscall(regs, syscall_fn); @@ -56,17 +54,15 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, syscall_set_return_value(current, regs, 0, ret); /* - * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), - * but not enough for arm64 stack utilization comfort. To keep - * reasonable stack head room, reduce the maximum offset to 9 bits. - * - * The actual entropy will be further reduced by the compiler when - * applying stack alignment constraints: the AAPCS mandates a - * 16-byte (i.e. 4-bit) aligned SP at function boundaries. + * This value will get limited by KSTACK_OFFSET_MAX(), which is 10 + * bits. The actual entropy will be further reduced by the compiler + * when applying stack alignment constraints: the AAPCS mandates a + * 16-byte aligned SP at function boundaries, which will remove the + * 4 low bits from any entropy chosen here. * - * The resulting 5 bits of entropy is seen in SP[8:4]. + * The resulting 6 bits of entropy is seen in SP[9:4]. */ - choose_random_kstack_offset(get_random_u16() & 0x1FF); + choose_random_kstack_offset(get_random_u16()); } static inline bool has_syscall_work(unsigned long flags) @@ -100,7 +96,7 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, * (Similarly for HVC and SMC elsewhere.) */ - if (flags & _TIF_MTE_ASYNC_FAULT) { + if (unlikely(flags & _TIF_MTE_ASYNC_FAULT)) { /* * Process the asynchronous tag check fault before the actual * syscall. do_notify_resume() will send a signal to userspace @@ -158,7 +154,7 @@ void do_el0_svc(struct pt_regs *regs) #ifdef CONFIG_COMPAT void do_el0_svc_compat(struct pt_regs *regs) { - el0_svc_common(regs, regs->regs[7], __NR_compat_syscalls, + el0_svc_common(regs, regs->regs[7], __NR_compat32_syscalls, compat_sys_call_table); } #endif diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 817d788cd866..5d24dc53799b 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -15,62 +15,16 @@ #include <linux/arch_topology.h> #include <linux/cacheinfo.h> #include <linux/cpufreq.h> +#include <linux/cpu_smt.h> #include <linux/init.h> #include <linux/percpu.h> +#include <linux/sched/isolation.h> +#include <linux/xarray.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/topology.h> -#ifdef CONFIG_ACPI -static bool __init acpi_cpu_is_threaded(int cpu) -{ - int is_threaded = acpi_pptt_cpu_is_thread(cpu); - - /* - * if the PPTT doesn't have thread information, assume a homogeneous - * machine and return the current CPU's thread state. - */ - if (is_threaded < 0) - is_threaded = read_cpuid_mpidr() & MPIDR_MT_BITMASK; - - return !!is_threaded; -} - -/* - * Propagate the topology information of the processor_topology_node tree to the - * cpu_topology array. - */ -int __init parse_acpi_topology(void) -{ - int cpu, topology_id; - - if (acpi_disabled) - return 0; - - for_each_possible_cpu(cpu) { - topology_id = find_acpi_cpu_topology(cpu, 0); - if (topology_id < 0) - return topology_id; - - if (acpi_cpu_is_threaded(cpu)) { - cpu_topology[cpu].thread_id = topology_id; - topology_id = find_acpi_cpu_topology(cpu, 1); - cpu_topology[cpu].core_id = topology_id; - } else { - cpu_topology[cpu].thread_id = -1; - cpu_topology[cpu].core_id = topology_id; - } - topology_id = find_acpi_cpu_topology_cluster(cpu); - cpu_topology[cpu].cluster_id = topology_id; - topology_id = find_acpi_cpu_topology_package(cpu); - cpu_topology[cpu].package_id = topology_id; - } - - return 0; -} -#endif - #ifdef CONFIG_ARM64_AMU_EXTN #define read_corecnt() read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0) #define read_constcnt() read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0) @@ -82,19 +36,34 @@ int __init parse_acpi_topology(void) #undef pr_fmt #define pr_fmt(fmt) "AMU: " fmt -static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale); -static DEFINE_PER_CPU(u64, arch_const_cycles_prev); -static DEFINE_PER_CPU(u64, arch_core_cycles_prev); +/* + * Ensure that amu_scale_freq_tick() will return SCHED_CAPACITY_SCALE until + * the CPU capacity and its associated frequency have been correctly + * initialized. + */ +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT); static cpumask_var_t amu_fie_cpus; +struct amu_cntr_sample { + u64 arch_const_cycles_prev; + u64 arch_core_cycles_prev; + unsigned long last_scale_update; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples); + void update_freq_counters_refs(void) { - this_cpu_write(arch_core_cycles_prev, read_corecnt()); - this_cpu_write(arch_const_cycles_prev, read_constcnt()); + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); + + amu_sample->arch_core_cycles_prev = read_corecnt(); + amu_sample->arch_const_cycles_prev = read_constcnt(); } static inline bool freq_counters_valid(int cpu) { + struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) return false; @@ -103,8 +72,8 @@ static inline bool freq_counters_valid(int cpu) return false; } - if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) || - !per_cpu(arch_core_cycles_prev, cpu))) { + if (unlikely(!amu_sample->arch_const_cycles_prev || + !amu_sample->arch_core_cycles_prev)) { pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); return false; } @@ -112,14 +81,14 @@ static inline bool freq_counters_valid(int cpu) return true; } -static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) +void freq_inv_set_max_ratio(int cpu, u64 max_rate) { - u64 ratio; + u64 ratio, ref_rate = arch_timer_get_rate(); if (unlikely(!max_rate || !ref_rate)) { - pr_debug("CPU%d: invalid maximum or reference frequency.\n", + WARN_ONCE(1, "CPU%d: invalid maximum or reference frequency.\n", cpu); - return -EINVAL; + return; } /* @@ -139,27 +108,30 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) ratio = div64_u64(ratio, max_rate); if (!ratio) { WARN_ONCE(1, "Reference frequency too low.\n"); - return -EINVAL; + return; } - per_cpu(arch_max_freq_scale, cpu) = (unsigned long)ratio; - - return 0; + WRITE_ONCE(per_cpu(arch_max_freq_scale, cpu), (unsigned long)ratio); } static void amu_scale_freq_tick(void) { + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); u64 prev_core_cnt, prev_const_cnt; u64 core_cnt, const_cnt, scale; - prev_const_cnt = this_cpu_read(arch_const_cycles_prev); - prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + prev_const_cnt = amu_sample->arch_const_cycles_prev; + prev_core_cnt = amu_sample->arch_core_cycles_prev; update_freq_counters_refs(); - const_cnt = this_cpu_read(arch_const_cycles_prev); - core_cnt = this_cpu_read(arch_core_cycles_prev); + const_cnt = amu_sample->arch_const_cycles_prev; + core_cnt = amu_sample->arch_core_cycles_prev; + /* + * This should not happen unless the AMUs have been reset and the + * counter values have not been restored - unlikely + */ if (unlikely(core_cnt <= prev_core_cnt || const_cnt <= prev_const_cnt)) return; @@ -179,6 +151,8 @@ static void amu_scale_freq_tick(void) scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); this_cpu_write(arch_freq_scale, (unsigned long)scale); + + amu_sample->last_scale_update = jiffies; } static struct scale_freq_data amu_sfd = { @@ -186,20 +160,114 @@ static struct scale_freq_data amu_sfd = { .set_freq_scale = amu_scale_freq_tick, }; +static __always_inline bool amu_fie_cpu_supported(unsigned int cpu) +{ + return cpumask_available(amu_fie_cpus) && + cpumask_test_cpu(cpu, amu_fie_cpus); +} + +void arch_cpu_idle_enter(void) +{ + unsigned int cpu = smp_processor_id(); + + if (!amu_fie_cpu_supported(cpu)) + return; + + /* Kick in AMU update but only if one has not happened already */ + if (housekeeping_cpu(cpu, HK_TYPE_TICK) && + time_is_before_jiffies(per_cpu(cpu_amu_samples.last_scale_update, cpu))) + amu_scale_freq_tick(); +} + +#define AMU_SAMPLE_EXP_MS 20 + +int arch_freq_get_on_cpu(int cpu) +{ + struct amu_cntr_sample *amu_sample; + unsigned int start_cpu = cpu; + unsigned long last_update; + unsigned int freq = 0; + u64 scale; + + if (!amu_fie_cpu_supported(cpu) || !arch_scale_freq_ref(cpu)) + return -EOPNOTSUPP; + + while (1) { + + amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + + last_update = amu_sample->last_scale_update; + + /* + * For those CPUs that are in full dynticks mode, or those that have + * not seen tick for a while, try an alternative source for the counters + * (and thus freq scale), if available, for given policy: this boils + * down to identifying an active cpu within the same freq domain, if any. + */ + if (!housekeeping_cpu(cpu, HK_TYPE_TICK) || + time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) { + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + int ref_cpu; + + if (!policy) + return -EINVAL; + + if (!cpumask_intersects(policy->related_cpus, + housekeeping_cpumask(HK_TYPE_TICK))) { + cpufreq_cpu_put(policy); + return -EOPNOTSUPP; + } + + for_each_cpu_wrap(ref_cpu, policy->cpus, cpu + 1) { + if (ref_cpu == start_cpu) { + /* Prevent verifying same CPU twice */ + ref_cpu = nr_cpu_ids; + break; + } + if (!idle_cpu(ref_cpu)) + break; + } + + cpufreq_cpu_put(policy); + + if (ref_cpu >= nr_cpu_ids) + /* No alternative to pull info from */ + return -EAGAIN; + + cpu = ref_cpu; + } else { + break; + } + } + /* + * Reversed computation to the one used to determine + * the arch_freq_scale value + * (see amu_scale_freq_tick for details) + */ + scale = arch_scale_freq_capacity(cpu); + freq = scale * arch_scale_freq_ref(cpu); + freq >>= SCHED_CAPACITY_SHIFT; + return freq; +} + static void amu_fie_setup(const struct cpumask *cpus) { int cpu; /* We are already set since the last insmod of cpufreq driver */ - if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + if (cpumask_available(amu_fie_cpus) && + unlikely(cpumask_subset(cpus, amu_fie_cpus))) return; - for_each_cpu(cpu, cpus) { - if (!freq_counters_valid(cpu) || - freq_inv_set_max_ratio(cpu, - cpufreq_get_hw_max_freq(cpu) * 1000ULL, - arch_timer_get_rate())) + for_each_cpu(cpu, cpus) + if (!freq_counters_valid(cpu)) return; + + if (!cpumask_available(amu_fie_cpus) && + !zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { + WARN_ONCE(1, "Failed to allocate FIE cpumask for CPUs[%*pbl]\n", + cpumask_pr_args(cpus)); + return; } cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); @@ -237,17 +305,8 @@ static struct notifier_block init_amu_fie_notifier = { static int __init init_amu_fie(void) { - int ret; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) - return -ENOMEM; - - ret = cpufreq_register_notifier(&init_amu_fie_notifier, + return cpufreq_register_notifier(&init_amu_fie_notifier, CPUFREQ_POLICY_NOTIFIER); - if (ret) - free_cpumask_var(amu_fie_cpus); - - return ret; } core_initcall(init_amu_fie); diff --git a/arch/arm64/kernel/trace-events-emulation.h b/arch/arm64/kernel/trace-events-emulation.h index 6c40f58b844a..c51b547b583e 100644 --- a/arch/arm64/kernel/trace-events-emulation.h +++ b/arch/arm64/kernel/trace-events-emulation.h @@ -18,7 +18,7 @@ TRACE_EVENT(instruction_emulation, ), TP_fast_assign( - __assign_str(instr, instr); + __assign_str(instr); __entry->addr = addr; ), diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8b70759cdbb9..914282016069 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -41,7 +41,7 @@ #include <asm/extable.h> #include <asm/insn.h> #include <asm/kprobes.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/traps.h> #include <asm/smp.h> #include <asm/stack_pointer.h> @@ -149,19 +149,18 @@ pstate_check_t * const aarch32_opcode_cond_checks[16] = { int show_unhandled_signals = 0; -static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) +void dump_kernel_instr(unsigned long kaddr) { - unsigned long addr = instruction_pointer(regs); char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; - if (user_mode(regs)) + if (!is_ttbr1_addr(kaddr)) return; for (i = -4; i < 1; i++) { unsigned int val, bad; - bad = aarch64_insn_read(&((u32 *)addr)[i], &val); + bad = aarch64_insn_read(&((u32 *)kaddr)[i], &val); if (!bad) p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); @@ -169,25 +168,18 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) p += sprintf(p, i == 0 ? "(????????) " : "???????? "); } - printk("%sCode: %s\n", lvl, str); + printk(KERN_EMERG "Code: %s\n", str); } -#ifdef CONFIG_PREEMPT -#define S_PREEMPT " PREEMPT" -#elif defined(CONFIG_PREEMPT_RT) -#define S_PREEMPT " PREEMPT_RT" -#else -#define S_PREEMPT "" -#endif - #define S_SMP " SMP" static int __die(const char *str, long err, struct pt_regs *regs) { static int die_counter; int ret; + unsigned long addr = instruction_pointer(regs); - pr_emerg("Internal error: %s: %016lx [#%d]" S_PREEMPT S_SMP "\n", + pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n", str, err, ++die_counter); /* trap and error numbers are mostly meaningless on ARM */ @@ -198,7 +190,10 @@ static int __die(const char *str, long err, struct pt_regs *regs) print_modules(); show_regs(regs); - dump_kernel_instr(KERN_EMERG, regs); + if (user_mode(regs)) + return ret; + + dump_kernel_instr(addr); return ret; } @@ -273,6 +268,12 @@ void arm64_force_sig_fault(int signo, int code, unsigned long far, force_sig_fault(signo, code, (void __user *)far); } +void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey) +{ + arm64_show_signal(SIGSEGV, str); + force_sig_pkuerr((void __user *)far, pkey); +} + void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { @@ -456,7 +457,7 @@ void do_el0_undef(struct pt_regs *regs, unsigned long esr) u32 insn; /* check for AArch32 breakpoint instructions */ - if (!aarch32_break_handler(regs)) + if (try_handle_aarch32_break(regs)) return; if (user_insn_read(regs, &insn)) @@ -500,6 +501,16 @@ void do_el1_bti(struct pt_regs *regs, unsigned long esr) die("Oops - BTI", regs, esr); } +void do_el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0); +} + +void do_el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + die("Oops - GCS", regs, esr); +} + void do_el0_fpac(struct pt_regs *regs, unsigned long esr) { force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); @@ -516,53 +527,7 @@ void do_el1_fpac(struct pt_regs *regs, unsigned long esr) void do_el0_mops(struct pt_regs *regs, unsigned long esr) { - bool wrong_option = esr & ESR_ELx_MOPS_ISS_WRONG_OPTION; - bool option_a = esr & ESR_ELx_MOPS_ISS_OPTION_A; - int dstreg = ESR_ELx_MOPS_ISS_DESTREG(esr); - int srcreg = ESR_ELx_MOPS_ISS_SRCREG(esr); - int sizereg = ESR_ELx_MOPS_ISS_SIZEREG(esr); - unsigned long dst, src, size; - - dst = pt_regs_read_reg(regs, dstreg); - src = pt_regs_read_reg(regs, srcreg); - size = pt_regs_read_reg(regs, sizereg); - - /* - * Put the registers back in the original format suitable for a - * prologue instruction, using the generic return routine from the - * Arm ARM (DDI 0487I.a) rules CNTMJ and MWFQH. - */ - if (esr & ESR_ELx_MOPS_ISS_MEM_INST) { - /* SET* instruction */ - if (option_a ^ wrong_option) { - /* Format is from Option A; forward set */ - pt_regs_write_reg(regs, dstreg, dst + size); - pt_regs_write_reg(regs, sizereg, -size); - } - } else { - /* CPY* instruction */ - if (!(option_a ^ wrong_option)) { - /* Format is from Option B */ - if (regs->pstate & PSR_N_BIT) { - /* Backward copy */ - pt_regs_write_reg(regs, dstreg, dst - size); - pt_regs_write_reg(regs, srcreg, src - size); - } - } else { - /* Format is from Option A */ - if (size & BIT(63)) { - /* Forward copy */ - pt_regs_write_reg(regs, dstreg, dst + size); - pt_regs_write_reg(regs, srcreg, src + size); - pt_regs_write_reg(regs, sizereg, -size); - } - } - } - - if (esr & ESR_ELx_MOPS_ISS_FROM_EPILOGUE) - regs->pc -= 8; - else - regs->pc -= 4; + arm64_mops_reset_regs(®s->user_regs, esr); /* * If single stepping then finish the step before executing the @@ -571,6 +536,13 @@ void do_el0_mops(struct pt_regs *regs, unsigned long esr) user_fastforward_single_step(current); } +void do_el1_mops(struct pt_regs *regs, unsigned long esr) +{ + arm64_mops_reset_regs(®s->user_regs, esr); + + kernel_fastforward_single_step(regs); +} + #define __user_cache_maint(insn, address, res) \ if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ @@ -631,7 +603,7 @@ static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) int rt = ESR_ELx_SYS64_ISS_RT(esr); unsigned long val = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0); - if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) { + if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) { /* Hide DIC so that we can trap the unnecessary maintenance...*/ val &= ~BIT(CTR_EL0_DIC_SHIFT); @@ -647,18 +619,26 @@ static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_read_counter()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_read_counter()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_get_rate()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_get_rate()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } static void mrs_handler(unsigned long esr, struct pt_regs *regs) @@ -884,6 +864,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_MOPS] = "MOPS", [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)", [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)", + [ESR_ELx_EC_GCS] = "Guarded Control Stack", [ESR_ELx_EC_SERROR] = "SError", [ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)", [ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)", @@ -916,8 +897,6 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr) "Bad EL0 synchronous exception"); } -#ifdef CONFIG_VMAP_STACK - DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); @@ -943,16 +922,16 @@ void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigne __show_regs(regs); /* - * We use nmi_panic to limit the potential for recusive overflows, and + * We use nmi_panic to limit the potential for recursive overflows, and * to get a better stack trace. */ nmi_panic(NULL, "kernel stack overflow"); cpu_park_loop(); } -#endif void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr) { + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK); console_verbose(); pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n", @@ -1009,7 +988,7 @@ void do_serror(struct pt_regs *regs, unsigned long esr) int is_valid_bugaddr(unsigned long addr) { /* - * bug_handler() only called for BRK #BUG_BRK_IMM. + * bug_brk_handler() only called for BRK #BUG_BRK_IMM. * So the answer is trivial -- any spurious instances with no * bug table entry will be rejected by report_bug() and passed * back to the debug-monitors code and handled as a fatal @@ -1019,7 +998,7 @@ int is_valid_bugaddr(unsigned long addr) } #endif -static int bug_handler(struct pt_regs *regs, unsigned long esr) +int bug_brk_handler(struct pt_regs *regs, unsigned long esr) { switch (report_bug(regs->pc, regs)) { case BUG_TRAP_TYPE_BUG: @@ -1039,13 +1018,8 @@ static int bug_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_HANDLED; } -static struct break_hook bug_break_hook = { - .fn = bug_handler, - .imm = BUG_BRK_IMM, -}; - -#ifdef CONFIG_CFI_CLANG -static int cfi_handler(struct pt_regs *regs, unsigned long esr) +#ifdef CONFIG_CFI +int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) { unsigned long target; u32 type; @@ -1068,15 +1042,9 @@ static int cfi_handler(struct pt_regs *regs, unsigned long esr) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } +#endif /* CONFIG_CFI */ -static struct break_hook cfi_break_hook = { - .fn = cfi_handler, - .imm = CFI_BRK_IMM_BASE, - .mask = CFI_BRK_IMM_MASK, -}; -#endif /* CONFIG_CFI_CLANG */ - -static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr) +int reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr) { pr_err("%s generated an invalid instruction at %pS!\n", "Kernel text patching", @@ -1086,11 +1054,6 @@ static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_ERROR; } -static struct break_hook fault_break_hook = { - .fn = reserved_fault_handler, - .imm = FAULT_BRK_IMM, -}; - #ifdef CONFIG_KASAN_SW_TAGS #define KASAN_ESR_RECOVER 0x20 @@ -1098,7 +1061,7 @@ static struct break_hook fault_break_hook = { #define KASAN_ESR_SIZE_MASK 0x0f #define KASAN_ESR_SIZE(esr) (1 << ((esr) & KASAN_ESR_SIZE_MASK)) -static int kasan_handler(struct pt_regs *regs, unsigned long esr) +int kasan_brk_handler(struct pt_regs *regs, unsigned long esr) { bool recover = esr & KASAN_ESR_RECOVER; bool write = esr & KASAN_ESR_WRITE; @@ -1129,64 +1092,12 @@ static int kasan_handler(struct pt_regs *regs, unsigned long esr) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } - -static struct break_hook kasan_break_hook = { - .fn = kasan_handler, - .imm = KASAN_BRK_IMM, - .mask = KASAN_BRK_MASK, -}; #endif #ifdef CONFIG_UBSAN_TRAP -static int ubsan_handler(struct pt_regs *regs, unsigned long esr) +int ubsan_brk_handler(struct pt_regs *regs, unsigned long esr) { - die(report_ubsan_failure(regs, esr & UBSAN_BRK_MASK), regs, esr); + die(report_ubsan_failure(esr & UBSAN_BRK_MASK), regs, esr); return DBG_HOOK_HANDLED; } - -static struct break_hook ubsan_break_hook = { - .fn = ubsan_handler, - .imm = UBSAN_BRK_IMM, - .mask = UBSAN_BRK_MASK, -}; -#endif - -#define esr_comment(esr) ((esr) & ESR_ELx_BRK64_ISS_COMMENT_MASK) - -/* - * Initial handler for AArch64 BRK exceptions - * This handler only used until debug_traps_init(). - */ -int __init early_brk64(unsigned long addr, unsigned long esr, - struct pt_regs *regs) -{ -#ifdef CONFIG_CFI_CLANG - if ((esr_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE) - return cfi_handler(regs, esr) != DBG_HOOK_HANDLED; -#endif -#ifdef CONFIG_KASAN_SW_TAGS - if ((esr_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) - return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; -#endif -#ifdef CONFIG_UBSAN_TRAP - if ((esr_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM) - return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED; -#endif - return bug_handler(regs, esr) != DBG_HOOK_HANDLED; -} - -void __init trap_init(void) -{ - register_kernel_break_hook(&bug_break_hook); -#ifdef CONFIG_CFI_CLANG - register_kernel_break_hook(&cfi_break_hook); #endif - register_kernel_break_hook(&fault_break_hook); -#ifdef CONFIG_KASAN_SW_TAGS - register_kernel_break_hook(&kasan_break_hook); -#endif -#ifdef CONFIG_UBSAN_TRAP - register_kernel_break_hook(&ubsan_break_hook); -#endif - debug_traps_init(); -} diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index d9e1355730ef..78ddf6bdecad 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -18,8 +18,7 @@ #include <linux/sched.h> #include <linux/signal.h> #include <linux/slab.h> -#include <linux/time_namespace.h> -#include <linux/timekeeper_internal.h> +#include <linux/vdso_datastore.h> #include <linux/vmalloc.h> #include <vdso/datapage.h> #include <vdso/helpers.h> @@ -34,19 +33,11 @@ enum vdso_abi { VDSO_ABI_AA32, }; -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - struct vdso_abi_info { const char *name; const char *vdso_code_start; const char *vdso_code_end; unsigned long vdso_pages; - /* Data Mapping */ - struct vm_special_mapping *dm; /* Code Mapping */ struct vm_special_mapping *cm; }; @@ -66,15 +57,6 @@ static struct vdso_abi_info vdso_info[] __ro_after_init = { #endif /* CONFIG_COMPAT_VDSO */ }; -/* - * The vDSO data page. - */ -static union { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { @@ -116,75 +98,6 @@ static int __init __vdso_init(enum vdso_abi abi) return 0; } -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -/* - * The vvar mapping contains data for a specific time namespace, so when a task - * changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) - zap_vma_pages(vma); -#ifdef CONFIG_COMPAT_VDSO - if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm)) - zap_vma_pages(vma); -#endif - } - - mmap_read_unlock(mm); - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - if (timens_page) - pfn = page_to_pfn(timens_page); - else - pfn = sym_to_pfn(vdso_data); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = sym_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -194,11 +107,11 @@ static int __setup_additional_pages(enum vdso_abi abi, unsigned long gp_flags = 0; void *ret; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VDSO_NR_PAGES * PAGE_SIZE; vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { @@ -206,20 +119,19 @@ static int __setup_additional_pages(enum vdso_abi abi, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_PFNMAP, - vdso_info[abi].dm); + ret = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(ret)) goto up_fail; - if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti()) + if (system_supports_bti_kernel()) gp_flags = VM_ARM64_BTI; - vdso_base += VVAR_NR_PAGES * PAGE_SIZE; + vdso_base += VDSO_NR_PAGES * PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_SEALED_SYSMAP, vdso_info[abi].cm); if (IS_ERR(ret)) goto up_fail; @@ -238,7 +150,6 @@ up_fail: enum aarch32_map { AA32_MAP_VECTORS, /* kuser helpers */ AA32_MAP_SIGPAGE, - AA32_MAP_VVAR, AA32_MAP_VDSO, }; @@ -263,10 +174,6 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { .pages = &aarch32_sig_page, .mremap = aarch32_sigpage_mremap, }, - [AA32_MAP_VVAR] = { - .name = "[vvar]", - .fault = vvar_fault, - }, [AA32_MAP_VDSO] = { .name = "[vdso]", .mremap = vdso_mremap, @@ -316,7 +223,6 @@ static int __init __aarch32_alloc_vdso_pages(void) if (!IS_ENABLED(CONFIG_COMPAT_VDSO)) return 0; - vdso_info[VDSO_ABI_AA32].dm = &aarch32_vdso_maps[AA32_MAP_VVAR]; vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO]; return __vdso_init(VDSO_ABI_AA32); @@ -351,7 +257,8 @@ static int aarch32_kuser_helpers_setup(struct mm_struct *mm) */ ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYEXEC, + VM_MAYREAD | VM_MAYEXEC | + VM_SEALED_SYSMAP, &aarch32_vdso_maps[AA32_MAP_VECTORS]); return PTR_ERR_OR_ZERO(ret); @@ -374,7 +281,8 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) */ ret = _install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | - VM_MAYWRITE | VM_MAYEXEC, + VM_MAYWRITE | VM_MAYEXEC | + VM_SEALED_SYSMAP, &aarch32_vdso_maps[AA32_MAP_SIGPAGE]); if (IS_ERR(ret)) goto out; @@ -411,26 +319,14 @@ out: } #endif /* CONFIG_COMPAT */ -enum aarch64_map { - AA64_MAP_VVAR, - AA64_MAP_VDSO, -}; - -static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { - [AA64_MAP_VVAR] = { - .name = "[vvar]", - .fault = vvar_fault, - }, - [AA64_MAP_VDSO] = { - .name = "[vdso]", - .mremap = vdso_mremap, - }, +static struct vm_special_mapping aarch64_vdso_map __ro_after_init = { + .name = "[vdso]", + .mremap = vdso_mremap, }; static int __init vdso_init(void) { - vdso_info[VDSO_ABI_AA64].dm = &aarch64_vdso_maps[AA64_MAP_VVAR]; - vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_maps[AA64_MAP_VDSO]; + vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_map; return __vdso_init(VDSO_ABI_AA64); } diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index fe7a53c6781f..7dec05dd33b7 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -7,9 +7,9 @@ # # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include -obj-vdso := vgettimeofday.o note.o sigreturn.o +obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg @@ -21,7 +21,7 @@ btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti # potential future proofing if we end up with internal calls to the exported # routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so # preparation in build-time C")). -ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \ +ldflags-y := -shared -soname=linux-vdso.so.1 \ -Bsymbolic --build-id=sha1 -n $(btildflags-y) ifdef CONFIG_LD_ORPHAN_WARN @@ -34,26 +34,28 @@ ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO # -Wmissing-prototypes and -Wmissing-declarations are removed from -# the CFLAGS of vgettimeofday.c to make possible to build the -# kernel with CONFIG_WERROR enabled. -CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ - $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \ - $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ - -Wmissing-prototypes -Wmissing-declarations -KASAN_SANITIZE := n -KCSAN_SANITIZE := n -UBSAN_SANITIZE := n -OBJECT_FILES_NON_STANDARD := y -KCOV_INSTRUMENT := n - -CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny -fasynchronous-unwind-tables +# the CFLAGS to make possible to build the kernel with CONFIG_WERROR enabled. +CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ + $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) \ + $(GCC_PLUGINS_CFLAGS) \ + $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ + -Wmissing-prototypes -Wmissing-declarations + +CC_FLAGS_ADD_VDSO := -O2 -mcmodel=tiny -fasynchronous-unwind-tables + +CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_REMOVE_VDSO) +CFLAGS_REMOVE_vgetrandom.o = $(CC_FLAGS_REMOVE_VDSO) + +CFLAGS_vgettimeofday.o = $(CC_FLAGS_ADD_VDSO) +CFLAGS_vgetrandom.o = $(CC_FLAGS_ADD_VDSO) ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y) endif -# Disable gcov profiling for VDSO code -GCOV_PROFILE := n +ifneq ($(c-getrandom-y),) + CFLAGS_vgetrandom.o += -include $(c-getrandom-y) +endif targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) @@ -68,7 +70,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) # Generate VDSO offsets using helper script -gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +gen-vdsosym := $(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ @@ -78,13 +80,3 @@ include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE # Actual build commands quiet_cmd_vdsold_and_vdso_check = LD $@ cmd_vdsold_and_vdso_check = $(cmd_ld); $(cmd_vdso_check) - -# Install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso.so: $(obj)/vdso.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso.so diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 45354f2ddf70..52314be29191 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -11,18 +11,18 @@ #include <linux/const.h> #include <asm/page.h> #include <asm/vdso.h> +#include <asm/vdso/vsyscall.h> #include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") OUTPUT_ARCH(aarch64) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif - . = VDSO_LBASE + SIZEOF_HEADERS; + VDSO_VVAR_SYMS + + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -38,6 +38,7 @@ SECTIONS */ /DISCARD/ : { *(.note.GNU-stack .note.gnu.property) + *(.ARM.attributes) } .note : { *(.note.*) } :text :note @@ -102,6 +103,7 @@ VERSION __kernel_gettimeofday; __kernel_clock_gettime; __kernel_clock_getres; + __kernel_getrandom; local: *; }; } diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..67890b445309 --- /dev/null +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/linkage.h> +#include <asm/cache.h> +#include <asm/assembler.h> + + .text + +#define state0 v0 +#define state1 v1 +#define state2 v2 +#define state3 v3 +#define copy0 v4 +#define copy0_q q4 +#define copy1 v5 +#define copy2 v6 +#define copy3 v7 +#define copy3_d d7 +#define one_d d16 +#define one_q q16 +#define one_v v16 +#define tmp v17 +#define rot8 v18 + +/* + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive + * number of blocks of output with nonce 0, taking an input key and 8-bytes + * counter. Importantly does not spill to the stack. + * + * This implementation avoids d8-d15 because they are callee-save in user + * space. + * + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, + * const uint8_t *key, + * uint32_t *counter, + * size_t nblocks) + * + * x0: output bytes + * x1: 32-byte key input + * x2: 8-byte counter input/output + * x3: number of 64-byte block to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + + /* copy0 = "expand 32-byte k" */ + mov_q x8, 0x3320646e61707865 + mov_q x9, 0x6b20657479622d32 + mov copy0.d[0], x8 + mov copy0.d[1], x9 + + /* copy1,copy2 = key */ + ld1 { copy1.4s, copy2.4s }, [x1] + /* copy3 = counter || zero nonce */ + ld1 { copy3.2s }, [x2] + + movi one_v.2s, #1 + uzp1 one_v.4s, one_v.4s, one_v.4s + +.Lblock: + /* copy state to auxiliary vectors for the final add after the permute. */ + mov state0.16b, copy0.16b + mov state1.16b, copy1.16b + mov state2.16b, copy2.16b + mov state3.16b, copy3.16b + + mov w4, 20 +.Lpermute: + /* + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers state0-state3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + */ + +.Ldoubleround: + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + add state0.4s, state0.4s, state1.4s + eor state3.16b, state3.16b, state0.16b + rev32 state3.8h, state3.8h + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #12 + sri state1.4s, tmp.4s, #20 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + add state0.4s, state0.4s, state1.4s + eor tmp.16b, state3.16b, state0.16b + shl state3.4s, tmp.4s, #8 + sri state3.4s, tmp.4s, #24 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #7 + sri state1.4s, tmp.4s, #25 + + /* state1[0,1,2,3] = state1[1,2,3,0] */ + ext state1.16b, state1.16b, state1.16b, #4 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + ext state2.16b, state2.16b, state2.16b, #8 + /* state3[0,1,2,3] = state3[1,2,3,0] */ + ext state3.16b, state3.16b, state3.16b, #12 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + add state0.4s, state0.4s, state1.4s + eor state3.16b, state3.16b, state0.16b + rev32 state3.8h, state3.8h + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #12 + sri state1.4s, tmp.4s, #20 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + add state0.4s, state0.4s, state1.4s + eor tmp.16b, state3.16b, state0.16b + shl state3.4s, tmp.4s, #8 + sri state3.4s, tmp.4s, #24 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #7 + sri state1.4s, tmp.4s, #25 + + /* state1[0,1,2,3] = state1[3,0,1,2] */ + ext state1.16b, state1.16b, state1.16b, #12 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + ext state2.16b, state2.16b, state2.16b, #8 + /* state3[0,1,2,3] = state3[1,2,3,0] */ + ext state3.16b, state3.16b, state3.16b, #4 + + subs w4, w4, #2 + b.ne .Ldoubleround + + /* output0 = state0 + state0 */ + add state0.4s, state0.4s, copy0.4s + /* output1 = state1 + state1 */ + add state1.4s, state1.4s, copy1.4s + /* output2 = state2 + state2 */ + add state2.4s, state2.4s, copy2.4s + /* output2 = state3 + state3 */ + add state3.4s, state3.4s, copy3.4s + st1 { state0.16b - state3.16b }, [x0] + + /* + * ++copy3.counter, the 'add' clears the upper half of the SIMD register + * which is the expected behaviour here. + */ + add copy3_d, copy3_d, one_d + + /* output += 64, --nblocks */ + add x0, x0, 64 + subs x3, x3, #1 + b.ne .Lblock + + /* counter = copy3.counter */ + st1 { copy3.2s }, [x2] + + /* Zero out the potentially sensitive regs, in case nothing uses these again. */ + movi state0.16b, #0 + movi state1.16b, #0 + movi state2.16b, #0 + movi state3.16b, #0 + movi copy1.16b, #0 + movi copy2.16b, #0 + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/vgetrandom.c b/arch/arm64/kernel/vdso/vgetrandom.c new file mode 100644 index 000000000000..832fe195292b --- /dev/null +++ b/arch/arm64/kernel/vdso/vgetrandom.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <uapi/asm-generic/errno.h> + +typeof(__cvdso_getrandom) __kernel_getrandom; + +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + if (alternative_has_cap_likely(ARM64_HAS_FPSIMD)) + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); + + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) + return -ENOSYS; + return getrandom_syscall(buffer, len, flags); +} diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 2f73e5bca213..9d0efed91414 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -3,7 +3,7 @@ # Makefile for vdso32 # -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Same as cc-*option, but using CC_COMPAT instead of CC ifeq ($(CONFIG_CC_IS_CLANG), y) @@ -21,8 +21,6 @@ endif cc32-option = $(call try-run,\ $(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) -cc32-disable-warning = $(call try-run,\ - $(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) # We cannot use the global flags to compile the vDSO files, the main reason # being that the 32-bit compiler may be older than the main (64-bit) compiler @@ -59,13 +57,13 @@ VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING VDSO_CAFLAGS += -march=armv8-a VDSO_CFLAGS := $(VDSO_CAFLAGS) -VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 # KBUILD_CFLAGS from top-level Makefile VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ + $(filter -Werror,$(KBUILD_CPPFLAGS)) \ -Werror-implicit-function-declaration \ -Wno-format-security \ - -std=gnu11 + -std=gnu11 -fms-extensions VDSO_CFLAGS += -O2 # Some useful compiler-dependent flags from top-level Makefile VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign) @@ -73,16 +71,7 @@ VDSO_CFLAGS += -fno-strict-overflow VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) VDSO_CFLAGS += -Werror=date-time VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) - -# The 32-bit compiler does not provide 128-bit integers, which are used in -# some headers that are indirectly included from the vDSO code. -# This hack makes the compiler happy and should trigger a warning/error if -# variables of such type are referenced. -VDSO_CFLAGS += -D__uint128_t='void*' -# Silence some warnings coming from headers that operate on long's -# (on GCC 4.8 or older, there is unfortunately no way to silence this warning) -VDSO_CFLAGS += $(call cc32-disable-warning,shift-count-overflow) -VDSO_CFLAGS += -Wno-int-to-pointer-cast +VDSO_CFLAGS += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag) # Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is # unreliable. @@ -98,7 +87,7 @@ VDSO_AFLAGS += -D__ASSEMBLY__ # From arm vDSO Makefile VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1 VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096 -VDSO_LDFLAGS += -shared --hash-style=sysv --build-id=sha1 +VDSO_LDFLAGS += -shared --build-id=sha1 VDSO_LDFLAGS += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) @@ -118,7 +107,7 @@ endif VDSO_CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os # Build rules -targets := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) vdso.so vdso.so.dbg vdso.so.raw +targets := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) vdso.so vdso32.so.dbg vdso.so.raw c-obj-vdso := $(addprefix $(obj)/, $(c-obj-vdso)) c-obj-vdso-gettimeofday := $(addprefix $(obj)/, $(c-obj-vdso-gettimeofday)) asm-obj-vdso := $(addprefix $(obj)/, $(asm-obj-vdso)) @@ -127,19 +116,16 @@ obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) -include/generated/vdso32-offsets.h: $(obj)/vdso.so.dbg FORCE - $(call if_changed,vdsosym) - # Strip rule for vdso.so $(obj)/vdso.so: OBJCOPYFLAGS := -S -$(obj)/vdso.so: $(obj)/vdso.so.dbg FORCE +$(obj)/vdso.so: $(obj)/vdso32.so.dbg FORCE $(call if_changed,objcopy) -$(obj)/vdso.so.dbg: $(obj)/vdso.so.raw $(obj)/$(munge) FORCE +$(obj)/vdso32.so.dbg: $(obj)/vdso.so.raw $(obj)/$(munge) FORCE $(call if_changed,vdsomunge) # Link rule for the .so file, .lds has to be first -$(obj)/vdso.so.raw: $(src)/vdso.lds $(obj-vdso) FORCE +$(obj)/vdso.so.raw: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold_and_vdso_check) # Compilation rules for the vDSO sources @@ -166,19 +152,3 @@ quiet_cmd_vdsoas = AS32 $@ quiet_cmd_vdsomunge = MUNGE $@ cmd_vdsomunge = $(obj)/$(munge) $< $@ - -# Generate vDSO offsets using helper script (borrowed from the 64-bit vDSO) -gen-vdsosym := $(srctree)/$(src)/../vdso/gen_vdso_offsets.sh -quiet_cmd_vdsosym = VDSOSYM $@ -# The AArch64 nm should be able to read an AArch32 binary - cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ - -# Install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL32 $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/vdso32.so - -vdso.so: $(obj)/vdso.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso.so diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S index 8d95d7d35057..e02b27487ce8 100644 --- a/arch/arm64/kernel/vdso32/vdso.lds.S +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -12,17 +12,16 @@ #include <asm/page.h> #include <asm/vdso.h> #include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") OUTPUT_ARCH(arm) SECTIONS { - PROVIDE_HIDDEN(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE); -#endif - . = VDSO_LBASE + SIZEOF_HEADERS; + VDSO_VVAR_SYMS + + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } diff --git a/arch/arm64/kernel/vdso32/vgettimeofday.c b/arch/arm64/kernel/vdso32/vgettimeofday.c index 5acff29c5991..29b4d8f61e39 100644 --- a/arch/arm64/kernel/vdso32/vgettimeofday.c +++ b/arch/arm64/kernel/vdso32/vgettimeofday.c @@ -5,6 +5,8 @@ * Copyright (C) 2018 ARM Limited * */ +#define BUILD_VDSO32_64 +#include <vdso/gettime.h> int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts) diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/vmcore_info.c index 66cde752cd74..9619ece66b79 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/vmcore_info.c @@ -4,7 +4,7 @@ * Copyright (C) Huawei Futurewei Technologies. */ -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> #include <asm/cpufeature.h> #include <asm/memory.h> #include <asm/pgtable-hwdef.h> @@ -14,7 +14,7 @@ static inline u64 get_tcr_el1_t1sz(void); static inline u64 get_tcr_el1_t1sz(void) { - return (read_sysreg(tcr_el1) & TCR_T1SZ_MASK) >> TCR_T1SZ_OFFSET; + return (read_sysreg(tcr_el1) & TCR_EL1_T1SZ_MASK) >> TCR_EL1_T1SZ_SHIFT; } void arch_crash_save_vmcoreinfo(void) @@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void) /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 3cd7e76cc562..ad6133b89e7a 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -13,7 +13,7 @@ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; -#define HYPERVISOR_DATA_SECTIONS \ +#define HYPERVISOR_RODATA_SECTIONS \ HYP_SECTION_NAME(.rodata) : { \ . = ALIGN(PAGE_SIZE); \ __hyp_rodata_start = .; \ @@ -23,6 +23,15 @@ __hyp_rodata_end = .; \ } +#define HYPERVISOR_DATA_SECTION \ + HYP_SECTION_NAME(.data) : { \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_start = .; \ + *(HYP_SECTION_NAME(.data)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_end = .; \ + } + #define HYPERVISOR_PERCPU_SECTION \ . = ALIGN(PAGE_SIZE); \ HYP_SECTION_NAME(.data..percpu) : { \ @@ -51,7 +60,8 @@ #define SBSS_ALIGN PAGE_SIZE #else /* CONFIG_KVM */ #define HYPERVISOR_EXTABLE -#define HYPERVISOR_DATA_SECTIONS +#define HYPERVISOR_RODATA_SECTIONS +#define HYPERVISOR_DATA_SECTION #define HYPERVISOR_PERCPU_SECTION #define HYPERVISOR_RELOC_SECTION #define SBSS_ALIGN 0 @@ -126,9 +136,9 @@ jiffies = jiffies_64; #ifdef CONFIG_UNWIND_TABLES #define UNWIND_DATA_SECTIONS \ .eh_frame : { \ - __eh_frame_start = .; \ + __pi___eh_frame_start = .; \ *(.eh_frame) \ - __eh_frame_end = .; \ + __pi___eh_frame_end = .; \ } #else #define UNWIND_DATA_SECTIONS @@ -162,6 +172,7 @@ SECTIONS /DISCARD/ : { *(.interp .dynamic) *(.dynsym .dynstr .hash .gnu.hash) + *(.ARM.attributes) } . = KIMAGE_VADDR; @@ -189,7 +200,7 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) - HYPERVISOR_DATA_SECTIONS + HYPERVISOR_RODATA_SECTIONS .got : { *(.got) } /* @@ -248,9 +259,9 @@ SECTIONS __inittext_end = .; __initdata_begin = .; - init_idmap_pg_dir = .; + __pi_init_idmap_pg_dir = .; . += INIT_IDMAP_DIR_SIZE; - init_idmap_pg_end = .; + __pi_init_idmap_pg_end = .; .init.data : { INIT_DATA @@ -264,31 +275,38 @@ SECTIONS EXIT_DATA } + RUNTIME_CONST_VARIABLES + PERCPU_SECTION(L1_CACHE_BYTES) HYPERVISOR_PERCPU_SECTION HYPERVISOR_RELOC_SECTION .rela.dyn : ALIGN(8) { - __rela_start = .; + __pi_rela_start = .; *(.rela .rela*) - __rela_end = .; + __pi_rela_end = .; } .relr.dyn : ALIGN(8) { - __relr_start = .; + __pi_relr_start = .; *(.relr.dyn) - __relr_end = .; + __pi_relr_end = .; } . = ALIGN(SEGMENT_ALIGN); __initdata_end = .; __init_end = .; + .data.rel.ro : { *(.data.rel.ro) } + ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") + _data = .; _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + HYPERVISOR_DATA_SECTION + /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback @@ -311,16 +329,23 @@ SECTIONS __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin); _edata = .; + /* start of zero-init region */ BSS_SECTION(SBSS_ALIGN, 0, 0) + __pi___bss_start = __bss_start; . = ALIGN(PAGE_SIZE); - init_pg_dir = .; + __pi_init_pg_dir = .; . += INIT_DIR_SIZE; - init_pg_end = .; + __pi_init_pg_end = .; + /* end of zero-init region */ + + . += SZ_4K; /* stack for the early C runtime */ + early_init_stack = .; . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; + __pi__end = .; STABS_DEBUG DWARF_DEBUG @@ -336,9 +361,6 @@ SECTIONS *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) } ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") - - .data.rel.ro : { *(.data.rel.ro) } - ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") } #include "image-vars.h" diff --git a/arch/arm64/kernel/watchdog_hld.c b/arch/arm64/kernel/watchdog_hld.c index dcd25322127c..3093037dcb7b 100644 --- a/arch/arm64/kernel/watchdog_hld.c +++ b/arch/arm64/kernel/watchdog_hld.c @@ -34,3 +34,61 @@ bool __init arch_perf_nmi_is_available(void) */ return arm_pmu_irq_is_nmi(); } + +static int watchdog_perf_update_period(void *data) +{ + int cpu = smp_processor_id(); + u64 max_cpu_freq, new_period; + + max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL; + if (!max_cpu_freq) + return 0; + + new_period = watchdog_thresh * max_cpu_freq; + hardlockup_detector_perf_adjust_period(new_period); + + return 0; +} + +static int watchdog_freq_notifier_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = data; + int cpu; + + if (val != CPUFREQ_CREATE_POLICY) + return NOTIFY_DONE; + + /* + * Let each online CPU related to the policy update the period by their + * own. This will serialize with the framework on start/stop the lockup + * detector (softlockup_{start,stop}_all) and avoid potential race + * condition. Otherwise we may have below theoretical race condition: + * (core 0/1 share the same policy) + * [core 0] [core 1] + * hardlockup_detector_event_create() + * hw_nmi_get_sample_period() + * (cpufreq registered, notifier callback invoked) + * watchdog_freq_notifier_callback() + * watchdog_perf_update_period() + * (since core 1's event's not yet created, + * the period is not set) + * perf_event_create_kernel_counter() + * (event's period is SAFE_MAX_CPU_FREQ) + */ + for_each_cpu(cpu, policy->cpus) + smp_call_on_cpu(cpu, watchdog_perf_update_period, NULL, false); + + return NOTIFY_DONE; +} + +static struct notifier_block watchdog_freq_notifier = { + .notifier_call = watchdog_freq_notifier_callback, +}; + +static int __init init_watchdog_freq_notifier(void) +{ + return cpufreq_register_notifier(&watchdog_freq_notifier, + CPUFREQ_POLICY_NOTIFIER); +} +core_initcall(init_watchdog_freq_notifier); |
