diff options
Diffstat (limited to 'arch/arm64/kernel')
80 files changed, 3868 insertions, 2294 deletions
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 467cb7117273..71c29a2a2f19 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -33,8 +33,8 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o idreg-override.o idle.o \ - patching.o + syscall.o proton-pack.o idle.o patching.o pi/ \ + rsi.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o @@ -47,7 +47,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_CPU_PM) += sleep.o suspend.o -obj-$(CONFIG_CPU_IDLE) += cpuidle.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_EFI) += efi.o efi-rt-wrapper.o @@ -57,7 +56,7 @@ obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-$(CONFIG_PARAVIRT) += paravirt.o -obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o pi/ +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_ELF_CORE) += elfcore.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ @@ -66,20 +65,12 @@ obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o -obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.o - -# We need to prevent the SCS patching code from patching itself. Using -# -mbranch-protection=none here to avoid the patchable PAC opcodes from being -# generated triggers an issue with full LTO on Clang, which stops emitting PAC -# instructions altogether. So disable LTO as well for the compilation unit. -CFLAGS_patch-scs.o += -mbranch-protection=none -CFLAGS_REMOVE_patch-scs.o += $(CC_FLAGS_LTO) # Force dependency (vdso*-wrap.S includes vdso.so through incbin) $(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so diff --git a/arch/arm64/kernel/Makefile.syscalls b/arch/arm64/kernel/Makefile.syscalls new file mode 100644 index 000000000000..0542a718871a --- /dev/null +++ b/arch/arm64/kernel/Makefile.syscalls @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 + +syscall_abis_32 += +syscall_abis_64 += renameat rlimit memfd_secret + +syscalltbl = arch/arm64/tools/syscall_%.tbl diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index dba8fcec7f33..e6f66491fbe9 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -26,9 +26,11 @@ #include <linux/libfdt.h> #include <linux/smp.h> #include <linux/serial_core.h> +#include <linux/suspend.h> #include <linux/pgtable.h> #include <acpi/ghes.h> +#include <acpi/processor.h> #include <asm/cputype.h> #include <asm/cpu_ops.h> #include <asm/daifflags.h> @@ -44,6 +46,7 @@ EXPORT_SYMBOL(acpi_pci_disabled); static bool param_acpi_off __initdata; static bool param_acpi_on __initdata; static bool param_acpi_force __initdata; +static bool param_acpi_nospcr __initdata; static int __init parse_acpi(char *arg) { @@ -57,6 +60,8 @@ static int __init parse_acpi(char *arg) param_acpi_on = true; else if (strcmp(arg, "force") == 0) /* force ACPI to be enabled */ param_acpi_force = true; + else if (strcmp(arg, "nospcr") == 0) /* disable SPCR as default console */ + param_acpi_nospcr = true; else return -EINVAL; /* Core will print when we return error */ @@ -227,7 +232,29 @@ done: if (earlycon_acpi_spcr_enable) early_init_dt_scan_chosen_stdout(); } else { - acpi_parse_spcr(earlycon_acpi_spcr_enable, true); +#ifdef CONFIG_HIBERNATION + struct acpi_table_header *facs = NULL; + acpi_get_table(ACPI_SIG_FACS, 1, &facs); + if (facs) { + swsusp_hardware_signature = + ((struct acpi_table_facs *)facs)->hardware_signature; + acpi_put_table(facs); + } +#endif + + /* + * For varying privacy and security reasons, sometimes need + * to completely silence the serial console output, and only + * enable it when needed. + * But there are many existing systems that depend on this + * behaviour, use acpi=nospcr to disable console in ACPI SPCR + * table as default serial console. + */ + acpi_parse_spcr(earlycon_acpi_spcr_enable, + !param_acpi_nospcr); + pr_info("Use ACPI SPCR as default console: %s\n", + param_acpi_nospcr ? "No" : "Yes"); + if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); } @@ -413,107 +440,23 @@ void arch_reserve_mem_area(acpi_physical_address addr, size_t size) memblock_mark_nomap(addr, size); } -#ifdef CONFIG_ACPI_FFH -/* - * Implements ARM64 specific callbacks to support ACPI FFH Operation Region as - * specified in https://developer.arm.com/docs/den0048/latest - */ -struct acpi_ffh_data { - struct acpi_ffh_info info; - void (*invoke_ffh_fn)(unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, - unsigned long a4, unsigned long a5, - unsigned long a6, unsigned long a7, - struct arm_smccc_res *args, - struct arm_smccc_quirk *res); - void (*invoke_ffh64_fn)(const struct arm_smccc_1_2_regs *args, - struct arm_smccc_1_2_regs *res); -}; - -int acpi_ffh_address_space_arch_setup(void *handler_ctxt, void **region_ctxt) +#ifdef CONFIG_ACPI_HOTPLUG_CPU +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 apci_id, + int *pcpu) { - enum arm_smccc_conduit conduit; - struct acpi_ffh_data *ffh_ctxt; - - if (arm_smccc_get_version() < ARM_SMCCC_VERSION_1_2) - return -EOPNOTSUPP; - - conduit = arm_smccc_1_1_get_conduit(); - if (conduit == SMCCC_CONDUIT_NONE) { - pr_err("%s: invalid SMCCC conduit\n", __func__); - return -EOPNOTSUPP; + /* If an error code is passed in this stub can't fix it */ + if (*pcpu < 0) { + pr_warn_once("Unable to map CPU to valid ID\n"); + return *pcpu; } - ffh_ctxt = kzalloc(sizeof(*ffh_ctxt), GFP_KERNEL); - if (!ffh_ctxt) - return -ENOMEM; - - if (conduit == SMCCC_CONDUIT_SMC) { - ffh_ctxt->invoke_ffh_fn = __arm_smccc_smc; - ffh_ctxt->invoke_ffh64_fn = arm_smccc_1_2_smc; - } else { - ffh_ctxt->invoke_ffh_fn = __arm_smccc_hvc; - ffh_ctxt->invoke_ffh64_fn = arm_smccc_1_2_hvc; - } - - memcpy(ffh_ctxt, handler_ctxt, sizeof(ffh_ctxt->info)); - - *region_ctxt = ffh_ctxt; - return AE_OK; -} - -static bool acpi_ffh_smccc_owner_allowed(u32 fid) -{ - int owner = ARM_SMCCC_OWNER_NUM(fid); - - if (owner == ARM_SMCCC_OWNER_STANDARD || - owner == ARM_SMCCC_OWNER_SIP || owner == ARM_SMCCC_OWNER_OEM) - return true; - - return false; + return 0; } +EXPORT_SYMBOL(acpi_map_cpu); -int acpi_ffh_address_space_arch_handler(acpi_integer *value, void *region_context) +int acpi_unmap_cpu(int cpu) { - int ret = 0; - struct acpi_ffh_data *ffh_ctxt = region_context; - - if (ffh_ctxt->info.offset == 0) { - /* SMC/HVC 32bit call */ - struct arm_smccc_res res; - u32 a[8] = { 0 }, *ptr = (u32 *)value; - - if (!ARM_SMCCC_IS_FAST_CALL(*ptr) || ARM_SMCCC_IS_64(*ptr) || - !acpi_ffh_smccc_owner_allowed(*ptr) || - ffh_ctxt->info.length > 32) { - ret = AE_ERROR; - } else { - int idx, len = ffh_ctxt->info.length >> 2; - - for (idx = 0; idx < len; idx++) - a[idx] = *(ptr + idx); - - ffh_ctxt->invoke_ffh_fn(a[0], a[1], a[2], a[3], a[4], - a[5], a[6], a[7], &res, NULL); - memcpy(value, &res, sizeof(res)); - } - - } else if (ffh_ctxt->info.offset == 1) { - /* SMC/HVC 64bit call */ - struct arm_smccc_1_2_regs *r = (struct arm_smccc_1_2_regs *)value; - - if (!ARM_SMCCC_IS_FAST_CALL(r->a0) || !ARM_SMCCC_IS_64(r->a0) || - !acpi_ffh_smccc_owner_allowed(r->a0) || - ffh_ctxt->info.length > sizeof(*r)) { - ret = AE_ERROR; - } else { - ffh_ctxt->invoke_ffh64_fn(r, r); - memcpy(value, r, ffh_ctxt->info.length); - } - } else { - ret = AE_ERROR; - } - - return ret; + return 0; } -#endif /* CONFIG_ACPI_FFH */ +EXPORT_SYMBOL(acpi_unmap_cpu); +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index e51535a5f939..2465f291c7e1 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -27,24 +27,13 @@ #include <asm/numa.h> -static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE }; +static int acpi_early_node_map[NR_CPUS] __initdata = { [0 ... NR_CPUS - 1] = NUMA_NO_NODE }; int __init acpi_numa_get_nid(unsigned int cpu) { return acpi_early_node_map[cpu]; } -static inline int get_cpu_for_acpi_id(u32 uid) -{ - int cpu; - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) - if (uid == get_acpi_id_for_cpu(cpu)) - return cpu; - - return -EINVAL; -} - static int __init acpi_parse_gicc_pxm(union acpi_subtable_headers *header, const unsigned long end) { diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index dd6ce86d4332..e737c6295ec7 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -462,6 +462,9 @@ static int run_all_insn_set_hw_mode(unsigned int cpu) for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { struct insn_emulation *insn = insn_emulations[i]; bool enable = READ_ONCE(insn->current_mode) == INSN_HW; + if (insn->status == INSN_UNAVAILABLE) + continue; + if (insn->set_hw_mode && insn->set_hw_mode(enable)) { pr_warn("CPU[%u] cannot support the emulation of %s", cpu, insn->name); @@ -504,7 +507,7 @@ static int update_insn_emulation_mode(struct insn_emulation *insn, return ret; } -static int emulation_proc_handler(struct ctl_table *table, int write, +static int emulation_proc_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 5a7dbbe0ce63..eb1a840e4110 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -12,15 +12,12 @@ #include <linux/ftrace.h> #include <linux/kexec.h> #include <linux/mm.h> -#include <linux/dma-mapping.h> #include <linux/kvm_host.h> -#include <linux/preempt.h> #include <linux/suspend.h> #include <asm/cpufeature.h> #include <asm/fixmap.h> #include <asm/thread_info.h> #include <asm/memory.h> -#include <asm/signal32.h> #include <asm/smp_plat.h> #include <asm/suspend.h> #include <linux/kbuild.h> @@ -28,8 +25,6 @@ int main(void) { - DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); - BLANK(); DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); @@ -75,49 +70,31 @@ int main(void) DEFINE(S_FP, offsetof(struct pt_regs, regs[29])); DEFINE(S_LR, offsetof(struct pt_regs, regs[30])); DEFINE(S_SP, offsetof(struct pt_regs, sp)); - DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_PC, offsetof(struct pt_regs, pc)); + DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); - DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); + DEFINE(S_PMR, offsetof(struct pt_regs, pmr)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); + DEFINE(S_STACKFRAME_TYPE, offsetof(struct pt_regs, stackframe.type)); DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS - DEFINE(FREGS_X0, offsetof(struct ftrace_regs, regs[0])); - DEFINE(FREGS_X2, offsetof(struct ftrace_regs, regs[2])); - DEFINE(FREGS_X4, offsetof(struct ftrace_regs, regs[4])); - DEFINE(FREGS_X6, offsetof(struct ftrace_regs, regs[6])); - DEFINE(FREGS_X8, offsetof(struct ftrace_regs, regs[8])); - DEFINE(FREGS_FP, offsetof(struct ftrace_regs, fp)); - DEFINE(FREGS_LR, offsetof(struct ftrace_regs, lr)); - DEFINE(FREGS_SP, offsetof(struct ftrace_regs, sp)); - DEFINE(FREGS_PC, offsetof(struct ftrace_regs, pc)); + DEFINE(FREGS_X0, offsetof(struct __arch_ftrace_regs, regs[0])); + DEFINE(FREGS_X2, offsetof(struct __arch_ftrace_regs, regs[2])); + DEFINE(FREGS_X4, offsetof(struct __arch_ftrace_regs, regs[4])); + DEFINE(FREGS_X6, offsetof(struct __arch_ftrace_regs, regs[6])); + DEFINE(FREGS_X8, offsetof(struct __arch_ftrace_regs, regs[8])); + DEFINE(FREGS_FP, offsetof(struct __arch_ftrace_regs, fp)); + DEFINE(FREGS_LR, offsetof(struct __arch_ftrace_regs, lr)); + DEFINE(FREGS_SP, offsetof(struct __arch_ftrace_regs, sp)); + DEFINE(FREGS_PC, offsetof(struct __arch_ftrace_regs, pc)); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - DEFINE(FREGS_DIRECT_TRAMP, offsetof(struct ftrace_regs, direct_tramp)); + DEFINE(FREGS_DIRECT_TRAMP, offsetof(struct __arch_ftrace_regs, direct_tramp)); #endif - DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); + DEFINE(FREGS_SIZE, sizeof(struct __arch_ftrace_regs)); BLANK(); #endif -#ifdef CONFIG_COMPAT - DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); - DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0)); - BLANK(); -#endif - DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); - BLANK(); - DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); - DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags)); - BLANK(); - DEFINE(VM_EXEC, VM_EXEC); - BLANK(); - DEFINE(PAGE_SZ, PAGE_SIZE); - BLANK(); - DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); - DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); - BLANK(); - DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); - BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val)); @@ -128,6 +105,7 @@ int main(void) DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2)); DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_cpu_context, regs)); + DEFINE(CPU_ELR_EL2, offsetof(struct kvm_cpu_context, sys_regs[ELR_EL2])); DEFINE(CPU_RGSR_EL1, offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1])); DEFINE(CPU_GCR_EL1, offsetof(struct kvm_cpu_context, sys_regs[GCR_EL1])); DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1])); @@ -145,6 +123,7 @@ int main(void) DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2)); DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr)); DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr)); + DEFINE(NVHE_INIT_TMP, offsetof(struct kvm_nvhe_init_params, tmp)); #endif #ifdef CONFIG_CPU_PM DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); @@ -200,18 +179,6 @@ int main(void) DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func)); #endif BLANK(); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - DEFINE(FGRET_REGS_X0, offsetof(struct fgraph_ret_regs, regs[0])); - DEFINE(FGRET_REGS_X1, offsetof(struct fgraph_ret_regs, regs[1])); - DEFINE(FGRET_REGS_X2, offsetof(struct fgraph_ret_regs, regs[2])); - DEFINE(FGRET_REGS_X3, offsetof(struct fgraph_ret_regs, regs[3])); - DEFINE(FGRET_REGS_X4, offsetof(struct fgraph_ret_regs, regs[4])); - DEFINE(FGRET_REGS_X5, offsetof(struct fgraph_ret_regs, regs[5])); - DEFINE(FGRET_REGS_X6, offsetof(struct fgraph_ret_regs, regs[6])); - DEFINE(FGRET_REGS_X7, offsetof(struct fgraph_ret_regs, regs[7])); - DEFINE(FGRET_REGS_FP, offsetof(struct fgraph_ret_regs, fp)); - DEFINE(FGRET_REGS_SIZE, sizeof(struct fgraph_ret_regs)); -#endif #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); #endif diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index d9c9218fa1fd..309942b06c5b 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -101,16 +101,18 @@ int populate_cache_leaves(unsigned int cpu) unsigned int level, idx; enum cache_type type; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct cacheinfo *infos = this_cpu_ci->info_list; for (idx = 0, level = 1; level <= this_cpu_ci->num_levels && - idx < this_cpu_ci->num_leaves; idx++, level++) { + idx < this_cpu_ci->num_leaves; level++) { type = get_cache_type(level); if (type == CACHE_TYPE_SEPARATE) { - ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); - ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + if (idx + 1 >= this_cpu_ci->num_leaves) + break; + ci_leaf_init(&infos[idx++], CACHE_TYPE_DATA, level); + ci_leaf_init(&infos[idx++], CACHE_TYPE_INST, level); } else { - ci_leaf_init(this_leaf++, type, level); + ci_leaf_init(&infos[idx++], type, level); } } return 0; diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 76b8dd37092a..7ce555862895 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -432,6 +432,41 @@ static const struct midr_range erratum_spec_unpriv_load_list[] = { }; #endif +#ifdef CONFIG_ARM64_ERRATUM_3194386 +static const struct midr_range erratum_spec_ssbs_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X4), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X925), + MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3), + {} +}; +#endif + +#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 +static const struct midr_range erratum_ac03_cpu_38_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -729,6 +764,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)), }, #endif +#ifdef CONFIG_ARM64_ERRATUM_3194386 + { + .desc = "SSBS not fully self-synchronizing", + .capability = ARM64_WORKAROUND_SPECULATIVE_SSBS, + ERRATA_MIDR_RANGE_LIST(erratum_spec_ssbs_list), + }, +#endif #ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD { .desc = "ARM errata 2966298, 3117295", @@ -741,9 +783,17 @@ const struct arm64_cpu_capabilities arm64_errata[] = { { .desc = "AmpereOne erratum AC03_CPU_38", .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38, - ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1), + ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), }, #endif { + .desc = "Broken CNTVOFF_EL2", + .capability = ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF, + ERRATA_MIDR_RANGE_LIST(((const struct midr_range[]) { + MIDR_ALL_VERSIONS(MIDR_QCOM_ORYON_X1), + {} + })), + }, + { } }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8d1a634a403e..d561cf3b8ac7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -75,6 +75,7 @@ #include <linux/cpu.h> #include <linux/kasan.h> #include <linux/percpu.h> +#include <linux/sched/isolation.h> #include <asm/cpu.h> #include <asm/cpufeature.h> @@ -103,6 +104,7 @@ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; COMPAT_HWCAP_LPAE) unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; unsigned int compat_elf_hwcap2 __read_mostly; +unsigned int compat_elf_hwcap3 __read_mostly; #endif DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); @@ -140,12 +142,42 @@ void dump_cpu_features(void) pr_emerg("0x%*pb\n", ARM64_NCAPS, &system_cpucaps); } +#define __ARM64_MAX_POSITIVE(reg, field) \ + ((reg##_##field##_SIGNED ? \ + BIT(reg##_##field##_WIDTH - 1) : \ + BIT(reg##_##field##_WIDTH)) - 1) + +#define __ARM64_MIN_NEGATIVE(reg, field) BIT(reg##_##field##_WIDTH - 1) + +#define __ARM64_CPUID_FIELDS(reg, field, min_value, max_value) \ + .sys_reg = SYS_##reg, \ + .field_pos = reg##_##field##_SHIFT, \ + .field_width = reg##_##field##_WIDTH, \ + .sign = reg##_##field##_SIGNED, \ + .min_field_value = min_value, \ + .max_field_value = max_value, + +/* + * ARM64_CPUID_FIELDS() encodes a field with a range from min_value to + * an implicit maximum that depends on the sign-ess of the field. + * + * An unsigned field will be capped at all ones, while a signed field + * will be limited to the positive half only. + */ #define ARM64_CPUID_FIELDS(reg, field, min_value) \ - .sys_reg = SYS_##reg, \ - .field_pos = reg##_##field##_SHIFT, \ - .field_width = reg##_##field##_WIDTH, \ - .sign = reg##_##field##_SIGNED, \ - .min_field_value = reg##_##field##_##min_value, + __ARM64_CPUID_FIELDS(reg, field, \ + SYS_FIELD_VALUE(reg, field, min_value), \ + __ARM64_MAX_POSITIVE(reg, field)) + +/* + * ARM64_CPUID_FIELDS_NEG() encodes a field with a range from an + * implicit minimal value to max_value. This should be used when + * matching a non-implemented property. + */ +#define ARM64_CPUID_FIELDS_NEG(reg, field, max_value) \ + __ARM64_CPUID_FIELDS(reg, field, \ + __ARM64_MIN_NEGATIVE(reg, field), \ + SYS_FIELD_VALUE(reg, field, max_value)) #define __ARM64_FTR_BITS(SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ @@ -198,6 +230,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), @@ -220,6 +253,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_LUT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CLRBHB_SHIFT, 4, 0), @@ -234,6 +268,12 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0), @@ -249,12 +289,14 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_FP_SHIFT, 4, ID_AA64PFR0_EL1_FP_NI), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL1_SHIFT, 4, ID_AA64PFR0_EL1_ELx_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL0_SHIFT, 4, ID_AA64PFR0_EL1_ELx_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL1_SHIFT, 4, ID_AA64PFR0_EL1_EL1_IMP), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL0_SHIFT, 4, ID_AA64PFR0_EL1_EL0_IMP), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), @@ -267,12 +309,19 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F16MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0), @@ -285,6 +334,8 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_EltPerm_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, 0), @@ -295,6 +346,8 @@ static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_LUTv2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMEver_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, 0), @@ -307,6 +360,10 @@ static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F16_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F16_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, 0), @@ -316,6 +373,34 @@ static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_BI32I32_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8FMA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SBitPerm_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_AES_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SFEXPA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_STMOP_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMOP4_SHIFT, 1, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64fpfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8CVT_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8FMA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM8_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E4M3_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E5M2_SHIFT, 1, 0), ARM64_FTR_END, }; @@ -366,6 +451,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ECBHB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TIDCP1_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HCX_SHIFT, 4, 0), @@ -402,11 +488,18 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE), + FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0), ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = { + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_ctr[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1), @@ -613,6 +706,14 @@ static const struct arm64_ftr_bits ftr_id_dfr1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_mpamidr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PMG_MAX_SHIFT, MPAMIDR_EL1_PMG_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_VPMR_MAX_SHIFT, MPAMIDR_EL1_VPMR_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_HAS_HCR_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PARTID_MAX_SHIFT, MPAMIDR_EL1_PARTID_MAX_WIDTH, 0), + ARM64_FTR_END, +}; + /* * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of @@ -655,13 +756,15 @@ static const struct arm64_ftr_bits ftr_raz[] = { #define ARM64_FTR_REG(id, table) \ __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) -struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; -struct arm64_ftr_override __ro_after_init id_aa64pfr0_override; -struct arm64_ftr_override __ro_after_init id_aa64pfr1_override; -struct arm64_ftr_override __ro_after_init id_aa64zfr0_override; -struct arm64_ftr_override __ro_after_init id_aa64smfr0_override; -struct arm64_ftr_override __ro_after_init id_aa64isar1_override; -struct arm64_ftr_override __ro_after_init id_aa64isar2_override; +struct arm64_ftr_override id_aa64mmfr0_override; +struct arm64_ftr_override id_aa64mmfr1_override; +struct arm64_ftr_override id_aa64mmfr2_override; +struct arm64_ftr_override id_aa64pfr0_override; +struct arm64_ftr_override id_aa64pfr1_override; +struct arm64_ftr_override id_aa64zfr0_override; +struct arm64_ftr_override id_aa64smfr0_override; +struct arm64_ftr_override id_aa64isar1_override; +struct arm64_ftr_override id_aa64isar2_override; struct arm64_ftr_override arm64_sw_feature_override; @@ -702,10 +805,12 @@ static const struct __ftr_reg_entry { &id_aa64pfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, &id_aa64pfr1_override), + ARM64_FTR_REG(SYS_ID_AA64PFR2_EL1, ftr_id_aa64pfr2), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0, &id_aa64zfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0, &id_aa64smfr0_override), + ARM64_FTR_REG(SYS_ID_AA64FPFR0_EL1, ftr_id_aa64fpfr0), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), @@ -717,13 +822,20 @@ static const struct __ftr_reg_entry { &id_aa64isar1_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2, &id_aa64isar2_override), + ARM64_FTR_REG(SYS_ID_AA64ISAR3_EL1, ftr_id_aa64isar3), /* Op1 = 0, CRn = 0, CRm = 7 */ - ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0, + &id_aa64mmfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1, &id_aa64mmfr1_override), - ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2, + &id_aa64mmfr2_override), ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), + ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4), + + /* Op1 = 0, CRn = 10, CRm = 4 */ + ARM64_FTR_REG(SYS_MPAMIDR_EL1, ftr_mpamidr), /* Op1 = 1, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), @@ -910,16 +1022,16 @@ static void init_cpu_ftr_reg(u32 sys_reg, u64 new) /* Override was valid */ ftr_new = tmp; str = "forced"; - } else if (ftr_ovr == tmp) { + } else { /* Override was the safe value */ str = "already set"; } - if (str) - pr_warn("%s[%d:%d]: %s to %llx\n", - reg->name, - ftrp->shift + ftrp->width - 1, - ftrp->shift, str, tmp); + pr_warn("%s[%d:%d]: %s to %llx\n", + reg->name, + ftrp->shift + ftrp->width - 1, + ftrp->shift, str, + tmp & (BIT(ftrp->width) - 1)); } else if ((ftr_mask & reg->override->val) == ftr_mask) { reg->override->val &= ~ftr_mask; pr_warn("%s[%d:%d]: impossible override, ignored\n", @@ -1043,14 +1155,18 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0); init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2); + init_cpu_ftr_reg(SYS_ID_AA64ISAR3_EL1, info->reg_id_aa64isar3); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64MMFR3_EL1, info->reg_id_aa64mmfr3); + init_cpu_ftr_reg(SYS_ID_AA64MMFR4_EL1, info->reg_id_aa64mmfr4); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); + init_cpu_ftr_reg(SYS_ID_AA64PFR2_EL1, info->reg_id_aa64pfr2); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0); + init_cpu_ftr_reg(SYS_ID_AA64FPFR0_EL1, info->reg_id_aa64fpfr0); if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) init_32bit_cpu_features(&info->aarch32); @@ -1068,17 +1184,14 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { unsigned long cpacr = cpacr_save_enable_kernel_sme(); - /* - * We mask out SMPS since even if the hardware - * supports priorities the kernel does not at present - * and we block access to them. - */ - info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; vec_init_vq_map(ARM64_VEC_SME); cpacr_restore(cpacr); } + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); } @@ -1272,6 +1385,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64isar1, boot->reg_id_aa64isar1); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu, info->reg_id_aa64isar2, boot->reg_id_aa64isar2); + taint |= check_update_ftr_reg(SYS_ID_AA64ISAR3_EL1, cpu, + info->reg_id_aa64isar3, boot->reg_id_aa64isar3); /* * Differing PARange support is fine as long as all peripherals and @@ -1291,6 +1406,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu, info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1); + taint |= check_update_ftr_reg(SYS_ID_AA64PFR2_EL1, cpu, + info->reg_id_aa64pfr2, boot->reg_id_aa64pfr2); taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); @@ -1298,6 +1415,9 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu, info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0); + taint |= check_update_ftr_reg(SYS_ID_AA64FPFR0_EL1, cpu, + info->reg_id_aa64fpfr0, boot->reg_id_aa64fpfr0); + /* Probe vector lengths */ if (IS_ENABLED(CONFIG_ARM64_SVE) && id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { @@ -1314,13 +1434,6 @@ void update_cpu_features(int cpu, id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { unsigned long cpacr = cpacr_save_enable_kernel_sme(); - /* - * We mask out SMPS since even if the hardware - * supports priorities the kernel does not at present - * and we block access to them. - */ - info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; - /* Probe vector lengths */ if (!system_capabilities_finalized()) vec_update_vq_map(ARM64_VEC_SME); @@ -1328,6 +1441,11 @@ void update_cpu_features(int cpu, cpacr_restore(cpacr); } + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) { + taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, + info->reg_mpamidr, boot->reg_mpamidr); + } + /* * The kernel uses the LDGM/STGM instructions and the number of tags * they read/write depends on the GMID_EL1.BS field. Check that the @@ -1410,17 +1528,21 @@ u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64PFR0_EL1); read_sysreg_case(SYS_ID_AA64PFR1_EL1); + read_sysreg_case(SYS_ID_AA64PFR2_EL1); read_sysreg_case(SYS_ID_AA64ZFR0_EL1); read_sysreg_case(SYS_ID_AA64SMFR0_EL1); + read_sysreg_case(SYS_ID_AA64FPFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR0_EL1); read_sysreg_case(SYS_ID_AA64MMFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR2_EL1); read_sysreg_case(SYS_ID_AA64MMFR3_EL1); + read_sysreg_case(SYS_ID_AA64MMFR4_EL1); read_sysreg_case(SYS_ID_AA64ISAR0_EL1); read_sysreg_case(SYS_ID_AA64ISAR1_EL1); read_sysreg_case(SYS_ID_AA64ISAR2_EL1); + read_sysreg_case(SYS_ID_AA64ISAR3_EL1); read_sysreg_case(SYS_CNTFRQ_EL0); read_sysreg_case(SYS_CTR_EL0); @@ -1451,11 +1573,28 @@ has_always(const struct arm64_cpu_capabilities *entry, int scope) static bool feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) { - int val = cpuid_feature_extract_field_width(reg, entry->field_pos, - entry->field_width, - entry->sign); + int val, min, max; + u64 tmp; + + val = cpuid_feature_extract_field_width(reg, entry->field_pos, + entry->field_width, + entry->sign); + + tmp = entry->min_field_value; + tmp <<= entry->field_pos; + + min = cpuid_feature_extract_field_width(tmp, entry->field_pos, + entry->field_width, + entry->sign); + + tmp = entry->max_field_value; + tmp <<= entry->field_pos; + + max = cpuid_feature_extract_field_width(tmp, entry->field_pos, + entry->field_width, + entry->sign); - return val >= entry->min_field_value; + return val >= min && val <= max; } static u64 @@ -1506,6 +1645,11 @@ const struct cpumask *system_32bit_el0_cpumask(void) return cpu_possible_mask; } +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p) +{ + return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_TICK)); +} + static int __init parse_32bit_el0_param(char *str) { allow_mismatched_32bit_el0 = true; @@ -1620,46 +1764,6 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) return has_cpuid_feature(entry, scope); } -/* - * This check is triggered during the early boot before the cpufeature - * is initialised. Checking the status on the local CPU allows the boot - * CPU to detect the need for non-global mappings and thus avoiding a - * pagetable re-write after all the CPUs are booted. This check will be - * anyway run on individual CPUs, allowing us to get the consistent - * state once the SMP CPUs are up and thus make the switch to non-global - * mappings if required. - */ -bool kaslr_requires_kpti(void) -{ - if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - return false; - - /* - * E0PD does a similar job to KPTI so can be used instead - * where available. - */ - if (IS_ENABLED(CONFIG_ARM64_E0PD)) { - u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1); - if (cpuid_feature_extract_unsigned_field(mmfr2, - ID_AA64MMFR2_EL1_E0PD_SHIFT)) - return false; - } - - /* - * Systems affected by Cavium erratum 24756 are incompatible - * with KPTI. - */ - if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) { - extern const struct midr_range cavium_erratum_27456_cpus[]; - - if (is_midr_in_range_list(read_cpuid_id(), - cavium_erratum_27456_cpus)) - return false; - } - - return kaslr_enabled(); -} - static bool __meltdown_safe = true; static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ @@ -1712,7 +1816,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, } /* Useful for KASLR robustness */ - if (kaslr_requires_kpti()) { + if (kaslr_enabled() && kaslr_requires_kpti()) { if (!__kpti_forced) { str = "KASLR"; __kpti_forced = 1; @@ -1739,6 +1843,28 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, return !meltdown_safe; } +static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope) +{ + /* + * Although the Apple M2 family appears to support NV1, the + * PTW barfs on the nVHE EL2 S1 page table format. Pretend + * that it doesn't support NV1 at all. + */ + static const struct midr_range nv1_ni_list[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {} + }; + + return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) && + !(has_cpuid_feature(entry, scope) || + is_midr_in_range_list(read_cpuid_id(), nv1_ni_list))); +} + #if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) static bool has_lpa2_at_stage1(u64 mmfr0) { @@ -1801,6 +1927,11 @@ static int __init __kpti_install_ng_mappings(void *__unused) pgd_t *kpti_ng_temp_pgd; u64 alloc = 0; + if (levels == 5 && !pgtable_l5_enabled()) + levels = 4; + else if (levels == 4 && !pgtable_l4_enabled()) + levels = 3; + remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); if (!cpu) { @@ -1814,9 +1945,9 @@ static int __init __kpti_install_ng_mappings(void *__unused) // // The physical pages are laid out as follows: // - // +--------+-/-------+-/------ +-\\--------+ - // : PTE[] : | PMD[] : | PUD[] : || PGD[] : - // +--------+-\-------+-\------ +-//--------+ + // +--------+-/-------+-/------ +-/------ +-\\\--------+ + // : PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[] : + // +--------+-\-------+-\------ +-\------ +-///--------+ // ^ // The first page is mapped into this hierarchy at a PMD_SHIFT // aligned virtual address, so that we can manipulate the PTE @@ -2042,14 +2173,7 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, int __unused) { - u64 val; - - val = read_sysreg(id_aa64mmfr1_el1); - if (!cpuid_feature_extract_unsigned_field(val, ID_AA64MMFR1_EL1_VH_SHIFT)) - return false; - - val = arm64_sw_feature_override.val & arm64_sw_feature_override.mask; - return cpuid_feature_extract_unsigned_field(val, ARM64_SW_FEATURE_OVERRIDE_HVHE); + return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE); } #ifdef CONFIG_ARM64_PAN @@ -2218,6 +2342,14 @@ static void user_feature_fixup(void) if (regp) regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK; } + + if (cpus_have_cap(ARM64_WORKAROUND_SPECULATIVE_SSBS)) { + struct arm64_ftr_reg *regp; + + regp = get_arm64_ftr_reg(SYS_ID_AA64PFR1_EL1); + if (regp) + regp->user_mask &= ~ID_AA64PFR1_EL1_SSBS_MASK; + } } static void elf_hwcap_fixup(void) @@ -2250,6 +2382,22 @@ static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn); } +#ifdef CONFIG_ARM64_POE +static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1_E0POE); + sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_E0POE); +} +#endif + +#ifdef CONFIG_ARM64_GCS +static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused) +{ + /* GCSPR_EL0 is always readable */ + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2269,6 +2417,36 @@ cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap) return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT); } +static bool +test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return false; + + /* Check firmware actually enabled MPAM on this cpu. */ + return (read_sysreg_s(SYS_MPAM1_EL1) & MPAM1_EL1_MPAMEN); +} + +static void +cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) +{ + /* + * Access by the kernel (at EL1) should use the reserved PARTID + * which is configured unrestricted. This avoids priority-inversion + * where latency sensitive tasks have to wait for a task that has + * been throttled to release the lock. + */ + write_sysreg_s(0, SYS_MPAM1_EL1); +} + +static bool +test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + + return idr & MPAMIDR_EL1_HAS_HCR; +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .capability = ARM64_ALWAYS_BOOT, @@ -2483,6 +2661,21 @@ static const struct arm64_cpu_capabilities arm64_features[] = { ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM) }, #endif +#ifdef CONFIG_ARM64_HAFT + { + .desc = "Hardware managed Access Flag for Table Descriptors", + /* + * Contrary to the page/block access flag, the table access flag + * cannot be emulated in software (no access fault will occur). + * Therefore this should be used only if it's supported system + * wide. + */ + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAFT, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT) + }, +#endif { .desc = "CRC32 instructions", .capability = ARM64_HAS_CRC32, @@ -2739,6 +2932,73 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_lpa2, }, + { + .desc = "FPMR", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FPMR, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_fpmr, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, FPMR, IMP) + }, +#ifdef CONFIG_ARM64_VA_BITS_52 + { + .capability = ARM64_HAS_VA52, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, +#ifdef CONFIG_ARM64_64K_PAGES + .desc = "52-bit Virtual Addressing (LVA)", + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, VARange, 52) +#else + .desc = "52-bit Virtual Addressing (LPA2)", +#ifdef CONFIG_ARM64_4K_PAGES + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN4, 52_BIT) +#else + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN16, 52_BIT) +#endif +#endif + }, +#endif + { + .desc = "Memory Partitioning And Monitoring", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM, + .matches = test_has_mpam, + .cpu_enable = cpu_enable_mpam, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1) + }, + { + .desc = "Memory Partitioning And Monitoring Virtualisation", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM_HCR, + .matches = test_has_mpam_hcr, + }, + { + .desc = "NV1", + .capability = ARM64_HAS_HCR_NV1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_nv1, + ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1) + }, +#ifdef CONFIG_ARM64_POE + { + .desc = "Stage-1 Permission Overlay Extension (S1POE)", + .capability = ARM64_HAS_S1POE, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_poe, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) + }, +#endif +#ifdef CONFIG_ARM64_GCS + { + .desc = "Guarded Control Stack (GCS)", + .capability = ARM64_HAS_GCS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_gcs, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) + }, +#endif {}, }; @@ -2771,6 +3031,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = match, \ } +#define HWCAP_CAP_MATCH_ID(match, reg, field, min_value, cap_type, cap) \ + { \ + __HWCAP_CAP(#cap, cap_type, cap) \ + HWCAP_CPUID_MATCH(reg, field, min_value) \ + .matches = match, \ + } + #ifdef CONFIG_ARM64_PTR_AUTH static const struct arm64_cpu_capabilities ptr_auth_hwcap_addr_matches[] = { { @@ -2799,6 +3066,13 @@ static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = { }; #endif +#ifdef CONFIG_ARM64_SVE +static bool has_sve_feature(const struct arm64_cpu_capabilities *cap, int scope) +{ + return system_supports_sve() && has_user_cpuid_feature(cap, scope); +} +#endif + static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL), HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES), @@ -2817,11 +3091,13 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM), HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(ID_AA64ISAR3_EL1, FPRCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_FPRCVT), HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, FP16, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), HWCAP_CAP(ID_AA64PFR0_EL1, DIT, IMP, CAP_HWCAP, KERNEL_HWCAP_DIT), + HWCAP_CAP(ID_AA64PFR2_EL1, FPMR, IMP, CAP_HWCAP, KERNEL_HWCAP_FPMR), HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, IMP, CAP_HWCAP, KERNEL_HWCAP_DCPOP), HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, DPB2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), HWCAP_CAP(ID_AA64ISAR1_EL1, JSCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_JSCVT), @@ -2835,22 +3111,32 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_EBF16), HWCAP_CAP(ID_AA64ISAR1_EL1, DGH, IMP, CAP_HWCAP, KERNEL_HWCAP_DGH), HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM), + HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT), + HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX), HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), - HWCAP_CAP(ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1), - HWCAP_CAP(ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), - HWCAP_CAP(ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), - HWCAP_CAP(ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), - HWCAP_CAP(ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), - HWCAP_CAP(ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_B16B16), - HWCAP_CAP(ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), - HWCAP_CAP(ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16), - HWCAP_CAP(ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), - HWCAP_CAP(ID_AA64ZFR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), - HWCAP_CAP(ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), - HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), - HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p2, CAP_HWCAP, KERNEL_HWCAP_SVE2P2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, AES2, CAP_HWCAP, KERNEL_HWCAP_SVE_AES2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_B16B16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, BFSCALE, CAP_HWCAP, KERNEL_HWCAP_SVE_BFSCALE), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F16MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_F16MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, EltPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_ELTPERM), +#endif +#ifdef CONFIG_ARM64_GCS + HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS), #endif HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_BTI @@ -2867,6 +3153,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64MMFR0_EL1, ECV, IMP, CAP_HWCAP, KERNEL_HWCAP_ECV), HWCAP_CAP(ID_AA64MMFR1_EL1, AFP, IMP, CAP_HWCAP, KERNEL_HWCAP_AFP), HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, IMP, CAP_HWCAP, KERNEL_HWCAP_CSSC), + HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, CMPBR, CAP_HWCAP, KERNEL_HWCAP_CMPBR), HWCAP_CAP(ID_AA64ISAR2_EL1, RPRFM, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRFM), HWCAP_CAP(ID_AA64ISAR2_EL1, RPRES, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRES), HWCAP_CAP(ID_AA64ISAR2_EL1, WFxT, IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT), @@ -2875,6 +3162,8 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { #ifdef CONFIG_ARM64_SME HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME), HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), + HWCAP_CAP(ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2), + HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2), HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), @@ -2882,12 +3171,33 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), HWCAP_CAP(ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), HWCAP_CAP(ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), + HWCAP_CAP(ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16), + HWCAP_CAP(ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32), HWCAP_CAP(ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), HWCAP_CAP(ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), HWCAP_CAP(ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), HWCAP_CAP(ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), HWCAP_CAP(ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), + HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), + HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), + HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), + HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), + HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), + HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), + HWCAP_CAP(ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP), + HWCAP_CAP(ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4), #endif /* CONFIG_ARM64_SME */ + HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM8), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), +#ifdef CONFIG_ARM64_POE + HWCAP_CAP(ID_AA64MMFR3_EL1, S1POE, IMP, CAP_HWCAP, KERNEL_HWCAP_POE), +#endif {}, }; @@ -3052,13 +3362,9 @@ static void __init enable_cpu_capabilities(u16 scope_mask) boot_scope = !!(scope_mask & SCOPE_BOOT_CPU); for (i = 0; i < ARM64_NCAPS; i++) { - unsigned int num; - caps = cpucap_ptrs[i]; - if (!caps || !(caps->type & scope_mask)) - continue; - num = caps->capability; - if (!cpus_have_cap(num)) + if (!caps || !(caps->type & scope_mask) || + !cpus_have_cap(caps->capability)) continue; if (boot_scope && caps->cpu_enable) @@ -3210,7 +3516,7 @@ static void verify_hyp_capabilities(void) return; safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); /* Verify VMID bits */ @@ -3231,6 +3537,36 @@ static void verify_hyp_capabilities(void) } } +static void verify_mpam_capabilities(void) +{ + u64 cpu_idr = read_cpuid(ID_AA64PFR0_EL1); + u64 sys_idr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u16 cpu_partid_max, cpu_pmg_max, sys_partid_max, sys_pmg_max; + + if (FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, cpu_idr) != + FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, sys_idr)) { + pr_crit("CPU%d: MPAM version mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_idr = read_cpuid(MPAMIDR_EL1); + sys_idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + if (FIELD_GET(MPAMIDR_EL1_HAS_HCR, cpu_idr) != + FIELD_GET(MPAMIDR_EL1_HAS_HCR, sys_idr)) { + pr_crit("CPU%d: Missing MPAM HCR\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, cpu_idr); + cpu_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, cpu_idr); + sys_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, sys_idr); + sys_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, sys_idr); + if (cpu_partid_max < sys_partid_max || cpu_pmg_max < sys_pmg_max) { + pr_crit("CPU%d: MPAM PARTID/PMG max values are mismatched\n", smp_processor_id()); + cpu_die_early(); + } +} + /* * Run through the enabled system capabilities and enable() it on this CPU. * The capabilities were decided based on the available CPUs at the boot time. @@ -3257,6 +3593,9 @@ static void verify_local_cpu_capabilities(void) if (is_hyp_mode_available()) verify_hyp_capabilities(); + + if (system_supports_mpam()) + verify_mpam_capabilities(); } void check_local_cpu_capabilities(void) @@ -3334,6 +3673,11 @@ unsigned long cpu_get_elf_hwcap2(void) return elf_hwcap[1]; } +unsigned long cpu_get_elf_hwcap3(void) +{ + return elf_hwcap[2]; +} + static void __init setup_boot_cpu_capabilities(void) { /* @@ -3436,7 +3780,14 @@ static int enable_mismatched_32bit_el0(unsigned int cpu) static int lucky_winner = -1; struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); - bool cpu_32bit = id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0); + bool cpu_32bit = false; + + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + if (!housekeeping_cpu(cpu, HK_TYPE_TICK)) + pr_info("Treating adaptive-ticks CPU %u as 64-bit only\n", cpu); + else + cpu_32bit = true; + } if (cpu_32bit) { cpumask_set_cpu(cpu, cpu_32bit_el0_mask); diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c deleted file mode 100644 index f372295207fb..000000000000 --- a/arch/arm64/kernel/cpuidle.c +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * ARM64 CPU idle arch support - * - * Copyright (C) 2014 ARM Ltd. - * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> - */ - -#include <linux/acpi.h> -#include <linux/cpuidle.h> -#include <linux/cpu_pm.h> -#include <linux/psci.h> - -#ifdef CONFIG_ACPI_PROCESSOR_IDLE - -#include <acpi/processor.h> - -#define ARM64_LPI_IS_RETENTION_STATE(arch_flags) (!(arch_flags)) - -static int psci_acpi_cpu_init_idle(unsigned int cpu) -{ - int i, count; - struct acpi_lpi_state *lpi; - struct acpi_processor *pr = per_cpu(processors, cpu); - - if (unlikely(!pr || !pr->flags.has_lpi)) - return -EINVAL; - - /* - * If the PSCI cpu_suspend function hook has not been initialized - * idle states must not be enabled, so bail out - */ - if (!psci_ops.cpu_suspend) - return -EOPNOTSUPP; - - count = pr->power.count - 1; - if (count <= 0) - return -ENODEV; - - for (i = 0; i < count; i++) { - u32 state; - - lpi = &pr->power.lpi_states[i + 1]; - /* - * Only bits[31:0] represent a PSCI power_state while - * bits[63:32] must be 0x0 as per ARM ACPI FFH Specification - */ - state = lpi->address; - if (!psci_power_state_is_valid(state)) { - pr_warn("Invalid PSCI power state %#x\n", state); - return -EINVAL; - } - } - - return 0; -} - -int acpi_processor_ffh_lpi_probe(unsigned int cpu) -{ - return psci_acpi_cpu_init_idle(cpu); -} - -__cpuidle int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) -{ - u32 state = lpi->address; - - if (ARM64_LPI_IS_RETENTION_STATE(lpi->arch_flags)) - return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM_RCU(psci_cpu_suspend_enter, - lpi->index, state); - else - return CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(psci_cpu_suspend_enter, - lpi->index, state); -} -#endif diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 47043c0d95ec..285d7d538342 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -80,6 +80,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SB] = "sb", [KERNEL_HWCAP_PACA] = "paca", [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_GCS] = "gcs", [KERNEL_HWCAP_DCPODP] = "dcpodp", [KERNEL_HWCAP_SVE2] = "sve2", [KERNEL_HWCAP_SVEAES] = "sveaes", @@ -128,6 +129,37 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SVE_B16B16] = "sveb16b16", [KERNEL_HWCAP_LRCPC3] = "lrcpc3", [KERNEL_HWCAP_LSE128] = "lse128", + [KERNEL_HWCAP_FPMR] = "fpmr", + [KERNEL_HWCAP_LUT] = "lut", + [KERNEL_HWCAP_FAMINMAX] = "faminmax", + [KERNEL_HWCAP_F8CVT] = "f8cvt", + [KERNEL_HWCAP_F8FMA] = "f8fma", + [KERNEL_HWCAP_F8DP4] = "f8dp4", + [KERNEL_HWCAP_F8DP2] = "f8dp2", + [KERNEL_HWCAP_F8E4M3] = "f8e4m3", + [KERNEL_HWCAP_F8E5M2] = "f8e5m2", + [KERNEL_HWCAP_SME_LUTV2] = "smelutv2", + [KERNEL_HWCAP_SME_F8F16] = "smef8f16", + [KERNEL_HWCAP_SME_F8F32] = "smef8f32", + [KERNEL_HWCAP_SME_SF8FMA] = "smesf8fma", + [KERNEL_HWCAP_SME_SF8DP4] = "smesf8dp4", + [KERNEL_HWCAP_SME_SF8DP2] = "smesf8dp2", + [KERNEL_HWCAP_POE] = "poe", + [KERNEL_HWCAP_CMPBR] = "cmpbr", + [KERNEL_HWCAP_FPRCVT] = "fprcvt", + [KERNEL_HWCAP_F8MM8] = "f8mm8", + [KERNEL_HWCAP_F8MM4] = "f8mm4", + [KERNEL_HWCAP_SVE_F16MM] = "svef16mm", + [KERNEL_HWCAP_SVE_ELTPERM] = "sveeltperm", + [KERNEL_HWCAP_SVE_AES2] = "sveaes2", + [KERNEL_HWCAP_SVE_BFSCALE] = "svebfscale", + [KERNEL_HWCAP_SVE2P2] = "sve2p2", + [KERNEL_HWCAP_SME2P2] = "sme2p2", + [KERNEL_HWCAP_SME_SBITPERM] = "smesbitperm", + [KERNEL_HWCAP_SME_AES] = "smeaes", + [KERNEL_HWCAP_SME_SFEXPA] = "smesfexpa", + [KERNEL_HWCAP_SME_STMOP] = "smestmop", + [KERNEL_HWCAP_SME_SMOP4] = "smesmop4", }; #ifdef CONFIG_COMPAT @@ -265,7 +297,7 @@ const struct seq_operations cpuinfo_op = { }; -static struct kobj_type cpuregs_kobj_type = { +static const struct kobj_type cpuregs_kobj_type = { .sysfs_ops = &kobj_sysfs_ops, }; @@ -443,14 +475,18 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); info->reg_id_aa64isar2 = read_cpuid(ID_AA64ISAR2_EL1); + info->reg_id_aa64isar3 = read_cpuid(ID_AA64ISAR3_EL1); info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); info->reg_id_aa64mmfr3 = read_cpuid(ID_AA64MMFR3_EL1); + info->reg_id_aa64mmfr4 = read_cpuid(ID_AA64MMFR4_EL1); info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); + info->reg_id_aa64pfr2 = read_cpuid(ID_AA64PFR2_EL1); info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); info->reg_id_aa64smfr0 = read_cpuid(ID_AA64SMFR0_EL1); + info->reg_id_aa64fpfr0 = read_cpuid(ID_AA64FPFR0_EL1); if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) info->reg_gmid = read_cpuid(GMID_EL1); @@ -458,6 +494,19 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { + /* + * We mask out SMPS since even if the hardware + * supports priorities the kernel does not at present + * and we block access to them. + */ + info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; + } + cpuinfo_detect_icache_policy(info); } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 64f2ecbdfe5c..58f047de3e1c 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -303,7 +303,6 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr) { struct break_hook *hook; struct list_head *list; - int (*fn)(struct pt_regs *regs, unsigned long esr) = NULL; list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; @@ -312,13 +311,11 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr) * entirely not preemptible, and we can use rcu list safely here. */ list_for_each_entry_rcu(hook, list, node) { - unsigned long comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; - - if ((comment & ~hook->mask) == hook->imm) - fn = hook->fn; + if ((esr_brk_comment(esr) & ~hook->mask) == hook->imm) + return hook->fn(regs, esr); } - return fn ? fn(regs, esr) : DBG_HOOK_ERROR; + return DBG_HOOK_ERROR; } NOKPROBE_SYMBOL(call_break_hook); @@ -443,6 +440,11 @@ void kernel_rewind_single_step(struct pt_regs *regs) set_regs_spsr_ss(regs); } +void kernel_fastforward_single_step(struct pt_regs *regs) +{ + clear_regs_spsr_ss(regs); +} + /* ptrace API */ void user_enable_single_step(struct task_struct *task) { diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 0228001347be..1d25d8899dbf 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -9,7 +9,9 @@ #include <linux/efi.h> #include <linux/init.h> +#include <linux/kmemleak.h> #include <linux/screen_info.h> +#include <linux/vmalloc.h> #include <asm/efi.h> #include <asm/stacktrace.h> @@ -32,8 +34,16 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) u64 attr = md->attribute; u32 type = md->type; - if (type == EFI_MEMORY_MAPPED_IO) - return PROT_DEVICE_nGnRE; + if (type == EFI_MEMORY_MAPPED_IO) { + pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE); + + if (arm64_is_protected_mmio(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT)) + prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); + return pgprot_val(prot); + } if (region_is_misaligned(md)) { static bool __initdata code_is_misaligned; @@ -103,7 +113,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = __ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); @@ -111,7 +121,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) pte = set_pte_bit(pte, __pgprot(PTE_PXN)); else if (system_supports_bti_kernel() && spd->has_bti) pte = set_pte_bit(pte, __pgprot(PTE_GP)); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } @@ -212,6 +222,7 @@ l: if (!p) { return -ENOMEM; } + kmemleak_not_leak(p); efi_rt_stack_top = p + THREAD_SIZE; return 0; } diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 0fc94207e69a..b260ddc4d3e9 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -10,6 +10,7 @@ #include <linux/linkage.h> #include <linux/lockdep.h> #include <linux/ptrace.h> +#include <linux/resume_user_mode.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/thread_info.h> @@ -102,7 +103,7 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs) static __always_inline void __enter_from_user_mode(void) { lockdep_hardirqs_off(CALLER_ADDR0); - CT_WARN_ON(ct_state() != CONTEXT_USER); + CT_WARN_ON(ct_state() != CT_STATE_USER); user_exit_irqoff(); trace_hardirqs_off_finish(); mte_disable_tco_entry(current); @@ -126,16 +127,49 @@ static __always_inline void __exit_to_user_mode(void) lockdep_hardirqs_on(CALLER_ADDR0); } +static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) +{ + do { + local_irq_enable(); + + if (thread_flags & _TIF_NEED_RESCHED) + schedule(); + + if (thread_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + if (thread_flags & _TIF_MTE_ASYNC_FAULT) { + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + send_sig_fault(SIGSEGV, SEGV_MTEAERR, + (void __user *)NULL, current); + } + + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + do_signal(regs); + + if (thread_flags & _TIF_NOTIFY_RESUME) + resume_user_mode_work(regs); + + if (thread_flags & _TIF_FOREIGN_FPSTATE) + fpsimd_restore_current_state(); + + local_irq_disable(); + thread_flags = read_thread_flags(); + } while (thread_flags & _TIF_WORK_MASK); +} + static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long flags; - local_daif_mask(); + local_irq_disable(); flags = read_thread_flags(); if (unlikely(flags & _TIF_WORK_MASK)) do_notify_resume(regs, flags); + local_daif_mask(); + lockdep_sys_exit(); } @@ -429,6 +463,24 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) exit_to_kernel_mode(regs); } +static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_gcs(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + +static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_mops(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -471,6 +523,12 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BTI: el1_bti(regs, esr); break; + case ESR_ELx_EC_GCS: + el1_gcs(regs, esr); + break; + case ESR_ELx_EC_MOPS: + el1_mops(regs, esr); + break; case ESR_ELx_EC_BREAKPT_CUR: case ESR_ELx_EC_SOFTSTP_CUR: case ESR_ELx_EC_WATCHPT_CUR: @@ -650,6 +708,14 @@ static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) exit_to_user_mode(regs); } +static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_gcs(regs, esr); + exit_to_user_mode(regs); +} + static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { enter_from_user_mode(regs); @@ -732,6 +798,9 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_MOPS: el0_mops(regs, esr); break; + case ESR_ELx_EC_GCS: + el0_gcs(regs, esr); + break; case ESR_ELx_EC_BREAKPT_LOW: case ESR_ELx_EC_SOFTSTP_LOW: case ESR_ELx_EC_WATCHPT_LOW: diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index f0c16640ef21..169ccf600066 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -329,24 +329,28 @@ SYM_FUNC_END(ftrace_stub_graph) * @fp is checked against the value passed by ftrace_graph_caller(). */ SYM_CODE_START(return_to_handler) - /* save return value regs */ - sub sp, sp, #FGRET_REGS_SIZE - stp x0, x1, [sp, #FGRET_REGS_X0] - stp x2, x3, [sp, #FGRET_REGS_X2] - stp x4, x5, [sp, #FGRET_REGS_X4] - stp x6, x7, [sp, #FGRET_REGS_X6] - str x29, [sp, #FGRET_REGS_FP] // parent's fp + /* Make room for ftrace_regs */ + sub sp, sp, #FREGS_SIZE + + /* Save return value regs */ + stp x0, x1, [sp, #FREGS_X0] + stp x2, x3, [sp, #FREGS_X2] + stp x4, x5, [sp, #FREGS_X4] + stp x6, x7, [sp, #FREGS_X6] + + /* Save the callsite's FP */ + str x29, [sp, #FREGS_FP] mov x0, sp - bl ftrace_return_to_handler // addr = ftrace_return_to_hander(regs); + bl ftrace_return_to_handler // addr = ftrace_return_to_hander(fregs); mov x30, x0 // restore the original return address - /* restore return value regs */ - ldp x0, x1, [sp, #FGRET_REGS_X0] - ldp x2, x3, [sp, #FGRET_REGS_X2] - ldp x4, x5, [sp, #FGRET_REGS_X4] - ldp x6, x7, [sp, #FGRET_REGS_X6] - add sp, sp, #FGRET_REGS_SIZE + /* Restore return value regs */ + ldp x0, x1, [sp, #FREGS_X0] + ldp x2, x3, [sp, #FREGS_X2] + ldp x4, x5, [sp, #FREGS_X4] + ldp x6, x7, [sp, #FREGS_X6] + add sp, sp, #FREGS_SIZE ret SYM_CODE_END(return_to_handler) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7ef0e127b149..5ae2a34b50bd 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -25,6 +25,7 @@ #include <asm/processor.h> #include <asm/ptrace.h> #include <asm/scs.h> +#include <asm/stacktrace/frame.h> #include <asm/thread_info.h> #include <asm/asm-uaccess.h> #include <asm/unistd.h> @@ -284,15 +285,16 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * For exceptions from EL0, create a final frame record. - * For exceptions from EL1, create a synthetic frame record so the - * interrupted code shows up in the backtrace. + * Create a metadata frame record. The unwinder will use this to + * identify and unwind exception boundaries. */ - .if \el == 0 stp xzr, xzr, [sp, #S_STACKFRAME] + .if \el == 0 + mov x0, #FRAME_META_TYPE_FINAL .else - stp x29, x22, [sp, #S_STACKFRAME] + mov x0, #FRAME_META_TYPE_PT_REGS .endif + str x0, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN @@ -315,7 +317,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING alternative_else_nop_endif mrs_s x20, SYS_ICC_PMR_EL1 - str x20, [sp, #S_PMR_SAVE] + str w20, [sp, #S_PMR] mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 @@ -342,7 +344,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING b .Lskip_pmr_restore\@ alternative_else_nop_endif - ldr x20, [sp, #S_PMR_SAVE] + ldr w20, [sp, #S_PMR] msr_s SYS_ICC_PMR_EL1, x20 /* Ensure priority change is seen by redistributor */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index f27acca550d5..8370d55f0353 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -359,6 +359,9 @@ static void task_fpsimd_load(void) WARN_ON(preemptible()); WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE)); + if (system_supports_fpmr()) + write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); + if (system_supports_sve() || system_supports_sme()) { switch (current->thread.fp_type) { case FP_STATE_FPSIMD: @@ -383,7 +386,7 @@ static void task_fpsimd_load(void) * fpsimd_save_user_state() or memory corruption, we * should always record an explicit format * when we save. We always at least have the - * memory allocated for FPSMID registers so + * memory allocated for FPSIMD registers so * try that and hope for the best. */ WARN_ON_ONCE(1); @@ -446,6 +449,9 @@ static void fpsimd_save_user_state(void) if (test_thread_flag(TIF_FOREIGN_FPSTATE)) return; + if (system_supports_fpmr()) + *(last->fpmr) = read_sysreg_s(SYS_FPMR); + /* * If a task is in a syscall the ABI allows us to only * preserve the state shared with FPSIMD so don't bother @@ -529,7 +535,7 @@ static unsigned int find_supported_vector_length(enum vec_type type, #if defined(CONFIG_ARM64_SVE) && defined(CONFIG_SYSCTL) -static int vec_proc_do_default_vl(struct ctl_table *table, int write, +static int vec_proc_do_default_vl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct vl_info *info = table->extra1; @@ -556,7 +562,7 @@ static int vec_proc_do_default_vl(struct ctl_table *table, int write, return 0; } -static struct ctl_table sve_default_vl_table[] = { +static const struct ctl_table sve_default_vl_table[] = { { .procname = "sve_default_vector_length", .mode = 0644, @@ -579,7 +585,7 @@ static int __init sve_sysctl_init(void) { return 0; } #endif /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ #if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL) -static struct ctl_table sme_default_vl_table[] = { +static const struct ctl_table sme_default_vl_table[] = { { .procname = "sme_default_vector_length", .mode = 0644, @@ -688,6 +694,12 @@ static void sve_to_fpsimd(struct task_struct *task) } } +void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) +{ + write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, + SYS_SCTLR_EL1); +} + #ifdef CONFIG_ARM64_SVE /* * Call __sve_free() directly only if you know task can't be scheduled @@ -1134,6 +1146,8 @@ void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p) { write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1); isb(); + + write_sysreg_s(0, SYS_ZCR_EL1); } void __init sve_setup(void) @@ -1245,6 +1259,9 @@ void cpu_enable_sme(const struct arm64_cpu_capabilities *__always_unused p) write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1); isb(); + /* Ensure all bits in SMCR are set to known values */ + write_sysreg_s(0, SYS_SMCR_EL1); + /* Allow EL0 to access TPIDR2 */ write_sysreg(read_sysreg(SCTLR_EL1) | SCTLR_ELx_ENTP2, SCTLR_EL1); isb(); @@ -1350,6 +1367,7 @@ static void sve_init_regs(void) } else { fpsimd_to_sve(current); current->thread.fp_type = FP_STATE_SVE; + fpsimd_flush_task_state(current); } } @@ -1518,6 +1536,27 @@ static void fpsimd_save_kernel_state(struct task_struct *task) task->thread.kernel_fpsimd_cpu = smp_processor_id(); } +/* + * Invalidate any task's FPSIMD state that is present on this cpu. + * The FPSIMD context should be acquired with get_cpu_fpsimd_context() + * before calling this function. + */ +static void fpsimd_flush_cpu_state(void) +{ + WARN_ON(!system_supports_fpsimd()); + __this_cpu_write(fpsimd_last_state.st, NULL); + + /* + * Leaving streaming mode enabled will cause issues for any kernel + * NEON and leaving streaming mode or ZA enabled may increase power + * consumption. + */ + if (system_supports_sme()) + sme_smstop(); + + set_thread_flag(TIF_FOREIGN_FPSTATE); +} + void fpsimd_thread_switch(struct task_struct *next) { bool wrong_task, wrong_cpu; @@ -1535,7 +1574,7 @@ void fpsimd_thread_switch(struct task_struct *next) if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) { fpsimd_load_kernel_state(next); - set_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE); + fpsimd_flush_cpu_state(); } else { /* * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's @@ -1656,31 +1695,6 @@ void fpsimd_signal_preserve_current_state(void) } /* - * Called by KVM when entering the guest. - */ -void fpsimd_kvm_prepare(void) -{ - if (!system_supports_sve()) - return; - - /* - * KVM does not save host SVE state since we can only enter - * the guest from a syscall so the ABI means that only the - * non-saved SVE state needs to be saved. If we have left - * SVE enabled for performance reasons then update the task - * state to be FPSIMD only. - */ - get_cpu_fpsimd_context(); - - if (test_and_clear_thread_flag(TIF_SVE)) { - sve_to_fpsimd(current); - current->thread.fp_type = FP_STATE_FPSIMD; - } - - put_cpu_fpsimd_context(); -} - -/* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling * this function. @@ -1696,6 +1710,7 @@ static void fpsimd_bind_task_to_cpu(void) last->sve_vl = task_get_sve_vl(current); last->sme_vl = task_get_sme_vl(current); last->svcr = ¤t->thread.svcr; + last->fpmr = ¤t->thread.uw.fpmr; last->fp_type = ¤t->thread.fp_type; last->to_save = FP_STATE_CURRENT; current->thread.fpsimd_cpu = smp_processor_id(); @@ -1825,27 +1840,6 @@ void fpsimd_flush_task_state(struct task_struct *t) } /* - * Invalidate any task's FPSIMD state that is present on this cpu. - * The FPSIMD context should be acquired with get_cpu_fpsimd_context() - * before calling this function. - */ -static void fpsimd_flush_cpu_state(void) -{ - WARN_ON(!system_supports_fpsimd()); - __this_cpu_write(fpsimd_last_state.st, NULL); - - /* - * Leaving streaming mode enabled will cause issues for any kernel - * NEON and leaving streaming mode or ZA enabled may increase power - * consumption. - */ - if (system_supports_sme()) - sme_smstop(); - - set_thread_flag(TIF_FOREIGN_FPSTATE); -} - -/* * Save the FPSIMD state to memory and invalidate cpu view. * This function must be called with preemption disabled. */ diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index a650f5e11fc5..d7c0d023dfe5 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -15,7 +15,7 @@ #include <asm/debug-monitors.h> #include <asm/ftrace.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS struct fregs_offset { @@ -23,10 +23,10 @@ struct fregs_offset { int offset; }; -#define FREGS_OFFSET(n, field) \ -{ \ - .name = n, \ - .offset = offsetof(struct ftrace_regs, field), \ +#define FREGS_OFFSET(n, field) \ +{ \ + .name = n, \ + .offset = offsetof(struct __arch_ftrace_regs, field), \ } static const struct fregs_offset fregs_offsets[] = { @@ -143,6 +143,69 @@ unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +/* Convert fentry_ip to the symbol address without kallsyms */ +unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) +{ + u32 insn; + + /* + * When using patchable-function-entry without pre-function NOPS, ftrace + * entry is the address of the first NOP after the function entry point. + * + * The compiler has either generated: + * + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * Or: + * + * func-04: BTI C + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * The fentry_ip is the address of `BL <caller>` which is at `func + 4` + * bytes in either case. + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return fentry_ip - AARCH64_INSN_SIZE; + + /* + * When using patchable-function-entry with pre-function NOPs, BTI is + * a bit different. + * + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * Or: + * + * func+00: func: BTI C + * func+04: NOP // To be patched to MOV X9, LR + * func+08: NOP // To be patched to BL <caller> + * + * The fentry_ip is the address of `BL <caller>` which is at either + * `func + 4` or `func + 8` depends on whether there is a BTI. + */ + + /* If there is no BTI, the func address should be one instruction before. */ + if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + return fentry_ip - AARCH64_INSN_SIZE; + + /* We want to be extra safe in case entry ip is on the page edge, + * but otherwise we need to avoid get_kernel_nofault()'s overhead. + */ + if ((fentry_ip & ~PAGE_MASK) < AARCH64_INSN_SIZE * 2) { + if (get_kernel_nofault(insn, (u32 *)(fentry_ip - AARCH64_INSN_SIZE * 2))) + return 0; + } else { + insn = *(u32 *)(fentry_ip - AARCH64_INSN_SIZE * 2); + } + + if (aarch64_insn_is_bti(le32_to_cpu((__le32)insn))) + return fentry_ip - AARCH64_INSN_SIZE * 2; + + return fentry_ip - AARCH64_INSN_SIZE; +} + /* * Replace a single instruction, which may be a branch or NOP. * If @validate == true, a replaced instruction is checked against 'old'. @@ -481,7 +544,20 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - prepare_ftrace_return(ip, &fregs->lr, fregs->fp); + unsigned long return_hooker = (unsigned long)&return_to_handler; + unsigned long frame_pointer = arch_ftrace_regs(fregs)->fp; + unsigned long *parent = &arch_ftrace_regs(fregs)->lr; + unsigned long old; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + old = *parent; + + if (!function_graph_enter_regs(old, ip, frame_pointer, + (void *)frame_pointer, fregs)) { + *parent = return_hooker; + } } #else /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 99f8a8c082d3..2ce73525de2c 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -32,6 +32,7 @@ #include <asm/scs.h> #include <asm/smp.h> #include <asm/sysreg.h> +#include <asm/stacktrace/frame.h> #include <asm/thread_info.h> #include <asm/virt.h> @@ -41,26 +42,6 @@ #error PAGE_OFFSET must be at least 2MB aligned #endif - .macro putc_base_pa, base:req - mov \base, #0xf0000000 - add \base, \base, #0x00512000 - .endm - .macro putc_base_va, base:req - mov \base, #0xffffff8000000000 - orr \base, \base, #0x30000000 - add \base, \base, #0x00512000 - .endm - .macro wait, base:req, val:req -1: ldrb \val, [\base, #5 * 4] - tbz \val, #6, 1b - .endm - .macro putc, base:req, val:req, ch - .ifnb \ch - mov \val, \ch - .endif - strb \val, [\base, #0] - .endm - /* * Kernel startup entry point. * --------------------------- @@ -100,127 +81,42 @@ * x19 primary_entry() .. start_kernel() whether we entered with the MMU on * x20 primary_entry() .. __primary_switch() CPU boot mode * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 - * x22 create_idmap() .. start_kernel() ID map VA of the DT blob - * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset - * x24 __primary_switch() linear map KASLR seed - * x25 primary_entry() .. start_kernel() supported VA size - * x28 create_idmap() callee preserved temp register */ SYM_CODE_START(primary_entry) -#if 0 - putc_base_pa x21 - putc x21, w23, #'x' - wait x21, w23 - putc x21, w23, #'0' - wait x21, w23 - putc x21, w23, #':' - mov x19, x0 - mov x20, #16 -1: wait x21, w23 - lsr x23, x19, #60 - and w23, w23, #15 - cmp w23, #10 - bcc 2f - add w23, w23, #7 -2: add w23, w23, #'0' - putc x21, w23 - lsl x19, x19, #4 - subs x20, x20, #1 - bcs 1b - wait x21, w23 - putc x21, w23, #'\r' - wait x21, w23 - putc x21, w23, #'\n' - wait x21, w23 - - putc x21, w23, #'x' - wait x21, w23 - putc x21, w23, #'1' - wait x21, w23 - putc x21, w23, #':' - mov x19, x1 - mov x20, #16 -1: wait x21, w23 - lsr x23, x19, #60 - and w23, w23, #15 - cmp w23, #10 - bcc 2f - add w23, w23, #7 -2: add w23, w23, #'0' - putc x21, w23 - lsl x19, x19, #4 - subs x20, x20, #1 - bcs 1b - wait x21, w23 - putc x21, w23, #'\r' - wait x21, w23 - putc x21, w23, #'\n' - wait x21, w23 - - putc x21, w23, #'x' - wait x21, w23 - putc x21, w23, #'2' - wait x21, w23 - putc x21, w23, #':' - mov x19, x2 - mov x20, #16 -1: wait x21, w23 - lsr x23, x19, #60 - and w23, w23, #15 - cmp w23, #10 - bcc 2f - add w23, w23, #7 -2: add w23, w23, #'0' - putc x21, w23 - lsl x19, x19, #4 - subs x20, x20, #1 - bcs 1b - wait x21, w23 - putc x21, w23, #'\r' - wait x21, w23 - putc x21, w23, #'\n' - wait x21, w23 - - putc x21, w23, #'x' - wait x21, w23 - putc x21, w23, #'3' - wait x21, w23 - putc x21, w23, #':' - mov x19, x3 - mov x20, #16 -1: wait x21, w23 - lsr x23, x19, #60 - and w23, w23, #15 - cmp w23, #10 - bcc 2f - add w23, w23, #7 -2: add w23, w23, #'0' - putc x21, w23 - lsl x19, x19, #4 - subs x20, x20, #1 - bcs 1b - wait x21, w23 - putc x21, w23, #'\r' - wait x21, w23 - putc x21, w23, #'\n' - wait x21, w23 -#endif bl record_mmu_state - bl preserve_boot_args - bl create_idmap + + adrp x1, early_init_stack + mov sp, x1 + mov x29, xzr + adrp x0, init_idmap_pg_dir + mov x1, xzr + bl __pi_create_init_idmap + + /* + * If the page tables have been populated with non-cacheable + * accesses (MMU disabled), invalidate those tables again to + * remove any speculatively loaded cache lines. + */ + cbnz x19, 0f + dmb sy + mov x1, x0 // end of used region + adrp x0, init_idmap_pg_dir + adr_l x2, dcache_inval_poc + blr x2 + b 1f /* * If we entered with the MMU and caches on, clean the ID mapped part * of the primary boot code to the PoC so we can safely execute it with * the MMU off. */ - cbz x19, 0f - adrp x0, __idmap_text_start +0: adrp x0, __idmap_text_start adr_l x1, __idmap_text_end adr_l x2, dcache_clean_poc blr x2 -0: mov x0, x19 + +1: mov x0, x19 bl init_kernel_el // w0=cpu_boot_mode mov x20, x0 @@ -230,14 +126,6 @@ SYM_CODE_START(primary_entry) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ -#if VA_BITS > 48 - mrs_s x0, SYS_ID_AA64MMFR2_EL1 - tst x0, ID_AA64MMFR2_EL1_VARange_MASK - mov x0, #VA_BITS - mov x25, #VA_BITS_MIN - csel x25, x25, x0, eq - mov x0, x25 -#endif bl __cpu_setup // initialise processor b __primary_switch SYM_CODE_END(primary_entry) @@ -296,275 +184,6 @@ SYM_CODE_START_LOCAL(preserve_boot_args) ret SYM_CODE_END(preserve_boot_args) -SYM_FUNC_START_LOCAL(clear_page_tables) - /* - * Clear the init page tables. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - sub x2, x1, x0 - mov x1, xzr - b __pi_memset // tail call -SYM_FUNC_END(clear_page_tables) - -/* - * Macro to populate page table entries, these entries can be pointers to the next level - * or last level entries pointing to physical memory. - * - * tbl: page table address - * rtbl: pointer to page table or physical memory - * index: start index to write - * eindex: end index to write - [index, eindex] written to - * flags: flags for pagetable entry to or in - * inc: increment to rtbl between each entry - * tmp1: temporary variable - * - * Preserves: tbl, eindex, flags, inc - * Corrupts: index, tmp1 - * Returns: rtbl - */ - .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1 -.Lpe\@: phys_to_pte \tmp1, \rtbl - orr \tmp1, \tmp1, \flags // tmp1 = table entry - str \tmp1, [\tbl, \index, lsl #3] - add \rtbl, \rtbl, \inc // rtbl = pa next level - add \index, \index, #1 - cmp \index, \eindex - b.ls .Lpe\@ - .endm - -/* - * Compute indices of table entries from virtual address range. If multiple entries - * were needed in the previous page table level then the next page table level is assumed - * to be composed of multiple pages. (This effectively scales the end index). - * - * vstart: virtual address of start of range - * vend: virtual address of end of range - we map [vstart, vend] - * shift: shift used to transform virtual address into index - * order: #imm 2log(number of entries in page table) - * istart: index in table corresponding to vstart - * iend: index in table corresponding to vend - * count: On entry: how many extra entries were required in previous level, scales - * our end index. - * On exit: returns how many extra entries required for next page table level - * - * Preserves: vstart, vend - * Returns: istart, iend, count - */ - .macro compute_indices, vstart, vend, shift, order, istart, iend, count - ubfx \istart, \vstart, \shift, \order - ubfx \iend, \vend, \shift, \order - add \iend, \iend, \count, lsl \order - sub \count, \iend, \istart - .endm - -/* - * Map memory for specified virtual address range. Each level of page table needed supports - * multiple entries. If a level requires n entries the next page table level is assumed to be - * formed from n pages. - * - * tbl: location of page table - * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) - * vstart: virtual address of start of range - * vend: virtual address of end of range - we map [vstart, vend - 1] - * flags: flags to use to map last level entries - * phys: physical address corresponding to vstart - physical memory is contiguous - * order: #imm 2log(number of entries in PGD table) - * - * If extra_shift is set, an extra level will be populated if the end address does - * not fit in 'extra_shift' bits. This assumes vend is in the TTBR0 range. - * - * Temporaries: istart, iend, tmp, count, sv - these need to be different registers - * Preserves: vstart, flags - * Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv - */ - .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv, extra_shift - sub \vend, \vend, #1 - add \rtbl, \tbl, #PAGE_SIZE - mov \count, #0 - - .ifnb \extra_shift - tst \vend, #~((1 << (\extra_shift)) - 1) - b.eq .L_\@ - compute_indices \vstart, \vend, #\extra_shift, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv - .endif -.L_\@: - compute_indices \vstart, \vend, #PGDIR_SHIFT, #\order, \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv - -#if SWAPPER_PGTABLE_LEVELS > 3 - compute_indices \vstart, \vend, #PUD_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv -#endif - -#if SWAPPER_PGTABLE_LEVELS > 2 - compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv -#endif - - compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - bic \rtbl, \phys, #SWAPPER_BLOCK_SIZE - 1 - populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp - .endm - -/* - * Remap a subregion created with the map_memory macro with modified attributes - * or output address. The entire remapped region must have been covered in the - * invocation of map_memory. - * - * x0: last level table address (returned in first argument to map_memory) - * x1: start VA of the existing mapping - * x2: start VA of the region to update - * x3: end VA of the region to update (exclusive) - * x4: start PA associated with the region to update - * x5: attributes to set on the updated region - * x6: order of the last level mappings - */ -SYM_FUNC_START_LOCAL(remap_region) - sub x3, x3, #1 // make end inclusive - - // Get the index offset for the start of the last level table - lsr x1, x1, x6 - bfi x1, xzr, #0, #PAGE_SHIFT - 3 - - // Derive the start and end indexes into the last level table - // associated with the provided region - lsr x2, x2, x6 - lsr x3, x3, x6 - sub x2, x2, x1 - sub x3, x3, x1 - - mov x1, #1 - lsl x6, x1, x6 // block size at this level - - populate_entries x0, x4, x2, x3, x5, x6, x7 - ret -SYM_FUNC_END(remap_region) - -SYM_FUNC_START_LOCAL(create_idmap) - mov x28, lr - /* - * The ID map carries a 1:1 mapping of the physical address range - * covered by the loaded image, which could be anywhere in DRAM. This - * means that the required size of the VA (== PA) space is decided at - * boot time, and could be more than the configured size of the VA - * space for ordinary kernel and user space mappings. - * - * There are three cases to consider here: - * - 39 <= VA_BITS < 48, and the ID map needs up to 48 VA bits to cover - * the placement of the image. In this case, we configure one extra - * level of translation on the fly for the ID map only. (This case - * also covers 42-bit VA/52-bit PA on 64k pages). - * - * - VA_BITS == 48, and the ID map needs more than 48 VA bits. This can - * only happen when using 64k pages, in which case we need to extend - * the root level table rather than add a level. Note that we can - * treat this case as 'always extended' as long as we take care not - * to program an unsupported T0SZ value into the TCR register. - * - * - Combinations that would require two additional levels of - * translation are not supported, e.g., VA_BITS==36 on 16k pages, or - * VA_BITS==39/4k pages with 5-level paging, where the input address - * requires more than 47 or 48 bits, respectively. - */ -#if (VA_BITS < 48) -#define IDMAP_PGD_ORDER (VA_BITS - PGDIR_SHIFT) -#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) - - /* - * If VA_BITS < 48, we have to configure an additional table level. - * First, we have to verify our assumption that the current value of - * VA_BITS was chosen such that all translation levels are fully - * utilised, and that lowering T0SZ will always result in an additional - * translation level to be configured. - */ -#if VA_BITS != EXTRA_SHIFT -#error "Mismatch between VA_BITS and page size/number of translation levels" -#endif -#else -#define IDMAP_PGD_ORDER (PHYS_MASK_SHIFT - PGDIR_SHIFT) -#define EXTRA_SHIFT - /* - * If VA_BITS == 48, we don't have to configure an additional - * translation level, but the top-level table has more entries. - */ -#endif - adrp x0, init_idmap_pg_dir - adrp x3, _text - adrp x6, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE - mov_q x7, SWAPPER_RX_MMUFLAGS - - map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT - - /* Remap the kernel page tables r/w in the ID map */ - adrp x1, _text - adrp x2, init_pg_dir - adrp x3, init_pg_end - bic x4, x2, #SWAPPER_BLOCK_SIZE - 1 - mov_q x5, SWAPPER_RW_MMUFLAGS - mov x6, #SWAPPER_BLOCK_SHIFT - bl remap_region - - /* Remap the FDT after the kernel image */ - adrp x1, _text - adrp x22, _end + SWAPPER_BLOCK_SIZE - bic x2, x22, #SWAPPER_BLOCK_SIZE - 1 - bfi x22, x21, #0, #SWAPPER_BLOCK_SHIFT // remapped FDT address - add x3, x2, #MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE - bic x4, x21, #SWAPPER_BLOCK_SIZE - 1 - mov_q x5, SWAPPER_RW_MMUFLAGS - mov x6, #SWAPPER_BLOCK_SHIFT - bl remap_region - -#if 0 - mov x7, (PMD_ATTRINDX(MT_DEVICE_nGnRnE) | SWAPPER_PMD_FLAGS) - mov x3, #0xf0000000 - mov x4, #0x30000000 - mov x6, #0x31000000 - create_block_map x0, x7, x3, x4, x6 -#endif - - /* - * Since the page tables have been populated with non-cacheable - * accesses (MMU disabled), invalidate those tables again to - * remove any speculatively loaded cache lines. - */ - cbnz x19, 0f // skip cache invalidation if MMU is on - dmb sy - - adrp x0, init_idmap_pg_dir - adrp x1, init_idmap_pg_end - bl dcache_inval_poc -0: ret x28 -SYM_FUNC_END(create_idmap) - -SYM_FUNC_START_LOCAL(create_kernel_mapping) - adrp x0, init_pg_dir - mov_q x5, KIMAGE_VADDR // compile time __va(_text) -#ifdef CONFIG_RELOCATABLE - add x5, x5, x23 // add KASLR displacement -#endif - adrp x6, _end // runtime __pa(_end) - adrp x3, _text // runtime __pa(_text) - sub x6, x6, x3 // _end - _text - add x6, x6, x5 // runtime __va(_end) - mov_q x7, SWAPPER_RW_MMUFLAGS - - map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 - - dsb ishst // sync with page table walker - ret -SYM_FUNC_END(create_kernel_mapping) - /* * Initialize CPU registers with task-specific and cpu-specific context. * @@ -581,6 +200,8 @@ SYM_FUNC_END(create_kernel_mapping) sub sp, sp, #PT_REGS_SIZE stp xzr, xzr, [sp, #S_STACKFRAME] + mov \tmp1, #FRAME_META_TYPE_FINAL + str \tmp1, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME scs_load_current @@ -616,34 +237,9 @@ SYM_FUNC_START_LOCAL(__primary_switched) mov x0, x20 bl set_cpu_boot_mode_flag - // Clear BSS - adr_l x0, __bss_start - mov x1, xzr - adr_l x2, __bss_stop - sub x2, x2, x0 - bl __pi_memset - dsb ishst // Make zero page visible to PTW - -#if VA_BITS > 48 - adr_l x8, vabits_actual // Set this early so KASAN early init - str x25, [x8] // ... observes the correct value - dc civac, x8 // Make visible to booting secondaries -#endif - -#ifdef CONFIG_RANDOMIZE_BASE - adrp x5, memstart_offset_seed // Save KASLR linear map seed - strh w24, [x5, :lo12:memstart_offset_seed] -#endif #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif - mov x0, x21 // pass FDT address in x0 - bl early_fdt_map // Try mapping the FDT early - mov x0, x20 // pass the full boot status - bl init_feature_override // Parse cpu feature overrides -#ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS - bl scs_patch_vmlinux -#endif mov x0, x20 bl finalise_el2 // Prefer VHE if possible ldp x29, x30, [sp], #16 @@ -696,11 +292,14 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) adr_l x1, __hyp_text_end adr_l x2, dcache_clean_poc blr x2 -0: - mov_q x0, HCR_HOST_NVHE_FLAGS - msr hcr_el2, x0 + + mov_q x0, INIT_SCTLR_EL2_MMU_OFF + pre_disable_mmu_workaround + msr sctlr_el2, x0 isb +0: + init_el2_hcr HCR_HOST_NVHE_FLAGS init_el2_state /* Hypervisor stub */ @@ -710,27 +309,21 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) mov_q x1, INIT_SCTLR_EL1_MMU_OFF - /* - * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, - * making it impossible to start in nVHE mode. Is that - * compliant with the architecture? Absolutely not! - */ mrs x0, hcr_el2 and x0, x0, #HCR_E2H - cbz x0, 1f + cbz x0, 2f /* Set a sane SCTLR_EL1, the VHE way */ - pre_disable_mmu_workaround msr_s SYS_SCTLR_EL12, x1 mov x2, #BOOT_CPU_FLAG_E2H - b 2f + b 3f -1: - pre_disable_mmu_workaround +2: msr sctlr_el1, x1 mov x2, xzr -2: - __init_el2_nvhe_prepare_eret +3: + mov x0, #INIT_PSTATE_EL1 + msr spsr_el2, x0 mov w0, #BOOT_CPU_MODE_EL2 orr x0, x0, x2 @@ -770,10 +363,13 @@ SYM_FUNC_START_LOCAL(secondary_startup) * Common entry point for secondary CPUs. */ mov x20, x0 // preserve boot mode + +#ifdef CONFIG_ARM64_VA_BITS_52 +alternative_if ARM64_HAS_VA52 bl __cpu_secondary_check52bitva -#if VA_BITS > 48 - ldr_l x0, vabits_actual +alternative_else_nop_endif #endif + bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir adrp x2, idmap_pg_dir @@ -876,15 +472,18 @@ SYM_FUNC_START(__enable_mmu) ret SYM_FUNC_END(__enable_mmu) +#ifdef CONFIG_ARM64_VA_BITS_52 SYM_FUNC_START(__cpu_secondary_check52bitva) -#if VA_BITS > 48 - ldr_l x0, vabits_actual - cmp x0, #52 - b.ne 2f - +#ifndef CONFIG_ARM64_LPA2 mrs_s x0, SYS_ID_AA64MMFR2_EL1 and x0, x0, ID_AA64MMFR2_EL1_VARange_MASK cbnz x0, 2f +#else + mrs x0, id_aa64mmfr0_el1 + sbfx x0, x0, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 + cmp x0, #ID_AA64MMFR0_EL1_TGRAN_LPA2 + b.ge 2f +#endif update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_52_BIT_VA, x0, x1 @@ -892,9 +491,9 @@ SYM_FUNC_START(__cpu_secondary_check52bitva) wfi b 1b -#endif 2: ret SYM_FUNC_END(__cpu_secondary_check52bitva) +#endif SYM_FUNC_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ @@ -906,123 +505,18 @@ SYM_FUNC_START_LOCAL(__no_granule_support) b 1b SYM_FUNC_END(__no_granule_support) -#ifdef CONFIG_RELOCATABLE -SYM_FUNC_START_LOCAL(__relocate_kernel) - /* - * Iterate over each entry in the relocation table, and apply the - * relocations in place. - */ - adr_l x9, __rela_start - adr_l x10, __rela_end - mov_q x11, KIMAGE_VADDR // default virtual offset - add x11, x11, x23 // actual virtual offset - -0: cmp x9, x10 - b.hs 1f - ldp x12, x13, [x9], #24 - ldr x14, [x9, #-8] - cmp w13, #R_AARCH64_RELATIVE - b.ne 0b - add x14, x14, x23 // relocate - str x14, [x12, x23] - b 0b - -1: -#ifdef CONFIG_RELR - /* - * Apply RELR relocations. - * - * RELR is a compressed format for storing relative relocations. The - * encoded sequence of entries looks like: - * [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBB1 ... ] - * - * i.e. start with an address, followed by any number of bitmaps. The - * address entry encodes 1 relocation. The subsequent bitmap entries - * encode up to 63 relocations each, at subsequent offsets following - * the last address entry. - * - * The bitmap entries must have 1 in the least significant bit. The - * assumption here is that an address cannot have 1 in lsb. Odd - * addresses are not supported. Any odd addresses are stored in the RELA - * section, which is handled above. - * - * Excluding the least significant bit in the bitmap, each non-zero - * bit in the bitmap represents a relocation to be applied to - * a corresponding machine word that follows the base address - * word. The second least significant bit represents the machine - * word immediately following the initial address, and each bit - * that follows represents the next word, in linear order. As such, - * a single bitmap can encode up to 63 relocations in a 64-bit object. - * - * In this implementation we store the address of the next RELR table - * entry in x9, the address being relocated by the current address or - * bitmap entry in x13 and the address being relocated by the current - * bit in x14. - */ - adr_l x9, __relr_start - adr_l x10, __relr_end - -2: cmp x9, x10 - b.hs 7f - ldr x11, [x9], #8 - tbnz x11, #0, 3f // branch to handle bitmaps - add x13, x11, x23 - ldr x12, [x13] // relocate address entry - add x12, x12, x23 - str x12, [x13], #8 // adjust to start of bitmap - b 2b - -3: mov x14, x13 -4: lsr x11, x11, #1 - cbz x11, 6f - tbz x11, #0, 5f // skip bit if not set - ldr x12, [x14] // relocate bit - add x12, x12, x23 - str x12, [x14] - -5: add x14, x14, #8 // move to next bit's address - b 4b - -6: /* - * Move to the next bitmap's address. 8 is the word size, and 63 is the - * number of significant bits in a bitmap entry. - */ - add x13, x13, #(8 * 63) - b 2b - -7: -#endif - ret - -SYM_FUNC_END(__relocate_kernel) -#endif - SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir adrp x2, init_idmap_pg_dir bl __enable_mmu -#ifdef CONFIG_RELOCATABLE - adrp x23, KERNEL_START - and x23, x23, MIN_KIMG_ALIGN - 1 -#ifdef CONFIG_RANDOMIZE_BASE - mov x0, x22 - adrp x1, init_pg_end + + adrp x1, early_init_stack mov sp, x1 mov x29, xzr - bl __pi_kaslr_early_init - and x24, x0, #SZ_2M - 1 // capture memstart offset seed - bic x0, x0, #SZ_2M - 1 - orr x23, x23, x0 // record kernel offset -#endif -#endif - bl clear_page_tables - bl create_kernel_mapping + mov x0, x20 // pass the full boot status + mov x1, x21 // pass the FDT + bl __pi_early_map_kernel // Map and relocate the kernel - adrp x1, init_pg_dir - load_ttbr1 x1, x1, x2 -#ifdef CONFIG_RELOCATABLE - bl __relocate_kernel -#endif ldr x8, =__primary_switched adrp x0, KERNEL_START // __pa(KERNEL_START) br x8 diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 02870beb271e..18749e9a6c2d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -266,9 +266,15 @@ static int swsusp_mte_save_tags(void) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { struct page *page = pfn_to_online_page(pfn); + struct folio *folio; if (!page) continue; + folio = page_folio(page); + + if (folio_test_hugetlb(folio) && + !folio_test_hugetlb_mte_tagged(folio)) + continue; if (!page_mte_tagged(page)) continue; @@ -407,7 +413,7 @@ int swsusp_arch_resume(void) void *, phys_addr_t, phys_addr_t); struct trans_pgd_info trans_info = { .trans_alloc_page = hibernate_page_alloc, - .trans_alloc_arg = (void *)GFP_ATOMIC, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, }; /* diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 35225632d70a..722ac45f9f7b 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -21,6 +21,7 @@ #include <asm/current.h> #include <asm/debug-monitors.h> +#include <asm/esr.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/cputype.h> @@ -654,7 +655,7 @@ static int breakpoint_handler(unsigned long unused, unsigned long esr, perf_bp_event(bp, regs); /* Do we need to handle the stepping? */ - if (uses_default_overflow_handler(bp)) + if (is_default_overflow_handler(bp)) step = 1; unlock: rcu_read_unlock(); @@ -733,7 +734,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, static int watchpoint_report(struct perf_event *wp, unsigned long addr, struct pt_regs *regs) { - int step = uses_default_overflow_handler(wp); + int step = is_default_overflow_handler(wp); struct arch_hw_breakpoint *info = counter_arch_bp(wp); info->trigger = addr; @@ -779,7 +780,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, * Check that the access type matches. * 0 => load, otherwise => store */ - access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : + access = (esr & ESR_ELx_WNR) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) continue; diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 65f76064c86b..ae990da1eae5 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -114,8 +114,8 @@ SYM_CODE_START_LOCAL(__finalise_el2) // Use EL2 translations for SPE & TRBE and disable access from EL1 mrs x0, mdcr_el2 - bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) - bic x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) + bic x0, x0, #MDCR_EL2_E2PB_MASK + bic x0, x0, #MDCR_EL2_E2TB_MASK msr mdcr_el2, x0 // Transfer the MM state from EL1 to EL2 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 5e4dc72ab1bd..ef3a69cc398e 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -36,6 +36,41 @@ PROVIDE(__pi___memcpy = __pi_memcpy); PROVIDE(__pi___memmove = __pi_memmove); PROVIDE(__pi___memset = __pi_memset); +PROVIDE(__pi_id_aa64isar1_override = id_aa64isar1_override); +PROVIDE(__pi_id_aa64isar2_override = id_aa64isar2_override); +PROVIDE(__pi_id_aa64mmfr0_override = id_aa64mmfr0_override); +PROVIDE(__pi_id_aa64mmfr1_override = id_aa64mmfr1_override); +PROVIDE(__pi_id_aa64mmfr2_override = id_aa64mmfr2_override); +PROVIDE(__pi_id_aa64pfr0_override = id_aa64pfr0_override); +PROVIDE(__pi_id_aa64pfr1_override = id_aa64pfr1_override); +PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); +PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); +PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); +PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings); +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus); +#endif +PROVIDE(__pi__ctype = _ctype); +PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); + +PROVIDE(__pi_init_idmap_pg_dir = init_idmap_pg_dir); +PROVIDE(__pi_init_idmap_pg_end = init_idmap_pg_end); +PROVIDE(__pi_init_pg_dir = init_pg_dir); +PROVIDE(__pi_init_pg_end = init_pg_end); +PROVIDE(__pi_swapper_pg_dir = swapper_pg_dir); + +PROVIDE(__pi__text = _text); +PROVIDE(__pi__stext = _stext); +PROVIDE(__pi__etext = _etext); +PROVIDE(__pi___start_rodata = __start_rodata); +PROVIDE(__pi___inittext_begin = __inittext_begin); +PROVIDE(__pi___inittext_end = __inittext_end); +PROVIDE(__pi___initdata_begin = __initdata_begin); +PROVIDE(__pi___initdata_end = __initdata_end); +PROVIDE(__pi__data = _data); +PROVIDE(__pi___bss_start = __bss_start); +PROVIDE(__pi__end = _end); + #ifdef CONFIG_KVM /* @@ -70,10 +105,8 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors); KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); -#ifdef CONFIG_ARM64_PSEUDO_NMI -/* Static key checked in GIC_PRIO_IRQOFF. */ -KVM_NVHE_ALIAS(gic_nonsecure_priorities); -#endif +/* Static key which is set if CNTVOFF_EL2 is unusable */ +KVM_NVHE_ALIAS(broken_cntvoff_key); /* EL2 exception handling */ KVM_NVHE_ALIAS(__start___kvm_ex_table); diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index aa7a4ec6a3ae..fe86ada23c7d 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -10,88 +10,43 @@ #include <linux/io.h> /* - * Copy data from IO memory space to "real" memory space. + * This generates a memcpy that works on a from/to address which is aligned to + * bits. Count is in terms of the number of bits sized quantities to copy. It + * optimizes to use the STR groupings when possible so that it is WC friendly. */ -void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) +#define memcpy_toio_aligned(to, from, count, bits) \ + ({ \ + volatile u##bits __iomem *_to = to; \ + const u##bits *_from = from; \ + size_t _count = count; \ + const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \ + \ + for (; _from < _end_from; _from += 8, _to += 8) \ + __const_memcpy_toio_aligned##bits(_to, _from, 8); \ + if ((_count % 8) >= 4) { \ + __const_memcpy_toio_aligned##bits(_to, _from, 4); \ + _from += 4; \ + _to += 4; \ + } \ + if ((_count % 4) >= 2) { \ + __const_memcpy_toio_aligned##bits(_to, _from, 2); \ + _from += 2; \ + _to += 2; \ + } \ + if (_count % 2) \ + __const_memcpy_toio_aligned##bits(_to, _from, 1); \ + }) + +void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count) { - while (count && !IS_ALIGNED((unsigned long)from, 8)) { - *(u8 *)to = __raw_readb(from); - from++; - to++; - count--; - } - - while (count >= 8) { - *(u64 *)to = __raw_readq(from); - from += 8; - to += 8; - count -= 8; - } - - while (count) { - *(u8 *)to = __raw_readb(from); - from++; - to++; - count--; - } + memcpy_toio_aligned(to, from, count, 64); + dgh(); } -EXPORT_SYMBOL(__memcpy_fromio); +EXPORT_SYMBOL(__iowrite64_copy_full); -/* - * Copy data from "real" memory space to IO memory space. - */ -void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) +void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count) { - while (count && !IS_ALIGNED((unsigned long)to, 8)) { - __raw_writeb(*(u8 *)from, to); - from++; - to++; - count--; - } - - while (count >= 8) { - __raw_writeq(*(u64 *)from, to); - from += 8; - to += 8; - count -= 8; - } - - while (count) { - __raw_writeb(*(u8 *)from, to); - from++; - to++; - count--; - } -} -EXPORT_SYMBOL(__memcpy_toio); - -/* - * "memset" on IO memory space. - */ -void __memset_io(volatile void __iomem *dst, int c, size_t count) -{ - u64 qc = (u8)c; - - qc |= qc << 8; - qc |= qc << 16; - qc |= qc << 32; - - while (count && !IS_ALIGNED((unsigned long)dst, 8)) { - __raw_writeb(c, dst); - dst++; - count--; - } - - while (count >= 8) { - __raw_writeq(qc, dst); - dst += 8; - count -= 8; - } - - while (count) { - __raw_writeb(c, dst); - dst++; - count--; - } + memcpy_toio_aligned(to, from, count, 32); + dgh(); } -EXPORT_SYMBOL(__memset_io); +EXPORT_SYMBOL(__iowrite32_copy_full); diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index faf88ec9c48e..b345425193d2 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -7,11 +7,12 @@ */ #include <linux/kernel.h> #include <linux/jump_label.h> +#include <linux/smp.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) { void *addr = (void *)jump_entry_code(entry); u32 insn; @@ -25,4 +26,10 @@ void arch_jump_label_transform(struct jump_entry *entry, } aarch64_insn_patch_text_nosync(addr, insn); + return true; +} + +void arch_jump_label_transform_apply(void) +{ + kick_all_cpus_sync(); } diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 12c7f3c8ba76..1da3e25f9d9e 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -16,9 +16,7 @@ bool __ro_after_init __kaslr_is_enabled = false; void __init kaslr_init(void) { - if (cpuid_feature_extract_unsigned_field(arm64_sw_feature_override.val & - arm64_sw_feature_override.mask, - ARM64_SW_FEATURE_OVERRIDE_NOKASLR)) { + if (kaslr_disabled_cmdline()) { pr_info("KASLR disabled on command line\n"); return; } diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 4e1f983df3d1..f3c4d3a8a20f 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -17,7 +17,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/traps.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index b38aae5b488d..6f121a0164a4 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -207,37 +207,6 @@ void machine_kexec(struct kimage *kimage) BUG(); /* Should never get here. */ } -static void machine_kexec_mask_interrupts(void) -{ - unsigned int i; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - struct irq_chip *chip; - int ret; - - chip = irq_desc_get_chip(desc); - if (!chip) - continue; - - /* - * First try to remove the active state. If this - * fails, try to EOI the interrupt. - */ - ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false); - - if (ret && irqd_irq_inprogress(&desc->irq_data) && - chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); - - if (chip->irq_mask) - chip->irq_mask(&desc->irq_data); - - if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) - chip->irq_disable(&desc->irq_data); - } -} - /** * machine_crash_shutdown - shutdown non-crashing cpus and save registers */ @@ -255,7 +224,7 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Starting crashdump kernel...\n"); } -#ifdef CONFIG_HIBERNATION +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) /* * To preserve the crash dump kernel image, the relevant memory segments * should be mapped again around the hibernation. diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 0e017358f4ba..af1ca875c52c 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -39,6 +39,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } +#ifdef CONFIG_CRASH_DUMP static int prepare_elf_headers(void **addr, unsigned long *sz) { struct crash_mem *cmem; @@ -80,6 +81,7 @@ out: kfree(cmem); return ret; } +#endif /* * Tries to add the initrd and DTB to the image. If it is not possible to find @@ -93,8 +95,8 @@ int load_other_segments(struct kimage *image, char *cmdline) { struct kexec_buf kbuf; - void *headers, *dtb = NULL; - unsigned long headers_sz, initrd_load_addr = 0, dtb_len, + void *dtb = NULL; + unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; int ret = 0; @@ -102,7 +104,10 @@ int load_other_segments(struct kimage *image, /* not allocate anything below the kernel */ kbuf.buf_min = kernel_load_addr + kernel_size; +#ifdef CONFIG_CRASH_DUMP /* load elf core header */ + void *headers; + unsigned long headers_sz; if (image->type == KEXEC_TYPE_CRASH) { ret = prepare_elf_headers(&headers, &headers_sz); if (ret) { @@ -130,6 +135,7 @@ int load_other_segments(struct kimage *image, kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", image->elf_load_addr, kbuf.bufsz, kbuf.memsz); } +#endif /* load initrd */ if (initrd) { diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index dd851297596e..06bb680bfe97 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -12,144 +12,18 @@ #include <linux/bitops.h> #include <linux/elf.h> #include <linux/ftrace.h> -#include <linux/gfp.h> #include <linux/kasan.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/moduleloader.h> #include <linux/random.h> #include <linux/scs.h> -#include <linux/vmalloc.h> #include <asm/alternative.h> #include <asm/insn.h> #include <asm/scs.h> #include <asm/sections.h> -static u64 module_direct_base __ro_after_init = 0; -static u64 module_plt_base __ro_after_init = 0; - -/* - * Choose a random page-aligned base address for a window of 'size' bytes which - * entirely contains the interval [start, end - 1]. - */ -static u64 __init random_bounding_box(u64 size, u64 start, u64 end) -{ - u64 max_pgoff, pgoff; - - if ((end - start) >= size) - return 0; - - max_pgoff = (size - (end - start)) / PAGE_SIZE; - pgoff = get_random_u32_inclusive(0, max_pgoff); - - return start - pgoff * PAGE_SIZE; -} - -/* - * Modules may directly reference data and text anywhere within the kernel - * image and other modules. References using PREL32 relocations have a +/-2G - * range, and so we need to ensure that the entire kernel image and all modules - * fall within a 2G window such that these are always within range. - * - * Modules may directly branch to functions and code within the kernel text, - * and to functions and code within other modules. These branches will use - * CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure - * that the entire kernel text and all module text falls within a 128M window - * such that these are always within range. With PLTs, we can expand this to a - * 2G window. - * - * We chose the 128M region to surround the entire kernel image (rather than - * just the text) as using the same bounds for the 128M and 2G regions ensures - * by construction that we never select a 128M region that is not a subset of - * the 2G region. For very large and unusual kernel configurations this means - * we may fall back to PLTs where they could have been avoided, but this keeps - * the logic significantly simpler. - */ -static int __init module_init_limits(void) -{ - u64 kernel_end = (u64)_end; - u64 kernel_start = (u64)_text; - u64 kernel_size = kernel_end - kernel_start; - - /* - * The default modules region is placed immediately below the kernel - * image, and is large enough to use the full 2G relocation range. - */ - BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END); - BUILD_BUG_ON(MODULES_VSIZE < SZ_2G); - - if (!kaslr_enabled()) { - if (kernel_size < SZ_128M) - module_direct_base = kernel_end - SZ_128M; - if (kernel_size < SZ_2G) - module_plt_base = kernel_end - SZ_2G; - } else { - u64 min = kernel_start; - u64 max = kernel_end; - - if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { - pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n"); - } else { - module_direct_base = random_bounding_box(SZ_128M, min, max); - if (module_direct_base) { - min = module_direct_base; - max = module_direct_base + SZ_128M; - } - } - - module_plt_base = random_bounding_box(SZ_2G, min, max); - } - - pr_info("%llu pages in range for non-PLT usage", - module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0); - pr_info("%llu pages in range for PLT usage", - module_plt_base ? (SZ_2G - kernel_size) / PAGE_SIZE : 0); - - return 0; -} -subsys_initcall(module_init_limits); - -void *module_alloc(unsigned long size) -{ - void *p = NULL; - - /* - * Where possible, prefer to allocate within direct branch range of the - * kernel such that no PLTs are necessary. - */ - if (module_direct_base) { - p = __vmalloc_node_range(size, MODULE_ALIGN, - module_direct_base, - module_direct_base + SZ_128M, - GFP_KERNEL | __GFP_NOWARN, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - } - - if (!p && module_plt_base) { - p = __vmalloc_node_range(size, MODULE_ALIGN, - module_plt_base, - module_plt_base + SZ_2G, - GFP_KERNEL | __GFP_NOWARN, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - } - - if (!p) { - pr_warn_ratelimited("%s: unable to allocate memory\n", - __func__); - } - - if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { - vfree(p); - return NULL; - } - - /* Memory is intended to be executable, reset the pointer tag. */ - return kasan_reset_tag(p); -} - enum aarch64_reloc_op { RELOC_OP_NONE, RELOC_OP_ABS, @@ -588,14 +462,20 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s; + int ret; + s = find_section(hdr, sechdrs, ".altinstructions"); if (s) apply_alternatives_module((void *)s->sh_addr, s->sh_size); if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); - if (s) - scs_patch((void *)s->sh_addr, s->sh_size); + if (s) { + ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size); + if (ret) + pr_err("module %s: error occurred during dynamic SCS patching (%d)\n", + me->name, ret); + } } return module_init_ftrace_plt(hdr, sechdrs, me); diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index a41ef3213e1e..2fbfd27ff5f2 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -38,7 +38,24 @@ EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); void mte_sync_tags(pte_t pte, unsigned int nr_pages) { struct page *page = pte_page(pte); - unsigned int i; + struct folio *folio = page_folio(page); + unsigned long i; + + if (folio_test_hugetlb(folio)) { + unsigned long nr = folio_nr_pages(folio); + + /* Hugetlb MTE flags are set for head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); + + return; + } /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { @@ -67,7 +84,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_pte_at() may zero or change the tags of the + * pages is tagged, __set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) @@ -410,6 +427,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, void *maddr; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); + struct folio *folio; if (IS_ERR(page)) { err = PTR_ERR(page); @@ -428,7 +446,12 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, put_page(page); break; } - WARN_ON_ONCE(!page_mte_tagged(page)); + + folio = page_folio(page); + if (folio_test_hugetlb(folio)) + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio)); + else + WARN_ON_ONCE(!page_mte_tagged(page)); /* limit access to the end of the page */ offset = offset_in_page(addr); @@ -582,12 +605,9 @@ subsys_initcall(register_mte_tcf_preferred_sysctl); size_t mte_probe_user_range(const char __user *uaddr, size_t size) { const char __user *end = uaddr + size; - int err = 0; char val; - __raw_get_user(val, uaddr, err); - if (err) - return size; + __raw_get_user(val, uaddr, efault); uaddr = PTR_ALIGN(uaddr, MTE_GRANULE_SIZE); while (uaddr < end) { @@ -595,12 +615,13 @@ size_t mte_probe_user_range(const char __user *uaddr, size_t size) * A read is sufficient for mte, the caller should have probed * for the pte write permission if required. */ - __raw_get_user(val, uaddr, err); - if (err) - return end - uaddr; + __raw_get_user(val, uaddr, efault); uaddr += MTE_GRANULE_SIZE; } (void)val; return 0; + +efault: + return end - uaddr; } diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index b4835f6d594b..1041bc67a3ee 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -10,7 +10,7 @@ #include <asm/fixmap.h> #include <asm/insn.h> #include <asm/kprobes.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/sections.h> static DEFINE_RAW_SPINLOCK(patch_lock); @@ -30,20 +30,17 @@ static bool is_image_text(unsigned long addr) static void __kprobes *patch_map(void *addr, int fixmap) { - unsigned long uintaddr = (uintptr_t) addr; - bool image = is_image_text(uintaddr); - struct page *page; + phys_addr_t phys; - if (image) - page = phys_to_page(__pa_symbol(addr)); - else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - page = vmalloc_to_page(addr); - else - return addr; + if (is_image_text((unsigned long)addr)) { + phys = __pa_symbol(addr); + } else { + struct page *page = vmalloc_to_page(addr); + BUG_ON(!page); + phys = page_to_phys(page) + offset_in_page(addr); + } - BUG_ON(!page); - return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + - (uintaddr & ~PAGE_MASK)); + return (void *)set_fixmap_offset(fixmap, phys); } static void __kprobes patch_unmap(int fixmap) @@ -105,6 +102,81 @@ noinstr int aarch64_insn_write_literal_u64(void *addr, u64 val) return ret; } +typedef void text_poke_f(void *dst, void *src, size_t patched, size_t len); + +static void *__text_poke(text_poke_f func, void *addr, void *src, size_t len) +{ + unsigned long flags; + size_t patched = 0; + size_t size; + void *waddr; + void *ptr; + + raw_spin_lock_irqsave(&patch_lock, flags); + + while (patched < len) { + ptr = addr + patched; + size = min_t(size_t, PAGE_SIZE - offset_in_page(ptr), + len - patched); + + waddr = patch_map(ptr, FIX_TEXT_POKE0); + func(waddr, src, patched, size); + patch_unmap(FIX_TEXT_POKE0); + + patched += size; + } + raw_spin_unlock_irqrestore(&patch_lock, flags); + + flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); + + return addr; +} + +static void text_poke_memcpy(void *dst, void *src, size_t patched, size_t len) +{ + copy_to_kernel_nofault(dst, src + patched, len); +} + +static void text_poke_memset(void *dst, void *src, size_t patched, size_t len) +{ + u32 c = *(u32 *)src; + + memset32(dst, c, len / 4); +} + +/** + * aarch64_insn_copy - Copy instructions into (an unused part of) RX memory + * @dst: address to modify + * @src: source of the copy + * @len: length to copy + * + * Useful for JITs to dump new code blocks into unused regions of RX memory. + */ +noinstr void *aarch64_insn_copy(void *dst, void *src, size_t len) +{ + /* A64 instructions must be word aligned */ + if ((uintptr_t)dst & 0x3) + return NULL; + + return __text_poke(text_poke_memcpy, dst, src, len); +} + +/** + * aarch64_insn_set - memset for RX memory regions. + * @dst: address to modify + * @insn: value to set + * @len: length of memory region. + * + * Useful for JITs to fill regions of RX memory with illegal instructions. + */ +noinstr void *aarch64_insn_set(void *dst, u32 insn, size_t len) +{ + if ((uintptr_t)dst & 0x3) + return NULL; + + return __text_poke(text_poke_memset, dst, &insn, len); +} + int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) { u32 *tp = addr; diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index f872c57e9909..fd9a7bed83ce 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -6,28 +6,7 @@ * Copyright (C) 2014 ARM Ltd. */ -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/kernel.h> -#include <linux/mm.h> #include <linux/pci.h> -#include <linux/pci-acpi.h> -#include <linux/pci-ecam.h> -#include <linux/slab.h> - -#ifdef CONFIG_ACPI -/* - * Try to assign the IRQ number when probing a new device - */ -int pcibios_alloc_irq(struct pci_dev *dev) -{ - if (!acpi_disabled) - acpi_pci_irq_enable(dev); - - return 0; -} -#endif /* * raw_pci_read/write - Platform-specific PCI config space access. @@ -61,173 +40,3 @@ int pcibus_to_node(struct pci_bus *bus) EXPORT_SYMBOL(pcibus_to_node); #endif - -#ifdef CONFIG_ACPI - -struct acpi_pci_generic_root_info { - struct acpi_pci_root_info common; - struct pci_config_window *cfg; /* config space mapping */ -}; - -int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) -{ - struct pci_config_window *cfg = bus->sysdata; - struct acpi_device *adev = to_acpi_device(cfg->parent); - struct acpi_pci_root *root = acpi_driver_data(adev); - - return root->segment; -} - -int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) -{ - struct pci_config_window *cfg; - struct acpi_device *adev; - struct device *bus_dev; - - if (acpi_disabled) - return 0; - - cfg = bridge->bus->sysdata; - - /* - * On Hyper-V there is no corresponding ACPI device for a root bridge, - * therefore ->parent is set as NULL by the driver. And set 'adev' as - * NULL in this case because there is no proper ACPI device. - */ - if (!cfg->parent) - adev = NULL; - else - adev = to_acpi_device(cfg->parent); - - bus_dev = &bridge->bus->dev; - - ACPI_COMPANION_SET(&bridge->dev, adev); - set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); - - return 0; -} - -static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) -{ - struct resource_entry *entry, *tmp; - int status; - - status = acpi_pci_probe_root_resources(ci); - resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { - if (!(entry->res->flags & IORESOURCE_WINDOW)) - resource_list_destroy_entry(entry); - } - return status; -} - -/* - * Lookup the bus range for the domain in MCFG, and set up config space - * mapping. - */ -static struct pci_config_window * -pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) -{ - struct device *dev = &root->device->dev; - struct resource *bus_res = &root->secondary; - u16 seg = root->segment; - const struct pci_ecam_ops *ecam_ops; - struct resource cfgres; - struct acpi_device *adev; - struct pci_config_window *cfg; - int ret; - - ret = pci_mcfg_lookup(root, &cfgres, &ecam_ops); - if (ret) { - dev_err(dev, "%04x:%pR ECAM region not found\n", seg, bus_res); - return NULL; - } - - adev = acpi_resource_consumer(&cfgres); - if (adev) - dev_info(dev, "ECAM area %pR reserved by %s\n", &cfgres, - dev_name(&adev->dev)); - else - dev_warn(dev, FW_BUG "ECAM area %pR not reserved in ACPI namespace\n", - &cfgres); - - cfg = pci_ecam_create(dev, &cfgres, bus_res, ecam_ops); - if (IS_ERR(cfg)) { - dev_err(dev, "%04x:%pR error %ld mapping ECAM\n", seg, bus_res, - PTR_ERR(cfg)); - return NULL; - } - - return cfg; -} - -/* release_info: free resources allocated by init_info */ -static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci) -{ - struct acpi_pci_generic_root_info *ri; - - ri = container_of(ci, struct acpi_pci_generic_root_info, common); - pci_ecam_free(ri->cfg); - kfree(ci->ops); - kfree(ri); -} - -/* Interface called from ACPI code to setup PCI host controller */ -struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) -{ - struct acpi_pci_generic_root_info *ri; - struct pci_bus *bus, *child; - struct acpi_pci_root_ops *root_ops; - struct pci_host_bridge *host; - - ri = kzalloc(sizeof(*ri), GFP_KERNEL); - if (!ri) - return NULL; - - root_ops = kzalloc(sizeof(*root_ops), GFP_KERNEL); - if (!root_ops) { - kfree(ri); - return NULL; - } - - ri->cfg = pci_acpi_setup_ecam_mapping(root); - if (!ri->cfg) { - kfree(ri); - kfree(root_ops); - return NULL; - } - - root_ops->release_info = pci_acpi_generic_release_info; - root_ops->prepare_resources = pci_acpi_root_prepare_resources; - root_ops->pci_ops = (struct pci_ops *)&ri->cfg->ops->pci_ops; - bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); - if (!bus) - return NULL; - - /* If we must preserve the resource configuration, claim now */ - host = pci_find_host_bridge(bus); - if (host->preserve_config) - pci_bus_claim_resources(bus); - - /* - * Assign whatever was left unassigned. If we didn't claim above, - * this will reassign everything. - */ - pci_assign_unassigned_root_bus_resources(bus); - - list_for_each_entry(child, &bus->children, node) - pcie_bus_configure_settings(child); - - return bus; -} - -void pcibios_add_bus(struct pci_bus *bus) -{ - acpi_pci_add_bus(bus); -} - -void pcibios_remove_bus(struct pci_bus *bus) -{ - acpi_pci_remove_bus(bus); -} - -#endif diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 6d157f32187b..9b7f26b128b5 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -10,94 +10,12 @@ #include <asm/pointer_auth.h> -struct frame_tail { - struct frame_tail __user *fp; - unsigned long lr; -} __attribute__((packed)); - -/* - * Get the return address for a single stackframe and return a pointer to the - * next frame tail. - */ -static struct frame_tail __user * -user_backtrace(struct frame_tail __user *tail, - struct perf_callchain_entry_ctx *entry) -{ - struct frame_tail buftail; - unsigned long err; - unsigned long lr; - - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(tail, sizeof(buftail))) - return NULL; - - pagefault_disable(); - err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); - pagefault_enable(); - - if (err) - return NULL; - - lr = ptrauth_strip_user_insn_pac(buftail.lr); - - perf_callchain_store(entry, lr); - - /* - * Frame pointers should strictly progress back up the stack - * (towards higher addresses). - */ - if (tail >= buftail.fp) - return NULL; - - return buftail.fp; -} - -#ifdef CONFIG_COMPAT -/* - * The registers we're interested in are at the end of the variable - * length saved register structure. The fp points at the end of this - * structure so the address of this struct is: - * (struct compat_frame_tail *)(xxx->fp)-1 - * - * This code has been adapted from the ARM OProfile support. - */ -struct compat_frame_tail { - compat_uptr_t fp; /* a (struct compat_frame_tail *) in compat mode */ - u32 sp; - u32 lr; -} __attribute__((packed)); - -static struct compat_frame_tail __user * -compat_user_backtrace(struct compat_frame_tail __user *tail, - struct perf_callchain_entry_ctx *entry) +static bool callchain_trace(void *data, unsigned long pc) { - struct compat_frame_tail buftail; - unsigned long err; - - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(tail, sizeof(buftail))) - return NULL; - - pagefault_disable(); - err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); - pagefault_enable(); - - if (err) - return NULL; - - perf_callchain_store(entry, buftail.lr); - - /* - * Frame pointers should strictly progress back up the stack - * (towards higher addresses). - */ - if (tail + 1 >= (struct compat_frame_tail __user *) - compat_ptr(buftail.fp)) - return NULL; + struct perf_callchain_entry_ctx *entry = data; - return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; + return perf_callchain_store(entry, pc) == 0; } -#endif /* CONFIG_COMPAT */ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) @@ -107,35 +25,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, return; } - perf_callchain_store(entry, regs->pc); - - if (!compat_user_mode(regs)) { - /* AARCH64 mode */ - struct frame_tail __user *tail; - - tail = (struct frame_tail __user *)regs->regs[29]; - - while (entry->nr < entry->max_stack && - tail && !((unsigned long)tail & 0x7)) - tail = user_backtrace(tail, entry); - } else { -#ifdef CONFIG_COMPAT - /* AARCH32 compat mode */ - struct compat_frame_tail __user *tail; - - tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - - while ((entry->nr < entry->max_stack) && - tail && !((unsigned long)tail & 0x3)) - tail = compat_user_backtrace(tail, entry); -#endif - } -} - -static bool callchain_trace(void *data, unsigned long pc) -{ - struct perf_callchain_entry_ctx *entry = data; - return perf_callchain_store(entry, pc) == 0; + arch_stack_walk_user(callchain_trace, entry, regs); } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, @@ -148,31 +38,3 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, arch_stack_walk(callchain_trace, entry, current, regs); } - -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - if (perf_guest_state()) - return perf_guest_get_ip(); - - return instruction_pointer(regs); -} - -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - unsigned int guest_state = perf_guest_state(); - int misc = 0; - - if (guest_state) { - if (guest_state & PERF_GUEST_USER) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } - - return misc; -} diff --git a/arch/arm64/kernel/pi/.gitignore b/arch/arm64/kernel/pi/.gitignore new file mode 100644 index 000000000000..efb29b663e85 --- /dev/null +++ b/arch/arm64/kernel/pi/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +relacheck diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index c844a0546d7f..4d11a8c29181 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -11,25 +11,34 @@ KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ -fno-asynchronous-unwind-tables -fno-unwind-tables \ $(call cc-option,-fno-addrsig) +# this code may run with the MMU off so disable unaligned accesses +CFLAGS_map_range.o += -mstrict-align + # remove SCS flags from all objects in this directory KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) # disable LTO KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS)) -GCOV_PROFILE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n -UBSAN_SANITIZE := n -KCOV_INSTRUMENT := n +hostprogs := relacheck + +quiet_cmd_piobjcopy = $(quiet_cmd_objcopy) + cmd_piobjcopy = $(cmd_objcopy) && $(obj)/relacheck $(@) $(<) $(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \ - --remove-section=.note.gnu.property \ - --prefix-alloc-sections=.init -$(obj)/%.pi.o: $(obj)/%.o FORCE - $(call if_changed,objcopy) + --remove-section=.note.gnu.property +$(obj)/%.pi.o: $(obj)/%.o $(obj)/relacheck FORCE + $(call if_changed,piobjcopy) + +# ensure that all the lib- code ends up as __init code and data +$(obj)/lib-%.pi.o: OBJCOPYFLAGS += --prefix-alloc-sections=.init $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE $(call if_changed_rule,cc_o_c) -obj-y := kaslr_early.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o -extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) +obj-y := idreg-override.pi.o \ + map_kernel.pi.o map_range.pi.o \ + lib-fdt.pi.o lib-fdt_ro.pi.o +obj-$(CONFIG_RELOCATABLE) += relocate.pi.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o +obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.pi.o +extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index e30fd9e32ef3..c6b185b885f7 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -14,6 +14,8 @@ #include <asm/cpufeature.h> #include <asm/setup.h> +#include "pi.h" + #define FTR_DESC_NAME_LEN 20 #define FTR_DESC_FIELD_LEN 10 #define FTR_ALIAS_NAME_LEN 30 @@ -21,15 +23,6 @@ static u64 __boot_status __initdata; -// temporary __prel64 related definitions -// to be removed when this code is moved under pi/ - -#define __prel64_initconst __initconst - -#define PREL64(type, name) union { type *name; } - -#define prel64_pointer(__d) (__d) - typedef bool filter_t(u64 val); struct ftr_set_desc { @@ -45,6 +38,15 @@ struct ftr_set_desc { #define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } +static const struct ftr_set_desc mmfr0 __prel64_initconst = { + .name = "id_aa64mmfr0", + .override = &id_aa64mmfr0_override, + .fields = { + FIELD("ecv", ID_AA64MMFR0_EL1_ECV_SHIFT, NULL), + {} + }, +}; + static bool __init mmfr1_vh_filter(u64 val) { /* @@ -66,6 +68,44 @@ static const struct ftr_set_desc mmfr1 __prel64_initconst = { }, }; + +static bool __init mmfr2_varange_filter(u64 val) +{ + int __maybe_unused feat; + + if (val) + return false; + +#ifdef CONFIG_ARM64_LPA2 + feat = cpuid_feature_extract_signed_field(read_sysreg(id_aa64mmfr0_el1), + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + if (feat >= ID_AA64MMFR0_EL1_TGRAN_LPA2) { + id_aa64mmfr0_override.val |= + (ID_AA64MMFR0_EL1_TGRAN_LPA2 - 1) << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + id_aa64mmfr0_override.mask |= 0xfU << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + + /* + * Override PARange to 48 bits - the override will just be + * ignored if the actual PARange is smaller, but this is + * unlikely to be the case for LPA2 capable silicon. + */ + id_aa64mmfr0_override.val |= + ID_AA64MMFR0_EL1_PARANGE_48 << ID_AA64MMFR0_EL1_PARANGE_SHIFT; + id_aa64mmfr0_override.mask |= 0xfU << ID_AA64MMFR0_EL1_PARANGE_SHIFT; + } +#endif + return true; +} + +static const struct ftr_set_desc mmfr2 __prel64_initconst = { + .name = "id_aa64mmfr2", + .override = &id_aa64mmfr2_override, + .fields = { + FIELD("varange", ID_AA64MMFR2_EL1_VARange_SHIFT, mmfr2_varange_filter), + {} + }, +}; + static bool __init pfr0_sve_filter(u64 val) { /* @@ -86,6 +126,7 @@ static const struct ftr_set_desc pfr0 __prel64_initconst = { .override = &id_aa64pfr0_override, .fields = { FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), + FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL), {} }, }; @@ -110,6 +151,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = { .override = &id_aa64pfr1_override, .fields = { FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), + FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), {} @@ -166,13 +208,16 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { .fields = { FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), + FIELD("rodataoff", ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF, NULL), {} }, }; static const PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { + { &mmfr0 }, { &mmfr1 }, + { &mmfr2 }, { &pfr0 }, { &pfr1 }, { &isar1 }, @@ -185,11 +230,12 @@ static const struct { char alias[FTR_ALIAS_NAME_LEN]; char feature[FTR_ALIAS_OPTION_LEN]; } aliases[] __initconst = { - { "kvm_arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, - { "kvm_arm.mode=protected", "id_aa64mmfr1.vh=0" }, + { "kvm_arm.mode=nvhe", "arm64_sw.hvhe=0 id_aa64mmfr1.vh=0" }, + { "kvm_arm.mode=protected", "arm64_sw.hvhe=1" }, { "arm64.nosve", "id_aa64pfr0.sve=0" }, { "arm64.nosme", "id_aa64pfr1.sme=0" }, { "arm64.nobti", "id_aa64pfr1.bt=0" }, + { "arm64.nogcs", "id_aa64pfr1.gcs=0" }, { "arm64.nopauth", "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " "id_aa64isar1.api=0 id_aa64isar1.apa=0 " @@ -197,6 +243,9 @@ static const struct { { "arm64.nomops", "id_aa64isar2.mops=0" }, { "arm64.nomte", "id_aa64pfr1.mte=0" }, { "nokaslr", "arm64_sw.nokaslr=1" }, + { "rodata=off", "arm64_sw.rodataoff=1" }, + { "arm64.nolva", "id_aa64mmfr2.varange=0" }, + { "arm64.no32bit_el0", "id_aa64pfr0.el0=1" }, }; static int __init parse_hexdigit(const char *p, u64 *v) @@ -313,42 +362,35 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) } while (1); } -static __init const u8 *get_bootargs_cmdline(void) +static __init const u8 *get_bootargs_cmdline(const void *fdt, int node) { + static char const bootargs[] __initconst = "bootargs"; const u8 *prop; - void *fdt; - int node; - fdt = get_early_fdt_ptr(); - if (!fdt) - return NULL; - - node = fdt_path_offset(fdt, "/chosen"); if (node < 0) return NULL; - prop = fdt_getprop(fdt, node, "bootargs", NULL); + prop = fdt_getprop(fdt, node, bootargs, NULL); if (!prop) return NULL; return strlen(prop) ? prop : NULL; } -static __init void parse_cmdline(void) +static __init void parse_cmdline(const void *fdt, int chosen) { - const u8 *prop = get_bootargs_cmdline(); + static char const cmdline[] __initconst = CONFIG_CMDLINE; + const u8 *prop = get_bootargs_cmdline(fdt, chosen); if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) - __parse_cmdline(CONFIG_CMDLINE, true); + __parse_cmdline(cmdline, true); if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) __parse_cmdline(prop, true); } -/* Keep checkers quiet */ -void init_feature_override(u64 boot_status); - -asmlinkage void __init init_feature_override(u64 boot_status) +void __init init_feature_override(u64 boot_status, const void *fdt, + int chosen) { struct arm64_ftr_override *override; const struct ftr_set_desc *reg; @@ -364,7 +406,7 @@ asmlinkage void __init init_feature_override(u64 boot_status) __boot_status = boot_status; - parse_cmdline(); + parse_cmdline(fdt, chosen); for (i = 0; i < ARRAY_SIZE(regs); i++) { reg = prel64_pointer(regs[i].reg); @@ -373,3 +415,10 @@ asmlinkage void __init init_feature_override(u64 boot_status) (unsigned long)(override + 1)); } } + +char * __init skip_spaces(const char *str) +{ + while (isspace(*str)) + ++str; + return (char *)str; +} diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index 17bff6e399e4..0257b43819db 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -14,69 +14,23 @@ #include <asm/archrandom.h> #include <asm/memory.h> +#include <asm/pgtable.h> -/* taken from lib/string.c */ -static char *__strstr(const char *s1, const char *s2) -{ - size_t l1, l2; - - l2 = strlen(s2); - if (!l2) - return (char *)s1; - l1 = strlen(s1); - while (l1 >= l2) { - l1--; - if (!memcmp(s1, s2, l2)) - return (char *)s1; - s1++; - } - return NULL; -} -static bool cmdline_contains_nokaslr(const u8 *cmdline) -{ - const u8 *str; - - str = __strstr(cmdline, "nokaslr"); - return str == cmdline || (str > cmdline && *(str - 1) == ' '); -} - -static bool is_kaslr_disabled_cmdline(void *fdt) -{ - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - int node; - const u8 *prop; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; - - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; - - if (cmdline_contains_nokaslr(prop)) - return true; +#include "pi.h" - if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) - goto out; +extern u16 memstart_offset_seed; - return false; - } -out: - return cmdline_contains_nokaslr(CONFIG_CMDLINE); -} - -static u64 get_kaslr_seed(void *fdt) +static u64 __init get_kaslr_seed(void *fdt, int node) { - int node, len; + static char const seed_str[] __initconst = "kaslr-seed"; fdt64_t *prop; u64 ret; + int len; - node = fdt_path_offset(fdt, "/chosen"); if (node < 0) return 0; - prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + prop = fdt_getprop_w(fdt, node, seed_str, &len); if (!prop || len != sizeof(u64)) return 0; @@ -85,26 +39,28 @@ static u64 get_kaslr_seed(void *fdt) return ret; } -asmlinkage u64 kaslr_early_init(void *fdt) +u64 __init kaslr_early_init(void *fdt, int chosen) { - u64 seed; + u64 seed, range; - if (is_kaslr_disabled_cmdline(fdt)) + if (kaslr_disabled_cmdline()) return 0; - seed = get_kaslr_seed(fdt); + seed = get_kaslr_seed(fdt, chosen); if (!seed) { if (!__early_cpu_has_rndr() || !__arm64_rndr((unsigned long *)&seed)) return 0; } + memstart_offset_seed = seed & U16_MAX; + /* * OK, so we are proceeding with KASLR enabled. Calculate a suitable * kernel image offset from the seed. Let's place the kernel in the - * middle half of the VMALLOC area (VA_BITS_MIN - 2), and stay clear of - * the lower and upper quarters to avoid colliding with other - * allocations. + * 'middle' half of the VMALLOC area, and stay clear of the lower and + * upper quarters to avoid colliding with other allocations. */ - return BIT(VA_BITS_MIN - 3) + (seed & GENMASK(VA_BITS_MIN - 3, 0)); + range = (VMALLOC_END - KIMAGE_VADDR) / 2; + return range / 2 + (((__uint128_t)range * seed) >> 64); } diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c new file mode 100644 index 000000000000..e57b043f324b --- /dev/null +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/init.h> +#include <linux/libfdt.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/sizes.h> +#include <linux/string.h> + +#include <asm/memory.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> + +#include "pi.h" + +extern const u8 __eh_frame_start[], __eh_frame_end[]; + +extern void idmap_cpu_replace_ttbr1(void *pgdir); + +static void __init map_segment(pgd_t *pg_dir, u64 *pgd, u64 va_offset, + void *start, void *end, pgprot_t prot, + bool may_use_cont, int root_level) +{ + map_range(pgd, ((u64)start + va_offset) & ~PAGE_OFFSET, + ((u64)end + va_offset) & ~PAGE_OFFSET, (u64)start, + prot, root_level, (pte_t *)pg_dir, may_use_cont, 0); +} + +static void __init unmap_segment(pgd_t *pg_dir, u64 va_offset, void *start, + void *end, int root_level) +{ + map_segment(pg_dir, NULL, va_offset, start, end, __pgprot(0), + false, root_level); +} + +static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) +{ + bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS); + bool twopass = IS_ENABLED(CONFIG_RELOCATABLE); + u64 pgdp = (u64)init_pg_dir + PAGE_SIZE; + pgprot_t text_prot = PAGE_KERNEL_ROX; + pgprot_t data_prot = PAGE_KERNEL; + pgprot_t prot; + + /* + * External debuggers may need to write directly to the text mapping to + * install SW breakpoints. Allow this (only) when explicitly requested + * with rodata=off. + */ + if (arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF)) + text_prot = PAGE_KERNEL_EXEC; + + /* + * We only enable the shadow call stack dynamically if we are running + * on a system that does not implement PAC or BTI. PAC and SCS provide + * roughly the same level of protection, and BTI relies on the PACIASP + * instructions serving as landing pads, preventing us from patching + * those instructions into something else. + */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && cpu_has_pac()) + enable_scs = false; + + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && cpu_has_bti()) { + enable_scs = false; + + /* + * If we have a CPU that supports BTI and a kernel built for + * BTI then mark the kernel executable text as guarded pages + * now so we don't have to rewrite the page tables later. + */ + text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP); + } + + /* Map all code read-write on the first pass if needed */ + twopass |= enable_scs; + prot = twopass ? data_prot : text_prot; + + map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, + !twopass, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, + __inittext_begin, data_prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __inittext_begin, + __inittext_end, prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __initdata_begin, + __initdata_end, data_prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, _data, _end, data_prot, + true, root_level); + dsb(ishst); + + idmap_cpu_replace_ttbr1(init_pg_dir); + + if (twopass) { + if (IS_ENABLED(CONFIG_RELOCATABLE)) + relocate_kernel(kaslr_offset); + + if (enable_scs) { + scs_patch(__eh_frame_start + va_offset, + __eh_frame_end - __eh_frame_start); + asm("ic ialluis"); + + dynamic_scs_is_enabled = true; + } + + /* + * Unmap the text region before remapping it, to avoid + * potential TLB conflicts when creating the contiguous + * descriptors. + */ + unmap_segment(init_pg_dir, va_offset, _stext, _etext, + root_level); + dsb(ishst); + isb(); + __tlbi(vmalle1); + isb(); + + /* + * Remap these segments with different permissions + * No new page table allocations should be needed + */ + map_segment(init_pg_dir, NULL, va_offset, _stext, _etext, + text_prot, true, root_level); + map_segment(init_pg_dir, NULL, va_offset, __inittext_begin, + __inittext_end, text_prot, false, root_level); + } + + /* Copy the root page table to its final location */ + memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE); + dsb(ishst); + idmap_cpu_replace_ttbr1(swapper_pg_dir); +} + +static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) +{ + u64 sctlr = read_sysreg(sctlr_el1); + u64 tcr = read_sysreg(tcr_el1) | TCR_DS; + u64 mmfr0 = read_sysreg(id_aa64mmfr0_el1); + u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + + tcr &= ~TCR_IPS_MASK; + tcr |= parange << TCR_IPS_SHIFT; + + asm(" msr sctlr_el1, %0 ;" + " isb ;" + " msr ttbr0_el1, %1 ;" + " msr tcr_el1, %2 ;" + " isb ;" + " tlbi vmalle1 ;" + " dsb nsh ;" + " isb ;" + " msr sctlr_el1, %3 ;" + " isb ;" + :: "r"(sctlr & ~SCTLR_ELx_M), "r"(ttbr), "r"(tcr), "r"(sctlr)); +} + +static void __init remap_idmap_for_lpa2(void) +{ + /* clear the bits that change meaning once LPA2 is turned on */ + pteval_t mask = PTE_SHARED; + + /* + * We have to clear bits [9:8] in all block or page descriptors in the + * initial ID map, as otherwise they will be (mis)interpreted as + * physical address bits once we flick the LPA2 switch (TCR.DS). Since + * we cannot manipulate live descriptors in that way without creating + * potential TLB conflicts, let's create another temporary ID map in a + * LPA2 compatible fashion, and update the initial ID map while running + * from that. + */ + create_init_idmap(init_pg_dir, mask); + dsb(ishst); + set_ttbr0_for_lpa2((u64)init_pg_dir); + + /* + * Recreate the initial ID map with the same granularity as before. + * Don't bother with the FDT, we no longer need it after this. + */ + memset(init_idmap_pg_dir, 0, + (u64)init_idmap_pg_end - (u64)init_idmap_pg_dir); + + create_init_idmap(init_idmap_pg_dir, mask); + dsb(ishst); + + /* switch back to the updated initial ID map */ + set_ttbr0_for_lpa2((u64)init_idmap_pg_dir); + + /* wipe the temporary ID map from memory */ + memset(init_pg_dir, 0, (u64)init_pg_end - (u64)init_pg_dir); +} + +static void __init map_fdt(u64 fdt) +{ + static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); + u64 efdt = fdt + MAX_FDT_SIZE; + u64 ptep = (u64)ptes; + + /* + * Map up to MAX_FDT_SIZE bytes, but avoid overlap with + * the kernel image. + */ + map_range(&ptep, fdt, (u64)_text > fdt ? min((u64)_text, efdt) : efdt, + fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL, + (pte_t *)init_idmap_pg_dir, false, 0); + dsb(ishst); +} + +asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) +{ + static char const chosen_str[] __initconst = "/chosen"; + u64 va_base, pa_base = (u64)&_text; + u64 kaslr_offset = pa_base % MIN_KIMG_ALIGN; + int root_level = 4 - CONFIG_PGTABLE_LEVELS; + int va_bits = VA_BITS; + int chosen; + + map_fdt((u64)fdt); + + /* Clear BSS and the initial page tables */ + memset(__bss_start, 0, (u64)init_pg_end - (u64)__bss_start); + + /* Parse the command line for CPU feature overrides */ + chosen = fdt_path_offset(fdt, chosen_str); + init_feature_override(boot_status, fdt, chosen); + + if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) { + va_bits = VA_BITS_MIN; + } else if (IS_ENABLED(CONFIG_ARM64_LPA2) && !cpu_has_lpa2()) { + va_bits = VA_BITS_MIN; + root_level++; + } + + if (va_bits > VA_BITS_MIN) + sysreg_clear_set(tcr_el1, TCR_T1SZ_MASK, TCR_T1SZ(va_bits)); + + /* + * The virtual KASLR displacement modulo 2MiB is decided by the + * physical placement of the image, as otherwise, we might not be able + * to create the early kernel mapping using 2 MiB block descriptors. So + * take the low bits of the KASLR offset from the physical address, and + * fill in the high bits from the seed. + */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + u64 kaslr_seed = kaslr_early_init(fdt, chosen); + + if (kaslr_seed && kaslr_requires_kpti()) + arm64_use_ng_mappings = true; + + kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1); + } + + if (IS_ENABLED(CONFIG_ARM64_LPA2) && va_bits > VA_BITS_MIN) + remap_idmap_for_lpa2(); + + va_base = KIMAGE_VADDR + kaslr_offset; + map_kernel(kaslr_offset, va_base - pa_base, root_level); +} diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c new file mode 100644 index 000000000000..2b69e3beeef8 --- /dev/null +++ b/arch/arm64/kernel/pi/map_range.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/types.h> +#include <linux/sizes.h> + +#include <asm/memory.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> + +#include "pi.h" + +/** + * map_range - Map a contiguous range of physical pages into virtual memory + * + * @pte: Address of physical pointer to array of pages to + * allocate page tables from + * @start: Virtual address of the start of the range + * @end: Virtual address of the end of the range (exclusive) + * @pa: Physical address of the start of the range + * @prot: Access permissions of the range + * @level: Translation level for the mapping + * @tbl: The level @level page table to create the mappings in + * @may_use_cont: Whether the use of the contiguous attribute is allowed + * @va_offset: Offset between a physical page and its current mapping + * in the VA space + */ +void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, + int level, pte_t *tbl, bool may_use_cont, u64 va_offset) +{ + u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; + pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + int lshift = (3 - level) * (PAGE_SHIFT - 3); + u64 lmask = (PAGE_SIZE << lshift) - 1; + + start &= PAGE_MASK; + pa &= PAGE_MASK; + + /* Advance tbl to the entry that covers start */ + tbl += (start >> (lshift + PAGE_SHIFT)) % PTRS_PER_PTE; + + /* + * Set the right block/page bits for this level unless we are + * clearing the mapping + */ + if (protval) + protval |= (level < 3) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; + + while (start < end) { + u64 next = min((start | lmask) + 1, PAGE_ALIGN(end)); + + if (level < 3 && (start | next | pa) & lmask) { + /* + * This chunk needs a finer grained mapping. Create a + * table mapping if necessary and recurse. + */ + if (pte_none(*tbl)) { + *tbl = __pte(__phys_to_pte_val(*pte) | + PMD_TYPE_TABLE | PMD_TABLE_UXN); + *pte += PTRS_PER_PTE * sizeof(pte_t); + } + map_range(pte, start, next, pa, prot, level + 1, + (pte_t *)(__pte_to_phys(*tbl) + va_offset), + may_use_cont, va_offset); + } else { + /* + * Start a contiguous range if start and pa are + * suitably aligned + */ + if (((start | pa) & cmask) == 0 && may_use_cont) + protval |= PTE_CONT; + + /* + * Clear the contiguous attribute if the remaining + * range does not cover a contiguous block + */ + if ((end & ~cmask) <= start) + protval &= ~PTE_CONT; + + /* Put down a block or page mapping */ + *tbl = __pte(__phys_to_pte_val(pa) | protval); + } + pa += next - start; + start = next; + tbl++; + } +} + +asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, pteval_t clrmask) +{ + u64 ptep = (u64)pg_dir + PAGE_SIZE; + pgprot_t text_prot = PAGE_KERNEL_ROX; + pgprot_t data_prot = PAGE_KERNEL; + + pgprot_val(text_prot) &= ~clrmask; + pgprot_val(data_prot) &= ~clrmask; + + map_range(&ptep, (u64)_stext, (u64)__initdata_begin, (u64)_stext, + text_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + map_range(&ptep, (u64)__initdata_begin, (u64)_end, (u64)__initdata_begin, + data_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + + return ptep; +} diff --git a/arch/arm64/kernel/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index a1fe4b4ff591..55d0cd64ef71 100644 --- a/arch/arm64/kernel/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -4,16 +4,17 @@ * Author: Ard Biesheuvel <ardb@google.com> */ -#include <linux/bug.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/linkage.h> -#include <linux/printk.h> #include <linux/types.h> -#include <asm/cacheflush.h> #include <asm/scs.h> +#include "pi.h" + +bool dynamic_scs_is_enabled; + // // This minimal DWARF CFI parser is partially based on the code in // arch/arc/kernel/unwind.c, and on the document below: @@ -49,7 +50,9 @@ #define DW_CFA_GNU_negative_offset_extended 0x2f #define DW_CFA_hi_user 0x3f -extern const u8 __eh_frame_start[], __eh_frame_end[]; +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_pcrel 0x10 enum { PACIASP = 0xd503233f, @@ -81,7 +84,11 @@ static void __always_inline scs_patch_loc(u64 loc) */ return; } - dcache_clean_pou(loc, loc + sizeof(u32)); + if (IS_ENABLED(CONFIG_ARM64_WORKAROUND_CLEAN_CACHE)) + asm("dc civac, %0" :: "r"(loc)); + else + asm(ALTERNATIVE("dc cvau, %0", "nop", ARM64_HAS_CACHE_IDC) + :: "r"(loc)); } /* @@ -117,7 +124,12 @@ struct eh_frame { union { struct { // CIE u8 version; - u8 augmentation_string[]; + u8 augmentation_string[3]; + u8 code_alignment_factor; + u8 data_alignment_factor; + u8 return_address_register; + u8 augmentation_data_size; + u8 fde_pointer_format; }; struct { // FDE @@ -125,29 +137,38 @@ struct eh_frame { s32 range; u8 opcodes[]; }; + + struct { // FDE + s64 initial_loc64; + s64 range64; + u8 opcodes64[]; + }; }; }; -static int noinstr scs_handle_fde_frame(const struct eh_frame *frame, - bool fde_has_augmentation_data, - int code_alignment_factor, - bool dry_run) +static int scs_handle_fde_frame(const struct eh_frame *frame, + int code_alignment_factor, + bool use_sdata8, + bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; u64 loc = (u64)offset_to_ptr(&frame->initial_loc); const u8 *opcode = frame->opcodes; + int l; - if (fde_has_augmentation_data) { - int l; + if (use_sdata8) { + loc = (u64)&frame->initial_loc64 + frame->initial_loc64; + opcode = frame->opcodes64; + size -= 8; + } - // assume single byte uleb128_t - if (WARN_ON(*opcode & BIT(7))) - return -ENOEXEC; + // assume single byte uleb128_t for augmentation data size + if (*opcode & BIT(7)) + return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE; - l = *opcode++; - opcode += l; - size -= l + 1; - } + l = *opcode++; + opcode += l; + size -= l + 1; /* * Starting from 'loc', apply the CFA opcodes that advance the location @@ -198,21 +219,20 @@ static int noinstr scs_handle_fde_frame(const struct eh_frame *frame, break; default: - pr_err("unhandled opcode: %02x in FDE frame %lx\n", opcode[-1], (uintptr_t)frame); - return -ENOEXEC; + return EDYNSCS_INVALID_CFA_OPCODE; } } return 0; } -int noinstr scs_patch(const u8 eh_frame[], int size) +int scs_patch(const u8 eh_frame[], int size) { + int code_alignment_factor = 1; + bool fde_use_sdata8 = false; const u8 *p = eh_frame; while (size > 4) { const struct eh_frame *frame = (const void *)p; - bool fde_has_augmentation_data = true; - int code_alignment_factor = 1; int ret; if (frame->size == 0 || @@ -221,28 +241,47 @@ int noinstr scs_patch(const u8 eh_frame[], int size) break; if (frame->cie_id_or_pointer == 0) { - const u8 *p = frame->augmentation_string; - - /* a 'z' in the augmentation string must come first */ - fde_has_augmentation_data = *p == 'z'; + /* + * Require presence of augmentation data (z) with a + * specifier for the size of the FDE initial_loc and + * range fields (R), and nothing else. + */ + if (strcmp(frame->augmentation_string, "zR")) + return EDYNSCS_INVALID_CIE_HEADER; /* * The code alignment factor is a uleb128 encoded field * but given that the only sensible values are 1 or 4, - * there is no point in decoding the whole thing. + * there is no point in decoding the whole thing. Also + * sanity check the size of the data alignment factor + * field, and the values of the return address register + * and augmentation data size fields. */ - p += strlen(p) + 1; - if (!WARN_ON(*p & BIT(7))) - code_alignment_factor = *p; + if ((frame->code_alignment_factor & BIT(7)) || + (frame->data_alignment_factor & BIT(7)) || + frame->return_address_register != 30 || + frame->augmentation_data_size != 1) + return EDYNSCS_INVALID_CIE_HEADER; + + code_alignment_factor = frame->code_alignment_factor; + + switch (frame->fde_pointer_format) { + case DW_EH_PE_pcrel | DW_EH_PE_sdata4: + fde_use_sdata8 = false; + break; + case DW_EH_PE_pcrel | DW_EH_PE_sdata8: + fde_use_sdata8 = true; + break; + default: + return EDYNSCS_INVALID_CIE_SDATA_SIZE; + } } else { - ret = scs_handle_fde_frame(frame, - fde_has_augmentation_data, - code_alignment_factor, - true); + ret = scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, true); if (ret) return ret; - scs_handle_fde_frame(frame, fde_has_augmentation_data, - code_alignment_factor, false); + scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, false); } p += sizeof(frame->size) + frame->size; @@ -250,13 +289,3 @@ int noinstr scs_patch(const u8 eh_frame[], int size) } return 0; } - -asmlinkage void __init scs_patch_vmlinux(void) -{ - if (!should_patch_pac_into_scs()) - return; - - WARN_ON(scs_patch(__eh_frame_start, __eh_frame_end - __eh_frame_start)); - icache_inval_all_pou(); - isb(); -} diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h new file mode 100644 index 000000000000..c91e5e965cd3 --- /dev/null +++ b/arch/arm64/kernel/pi/pi.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/types.h> + +#define __prel64_initconst __section(".init.rodata.prel64") + +#define PREL64(type, name) union { type *name; prel64_t name ## _prel; } + +#define prel64_pointer(__d) (typeof(__d))prel64_to_pointer(&__d##_prel) + +typedef volatile signed long prel64_t; + +static inline void *prel64_to_pointer(const prel64_t *offset) +{ + if (!*offset) + return NULL; + return (void *)offset + *offset; +} + +extern bool dynamic_scs_is_enabled; + +extern pgd_t init_idmap_pg_dir[], init_idmap_pg_end[]; + +void init_feature_override(u64 boot_status, const void *fdt, int chosen); +u64 kaslr_early_init(void *fdt, int chosen); +void relocate_kernel(u64 offset); +int scs_patch(const u8 eh_frame[], int size); + +void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, + int level, pte_t *tbl, bool may_use_cont, u64 va_offset); + +asmlinkage void early_map_kernel(u64 boot_status, void *fdt); + +asmlinkage u64 create_init_idmap(pgd_t *pgd, pteval_t clrmask); diff --git a/arch/arm64/kernel/pi/relacheck.c b/arch/arm64/kernel/pi/relacheck.c new file mode 100644 index 000000000000..b0cd4d0d275b --- /dev/null +++ b/arch/arm64/kernel/pi/relacheck.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 - Google LLC + * Author: Ard Biesheuvel <ardb@google.com> + */ + +#include <elf.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define HOST_ORDER ELFDATA2LSB +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define HOST_ORDER ELFDATA2MSB +#endif + +static Elf64_Ehdr *ehdr; +static Elf64_Shdr *shdr; +static const char *strtab; +static bool swap; + +static uint64_t swab_elfxword(uint64_t val) +{ + return swap ? __builtin_bswap64(val) : val; +} + +static uint32_t swab_elfword(uint32_t val) +{ + return swap ? __builtin_bswap32(val) : val; +} + +static uint16_t swab_elfhword(uint16_t val) +{ + return swap ? __builtin_bswap16(val) : val; +} + +int main(int argc, char *argv[]) +{ + struct stat stat; + int fd, ret; + + if (argc < 3) { + fprintf(stderr, "file arguments missing\n"); + exit(EXIT_FAILURE); + } + + fd = open(argv[1], O_RDWR); + if (fd < 0) { + fprintf(stderr, "failed to open %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ret = fstat(fd, &stat); + if (ret < 0) { + fprintf(stderr, "failed to stat() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ehdr = mmap(0, stat.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ehdr == MAP_FAILED) { + fprintf(stderr, "failed to mmap() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + swap = ehdr->e_ident[EI_DATA] != HOST_ORDER; + shdr = (void *)ehdr + swab_elfxword(ehdr->e_shoff); + strtab = (void *)ehdr + + swab_elfxword(shdr[swab_elfhword(ehdr->e_shstrndx)].sh_offset); + + for (int i = 0; i < swab_elfhword(ehdr->e_shnum); i++) { + unsigned long info, flags; + bool prel64 = false; + Elf64_Rela *rela; + int numrela; + + if (swab_elfword(shdr[i].sh_type) != SHT_RELA) + continue; + + /* only consider RELA sections operating on data */ + info = swab_elfword(shdr[i].sh_info); + flags = swab_elfxword(shdr[info].sh_flags); + if ((flags & (SHF_ALLOC | SHF_EXECINSTR)) != SHF_ALLOC) + continue; + + /* + * We generally don't permit ABS64 relocations in the code that + * runs before relocation processing occurs. If statically + * initialized absolute symbol references are unavoidable, they + * may be emitted into a *.rodata.prel64 section and they will + * be converted to place-relative 64-bit references. This + * requires special handling in the referring code. + */ + if (strstr(strtab + swab_elfword(shdr[info].sh_name), + ".rodata.prel64")) { + prel64 = true; + } + + rela = (void *)ehdr + swab_elfxword(shdr[i].sh_offset); + numrela = swab_elfxword(shdr[i].sh_size) / sizeof(*rela); + + for (int j = 0; j < numrela; j++) { + uint64_t info = swab_elfxword(rela[j].r_info); + + if (ELF64_R_TYPE(info) != R_AARCH64_ABS64) + continue; + + if (prel64) { + /* convert ABS64 into PREL64 */ + info ^= R_AARCH64_ABS64 ^ R_AARCH64_PREL64; + rela[j].r_info = swab_elfxword(info); + } else { + fprintf(stderr, + "Unexpected absolute relocations detected in %s\n", + argv[2]); + close(fd); + unlink(argv[1]); + exit(EXIT_FAILURE); + } + } + } + close(fd); + return 0; +} diff --git a/arch/arm64/kernel/pi/relocate.c b/arch/arm64/kernel/pi/relocate.c new file mode 100644 index 000000000000..2407d2696398 --- /dev/null +++ b/arch/arm64/kernel/pi/relocate.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Authors: Ard Biesheuvel <ardb@google.com> +// Peter Collingbourne <pcc@google.com> + +#include <linux/elf.h> +#include <linux/init.h> +#include <linux/types.h> + +#include "pi.h" + +extern const Elf64_Rela rela_start[], rela_end[]; +extern const u64 relr_start[], relr_end[]; + +void __init relocate_kernel(u64 offset) +{ + u64 *place = NULL; + + for (const Elf64_Rela *rela = rela_start; rela < rela_end; rela++) { + if (ELF64_R_TYPE(rela->r_info) != R_AARCH64_RELATIVE) + continue; + *(u64 *)(rela->r_offset + offset) = rela->r_addend + offset; + } + + if (!IS_ENABLED(CONFIG_RELR) || !offset) + return; + + /* + * Apply RELR relocations. + * + * RELR is a compressed format for storing relative relocations. The + * encoded sequence of entries looks like: + * [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBB1 ... ] + * + * i.e. start with an address, followed by any number of bitmaps. The + * address entry encodes 1 relocation. The subsequent bitmap entries + * encode up to 63 relocations each, at subsequent offsets following + * the last address entry. + * + * The bitmap entries must have 1 in the least significant bit. The + * assumption here is that an address cannot have 1 in lsb. Odd + * addresses are not supported. Any odd addresses are stored in the + * RELA section, which is handled above. + * + * With the exception of the least significant bit, each bit in the + * bitmap corresponds with a machine word that follows the base address + * word, and the bit value indicates whether or not a relocation needs + * to be applied to it. The second least significant bit represents the + * machine word immediately following the initial address, and each bit + * that follows represents the next word, in linear order. As such, a + * single bitmap can encode up to 63 relocations in a 64-bit object. + */ + for (const u64 *relr = relr_start; relr < relr_end; relr++) { + if ((*relr & 1) == 0) { + place = (u64 *)(*relr + offset); + *place++ += offset; + } else { + for (u64 *p = place, r = *relr >> 1; r; p++, r >>= 1) + if (r & 1) + *p += offset; + place += 63; + } + } +} diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 968d5fffe233..6438bf62e753 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -58,10 +58,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * Instructions which load PC relative literals are not going to work * when executed from an XOL slot. Instructions doing an exclusive * load/store are not going to complete successfully when single-step - * exception handling happens in the middle of the sequence. + * exception handling happens in the middle of the sequence. Memory + * copy/set instructions require that all three instructions be placed + * consecutively in memory. */ if (aarch64_insn_uses_literal(insn) || - aarch64_insn_is_exclusive(insn)) + aarch64_insn_is_exclusive(insn) || + aarch64_insn_is_mops(insn)) return false; return true; @@ -73,9 +76,18 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot. */ enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) { /* + * While 'nop' instruction can execute in the out-of-line slot, + * simulating them in breakpoint handling offers better performance. + */ + if (aarch64_insn_is_nop(insn)) { + api->handler = simulate_nop; + return INSN_GOOD_NO_SLOT; + } + + /* * Instructions reading or modifying the PC won't work from the XOL * slot. */ @@ -99,10 +111,6 @@ arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) aarch64_insn_is_blr(insn) || aarch64_insn_is_ret(insn)) { api->handler = simulate_br_blr_ret; - } else if (aarch64_insn_is_ldr_lit(insn)) { - api->handler = simulate_ldr_literal; - } else if (aarch64_insn_is_ldrsw_lit(insn)) { - api->handler = simulate_ldrsw_literal; } else { /* * Instruction cannot be stepped out-of-line and we don't @@ -137,9 +145,20 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { enum probe_insn decoded; - probe_opcode_t insn = le32_to_cpu(*addr); - probe_opcode_t *scan_end = NULL; + u32 insn = le32_to_cpu(*addr); + kprobe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; + struct arch_probe_insn *api = &asi->api; + + if (aarch64_insn_is_ldr_lit(insn)) { + api->handler = simulate_ldr_literal; + decoded = INSN_GOOD_NO_SLOT; + } else if (aarch64_insn_is_ldrsw_lit(insn)) { + api->handler = simulate_ldrsw_literal; + decoded = INSN_GOOD_NO_SLOT; + } else { + decoded = arm_probe_decode_insn(insn, &asi->api); + } /* * If there's a symbol defined in front of and near enough to @@ -157,7 +176,6 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) else scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; } - decoded = arm_probe_decode_insn(insn, &asi->api); if (decoded != INSN_REJECTED && scan_end) if (is_probed_address_atomic(addr - 1, scan_end)) diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index 8b758c5a2062..0e4195de8206 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -28,6 +28,6 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); #endif enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 70b91a8c6bb3..d9e462eafb95 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -27,7 +27,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> #include <asm/irq.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/ptrace.h> #include <asm/sections.h> #include <asm/system_misc.h> @@ -43,7 +43,7 @@ post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { - kprobe_opcode_t *addr = p->ainsn.api.insn; + kprobe_opcode_t *addr = p->ainsn.xol_insn; /* * Prepare insn slot, Mark Rutland points out it depends on a coupe of @@ -64,20 +64,20 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p) * the BRK exception handler, so it is unnecessary to generate * Contex-Synchronization-Event via ISB again. */ - aarch64_insn_patch_text_nosync(addr, p->opcode); + aarch64_insn_patch_text_nosync(addr, le32_to_cpu(p->opcode)); aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS); /* * Needs restoring of return address after stepping xol. */ - p->ainsn.api.restore = (unsigned long) p->addr + + p->ainsn.xol_restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol. No need to adjust the PC */ - p->ainsn.api.restore = 0; + p->ainsn.xol_restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) @@ -85,7 +85,7 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); if (p->ainsn.api.handler) - p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); + p->ainsn.api.handler(le32_to_cpu(p->opcode), (long)p->addr, regs); /* single step simulated, now go for post processing */ post_kprobe_handler(p, kcb, regs); @@ -99,7 +99,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; /* copy instruction */ - p->opcode = le32_to_cpu(*p->addr); + p->opcode = *p->addr; if (search_exception_tables(probe_addr)) return -EINVAL; @@ -110,18 +110,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.api.insn = NULL; + p->ainsn.xol_insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.api.insn = get_insn_slot(); - if (!p->ainsn.api.insn) + p->ainsn.xol_insn = get_insn_slot(); + if (!p->ainsn.xol_insn) return -ENOMEM; break; } /* prepare the instruction */ - if (p->ainsn.api.insn) + if (p->ainsn.xol_insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -129,13 +129,6 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } -void *alloc_insn_page(void) -{ - return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - /* arm kprobe: install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { @@ -149,15 +142,16 @@ void __kprobes arch_arm_kprobe(struct kprobe *p) void __kprobes arch_disarm_kprobe(struct kprobe *p) { void *addr = p->addr; + u32 insn = le32_to_cpu(p->opcode); - aarch64_insn_patch_text(&addr, &p->opcode, 1); + aarch64_insn_patch_text(&addr, &insn, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.api.insn) { - free_insn_slot(p->ainsn.api.insn, 0); - p->ainsn.api.insn = NULL; + if (p->ainsn.xol_insn) { + free_insn_slot(p->ainsn.xol_insn, 0); + p->ainsn.xol_insn = NULL; } } @@ -212,9 +206,9 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.api.insn) { + if (p->ainsn.xol_insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.api.insn; + slot = (unsigned long)p->ainsn.xol_insn; kprobes_save_local_irqflag(kcb, regs); instruction_pointer_set(regs, slot); @@ -252,8 +246,8 @@ static void __kprobes post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs) { /* return addr restore if non-branching insn */ - if (cur->ainsn.api.restore != 0) - instruction_pointer_set(regs, cur->ainsn.api.restore); + if (cur->ainsn.xol_restore != 0) + instruction_pointer_set(regs, cur->ainsn.xol_restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -355,7 +349,7 @@ kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) struct kprobe *cur = kprobe_running(); if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) && - ((unsigned long)&cur->ainsn.api.insn[1] == addr)) { + ((unsigned long)&cur->ainsn.xol_insn[1] == addr)) { kprobes_restore_local_irqflag(kcb, regs); post_kprobe_handler(cur, kcb, regs); @@ -371,6 +365,21 @@ static struct break_hook kprobes_break_ss_hook = { .fn = kprobe_breakpoint_ss_handler, }; +static int __kprobes +kretprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) +{ + if (regs->pc != (unsigned long)__kretprobe_trampoline) + return DBG_HOOK_ERROR; + + regs->pc = kretprobe_trampoline_handler(regs, (void *)regs->regs[29]); + return DBG_HOOK_HANDLED; +} + +static struct break_hook kretprobes_break_hook = { + .imm = KRETPROBES_BRK_IMM, + .fn = kretprobe_breakpoint_handler, +}; + /* * Provide a blacklist of symbols identifying ranges which cannot be kprobed. * This blacklist is exposed to userspace via debugfs (kprobes/blacklist). @@ -396,11 +405,6 @@ int __init arch_populate_kprobe_blacklist(void) return ret; } -void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) -{ - return (void *)kretprobe_trampoline_handler(regs, (void *)regs->regs[29]); -} - void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -420,6 +424,7 @@ int __init arch_init_kprobes(void) { register_kernel_break_hook(&kprobes_break_hook); register_kernel_break_hook(&kprobes_break_ss_hook); + register_kernel_break_hook(&kretprobes_break_hook); return 0; } diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 9a6499bed58b..a362f3dbb3d1 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -4,83 +4,17 @@ */ #include <linux/linkage.h> -#include <asm/asm-offsets.h> +#include <asm/asm-bug.h> #include <asm/assembler.h> .text - .macro save_all_base_regs - stp x0, x1, [sp, #S_X0] - stp x2, x3, [sp, #S_X2] - stp x4, x5, [sp, #S_X4] - stp x6, x7, [sp, #S_X6] - stp x8, x9, [sp, #S_X8] - stp x10, x11, [sp, #S_X10] - stp x12, x13, [sp, #S_X12] - stp x14, x15, [sp, #S_X14] - stp x16, x17, [sp, #S_X16] - stp x18, x19, [sp, #S_X18] - stp x20, x21, [sp, #S_X20] - stp x22, x23, [sp, #S_X22] - stp x24, x25, [sp, #S_X24] - stp x26, x27, [sp, #S_X26] - stp x28, x29, [sp, #S_X28] - add x0, sp, #PT_REGS_SIZE - stp lr, x0, [sp, #S_LR] - /* - * Construct a useful saved PSTATE - */ - mrs x0, nzcv - mrs x1, daif - orr x0, x0, x1 - mrs x1, CurrentEL - orr x0, x0, x1 - mrs x1, SPSel - orr x0, x0, x1 - stp xzr, x0, [sp, #S_PC] - .endm - - .macro restore_all_base_regs - ldr x0, [sp, #S_PSTATE] - and x0, x0, #(PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT) - msr nzcv, x0 - ldp x0, x1, [sp, #S_X0] - ldp x2, x3, [sp, #S_X2] - ldp x4, x5, [sp, #S_X4] - ldp x6, x7, [sp, #S_X6] - ldp x8, x9, [sp, #S_X8] - ldp x10, x11, [sp, #S_X10] - ldp x12, x13, [sp, #S_X12] - ldp x14, x15, [sp, #S_X14] - ldp x16, x17, [sp, #S_X16] - ldp x18, x19, [sp, #S_X18] - ldp x20, x21, [sp, #S_X20] - ldp x22, x23, [sp, #S_X22] - ldp x24, x25, [sp, #S_X24] - ldp x26, x27, [sp, #S_X26] - ldp x28, x29, [sp, #S_X28] - .endm - SYM_CODE_START(__kretprobe_trampoline) - sub sp, sp, #PT_REGS_SIZE - - save_all_base_regs - - /* Setup a frame pointer. */ - add x29, sp, #S_FP - - mov x0, sp - bl trampoline_probe_handler /* - * Replace trampoline address in lr with actual orig_ret_addr return - * address. + * Trigger a breakpoint exception. The PC will be adjusted by + * kretprobe_breakpoint_handler(), and no subsequent instructions will + * be executed from the trampoline. */ - mov lr, x0 - - /* The frame pointer (x29) is restored with other registers. */ - restore_all_base_regs - - add sp, sp, #PT_REGS_SIZE - ret - + brk #KRETPROBES_BRK_IMM + ASM_BUG() SYM_CODE_END(__kretprobe_trampoline) diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 22d0b3252476..4c6d2d712fbd 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -171,17 +171,15 @@ simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs) void __kprobes simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs) { - u64 *load_addr; + unsigned long load_addr; int xn = opcode & 0x1f; - int disp; - disp = ldr_displacement(opcode); - load_addr = (u64 *) (addr + disp); + load_addr = addr + ldr_displacement(opcode); if (opcode & (1 << 30)) /* x0-x30 */ - set_x_reg(regs, xn, *load_addr); + set_x_reg(regs, xn, READ_ONCE(*(u64 *)load_addr)); else /* w0-w30 */ - set_w_reg(regs, xn, *load_addr); + set_w_reg(regs, xn, READ_ONCE(*(u32 *)load_addr)); instruction_pointer_set(regs, instruction_pointer(regs) + 4); } @@ -189,14 +187,18 @@ simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs) void __kprobes simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs) { - s32 *load_addr; + unsigned long load_addr; int xn = opcode & 0x1f; - int disp; - disp = ldr_displacement(opcode); - load_addr = (s32 *) (addr + disp); + load_addr = addr + ldr_displacement(opcode); - set_x_reg(regs, xn, *load_addr); + set_x_reg(regs, xn, READ_ONCE(*(s32 *)load_addr)); instruction_pointer_set(regs, instruction_pointer(regs) + 4); } + +void __kprobes +simulate_nop(u32 opcode, long addr, struct pt_regs *regs) +{ + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); +} diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index e065dc92218e..efb2803ec943 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -16,5 +16,6 @@ void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs); +void simulate_nop(u32 opcode, long addr, struct pt_regs *regs); #endif /* _ARM_KERNEL_KPROBES_SIMULATE_INSN_H */ diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index d49aef2657cd..cb3d05af36e3 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -17,12 +17,20 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *xol_page_kaddr = kmap_atomic(page); void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + /* + * Initial cache maintenance of the xol page done via set_pte_at(). + * Subsequent CMOs only needed if the xol slot changes. + */ + if (!memcmp(dst, src, len)) + goto done; + /* Initialize the slot */ memcpy(dst, src, len); /* flush caches (dcache/icache) */ sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); +done: kunmap_atomic(xol_page_kaddr); } @@ -34,7 +42,7 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - probe_opcode_t insn; + u32 insn; /* TODO: Currently we do not support AARCH32 instruction probing */ if (mm->context.flags & MMCF_AARCH32) @@ -42,7 +50,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; - insn = *(probe_opcode_t *)(&auprobe->insn[0]); + insn = le32_to_cpu(auprobe->insn); switch (arm_probe_decode_insn(insn, &auprobe->api)) { case INSN_REJECTED: @@ -102,13 +110,13 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - probe_opcode_t insn; + u32 insn; unsigned long addr; if (!auprobe->simulate) return false; - insn = *(probe_opcode_t *)(&auprobe->insn[0]); + insn = le32_to_cpu(auprobe->insn); addr = instruction_pointer(regs); if (auprobe->api.handler) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 7387b68c745b..42faebb7b712 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -43,11 +43,13 @@ #include <linux/stacktrace.h> #include <asm/alternative.h> +#include <asm/arch_timer.h> #include <asm/compat.h> #include <asm/cpufeature.h> #include <asm/cacheflush.h> #include <asm/exec.h> #include <asm/fpsimd.h> +#include <asm/gcs.h> #include <asm/mmu_context.h> #include <asm/mte.h> #include <asm/processor.h> @@ -226,7 +228,7 @@ void __show_regs(struct pt_regs *regs) printk("sp : %016llx\n", sp); if (system_uses_irq_prio_masking()) - printk("pmr_save: %08llx\n", regs->pmr_save); + printk("pmr: %08x\n", regs->pmr); i = top_reg; @@ -271,17 +273,73 @@ static void flush_tagged_addr_state(void) clear_thread_flag(TIF_TAGGED_ADDR); } +static void flush_poe(void) +{ + if (!system_supports_poe()) + return; + + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + +#ifdef CONFIG_ARM64_GCS + +static void flush_gcs(void) +{ + if (!system_supports_gcs()) + return; + + gcs_free(current); + current->thread.gcs_el0_mode = 0; + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); + write_sysreg_s(0, SYS_GCSPR_EL0); +} + +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + unsigned long gcs; + + if (!system_supports_gcs()) + return 0; + + p->thread.gcs_base = 0; + p->thread.gcs_size = 0; + + gcs = gcs_alloc_thread_stack(p, args); + if (IS_ERR_VALUE(gcs)) + return PTR_ERR((void *)gcs); + + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; + + return 0; +} + +#else + +static void flush_gcs(void) { } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + return 0; +} + +#endif + void flush_thread(void) { fpsimd_flush_thread(); tls_thread_flush(); flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); + flush_poe(); + flush_gcs(); } void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); + gcs_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) @@ -290,9 +348,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) fpsimd_preserve_current_state(); *dst = *src; - /* We rely on the above assignment to initialize dst's thread_flags: */ - BUILD_BUG_ON(!IS_ENABLED(CONFIG_THREAD_INFO_IN_TASK)); - /* * Detach src's sve_state (if any) from dst so that it does not * get erroneously used or freed prematurely. dst's copies @@ -348,6 +403,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + int ret; memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); @@ -374,6 +430,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (system_supports_tpidr2()) p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + if (system_supports_poe()) + p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (stack_start) { if (is_compat_thread(task_thread_info(p))) childregs->compat_sp = stack_start; @@ -389,6 +448,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) p->thread.uw.tp_value = tls; p->thread.tpidr2_el0 = 0; } + + ret = copy_thread_gcs(p, args); + if (ret != 0) + return ret; } else { /* * A kthread has no context to ERET to, so ensure any buggy @@ -399,9 +462,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; + childregs->stackframe.type = FRAME_META_TYPE_FINAL; p->thread.cpu_context.x19 = (unsigned long)args->fn; p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; + + if (system_supports_poe()) + p->thread.por_el0 = POR_EL0_INIT; } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; @@ -409,7 +476,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * For the benefit of the unwinder, set up childregs->stackframe * as the final frame for the new task. */ - p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; + p->thread.cpu_context.fp = (unsigned long)&childregs->stackframe; ptrace_hw_copy_thread(p); @@ -429,7 +496,7 @@ static void tls_thread_switch(struct task_struct *next) if (is_compat_thread(task_thread_info(next))) write_sysreg(next->thread.uw.tp_value, tpidrro_el0); - else if (!arm64_kernel_unmapped_at_el0()) + else write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0); @@ -474,28 +541,104 @@ static void entry_task_switch(struct task_struct *next) __this_cpu_write(__entry_task, next); } +#ifdef CONFIG_ARM64_GCS + +void gcs_preserve_current_state(void) +{ + current->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); +} + +static void gcs_thread_switch(struct task_struct *next) +{ + if (!system_supports_gcs()) + return; + + /* GCSPR_EL0 is always readable */ + gcs_preserve_current_state(); + write_sysreg_s(next->thread.gcspr_el0, SYS_GCSPR_EL0); + + if (current->thread.gcs_el0_mode != next->thread.gcs_el0_mode) + gcs_set_el0_mode(next); + + /* + * Ensure that GCS memory effects of the 'prev' thread are + * ordered before other memory accesses with release semantics + * (or preceded by a DMB) on the current PE. In addition, any + * memory accesses with acquire semantics (or succeeded by a + * DMB) are ordered before GCS memory effects of the 'next' + * thread. This will ensure that the GCS memory effects are + * visible to other PEs in case of migration. + */ + if (task_gcs_el0_enabled(current) || task_gcs_el0_enabled(next)) + gcsb_dsync(); +} + +#else + +static void gcs_thread_switch(struct task_struct *next) +{ +} + +#endif + /* - * ARM erratum 1418040 handling, affecting the 32bit view of CNTVCT. - * Ensure access is disabled when switching to a 32bit task, ensure - * access is enabled when switching to a 64bit task. + * Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of + * CNTVCT, various other errata which require trapping all CNTVCT{,_EL0} + * accesses and prctl(PR_SET_TSC). Ensure access is disabled iff a workaround is + * required or PR_TSC_SIGSEGV is set. */ -static void erratum_1418040_thread_switch(struct task_struct *next) +static void update_cntkctl_el1(struct task_struct *next) { - if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) || - !this_cpu_has_cap(ARM64_WORKAROUND_1418040)) - return; + struct thread_info *ti = task_thread_info(next); - if (is_compat_thread(task_thread_info(next))) + if (test_ti_thread_flag(ti, TIF_TSC_SIGSEGV) || + has_erratum_handler(read_cntvct_el0) || + (IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) && + this_cpu_has_cap(ARM64_WORKAROUND_1418040) && + is_compat_thread(ti))) sysreg_clear_set(cntkctl_el1, ARCH_TIMER_USR_VCT_ACCESS_EN, 0); else sysreg_clear_set(cntkctl_el1, 0, ARCH_TIMER_USR_VCT_ACCESS_EN); } -static void erratum_1418040_new_exec(void) +static void cntkctl_thread_switch(struct task_struct *prev, + struct task_struct *next) +{ + if ((read_ti_thread_flags(task_thread_info(prev)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV)) != + (read_ti_thread_flags(task_thread_info(next)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV))) + update_cntkctl_el1(next); +} + +static int do_set_tsc_mode(unsigned int val) { + bool tsc_sigsegv; + + if (val == PR_TSC_SIGSEGV) + tsc_sigsegv = true; + else if (val == PR_TSC_ENABLE) + tsc_sigsegv = false; + else + return -EINVAL; + preempt_disable(); - erratum_1418040_thread_switch(current); + update_thread_flag(TIF_TSC_SIGSEGV, tsc_sigsegv); + update_cntkctl_el1(current); preempt_enable(); + + return 0; +} + +static void permission_overlay_switch(struct task_struct *next) +{ + if (!system_supports_poe()) + return; + + current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (current->thread.por_el0 != next->thread.por_el0) { + write_sysreg_s(next->thread.por_el0, SYS_POR_EL0); + } } /* @@ -531,8 +674,10 @@ struct task_struct *__switch_to(struct task_struct *prev, contextidr_thread_switch(next); entry_task_switch(next); ssbs_thread_switch(next); - erratum_1418040_thread_switch(next); + cntkctl_thread_switch(prev, next); ptrauth_thread_switch_user(next); + permission_overlay_switch(next); + gcs_thread_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case @@ -648,7 +793,7 @@ void arch_setup_new_exec(void) current->mm->context.flags = mmflags; ptrauth_thread_init_user(); mte_thread_init_user(); - erratum_1418040_new_exec(); + do_set_tsc_mode(PR_TSC_ENABLE); if (task_spec_ssb_noexec(current)) { arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS, @@ -714,7 +859,7 @@ long get_tagged_addr_ctrl(struct task_struct *task) * disable it for tasks that already opted in to the relaxed ABI. */ -static struct ctl_table tagged_addr_sysctl_table[] = { +static const struct ctl_table tagged_addr_sysctl_table[] = { { .procname = "tagged_addr_disabled", .mode = 0644, @@ -757,3 +902,26 @@ int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, return prot; } #endif + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (is_compat_task()) + return -EINVAL; + + if (test_thread_flag(TIF_TSC_SIGSEGV)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (is_compat_task()) + return -EINVAL; + + return do_set_tsc_mode(val); +} diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 6268a13a1d58..da53722f95d4 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -558,6 +558,18 @@ static enum mitigation_state spectre_v4_enable_hw_mitigation(void) /* SCTLR_EL1.DSSBS was initialised to 0 during boot */ set_pstate_ssbs(0); + + /* + * SSBS is self-synchronizing and is intended to affect subsequent + * speculative instructions, but some CPUs can speculate with a stale + * value of SSBS. + * + * Mitigate this with an unconditional speculation barrier, as CPUs + * could mis-speculate branches and bypass a conditional barrier. + */ + if (IS_ENABLED(CONFIG_ARM64_ERRATUM_3194386)) + spec_bar(); + return SPECTRE_MITIGATED; } diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c index 29a8e444db83..fabd732d0a2d 100644 --- a/arch/arm64/kernel/psci.c +++ b/arch/arm64/kernel/psci.c @@ -40,7 +40,7 @@ static int cpu_psci_cpu_boot(unsigned int cpu) { phys_addr_t pa_secondary_entry = __pa_symbol(secondary_entry); int err = psci_ops.cpu_on(cpu_logical_map(cpu), pa_secondary_entry); - if (err) + if (err && err != -EPERM) pr_err("failed to boot CPU%d (%d)\n", cpu, err); return err; diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index e3bef38fc2e2..f79b0d5f71ac 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -34,6 +34,7 @@ #include <asm/cpufeature.h> #include <asm/debug-monitors.h> #include <asm/fpsimd.h> +#include <asm/gcs.h> #include <asm/mte.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> @@ -174,7 +175,6 @@ static void ptrace_hbptriggered(struct perf_event *bp, struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); const char *desc = "Hardware breakpoint trap (ptrace)"; -#ifdef CONFIG_COMPAT if (is_compat_task()) { int si_errno = 0; int i; @@ -196,7 +196,7 @@ static void ptrace_hbptriggered(struct perf_event *bp, desc); return; } -#endif + arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); } @@ -698,6 +698,41 @@ static int tls_set(struct task_struct *target, const struct user_regset *regset, return ret; } +static int fpmr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_fpmr()) + return -EINVAL; + + if (target == current) + fpsimd_preserve_current_state(); + + return membuf_store(&to, target->thread.uw.fpmr); +} + +static int fpmr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + unsigned long fpmr; + + if (!system_supports_fpmr()) + return -EINVAL; + + fpmr = target->thread.uw.fpmr; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpmr, 0, count); + if (ret) + return ret; + + target->thread.uw.fpmr = fpmr; + + fpsimd_flush_task_state(target); + + return 0; +} + static int system_call_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) @@ -729,7 +764,6 @@ static void sve_init_header_from_task(struct user_sve_header *header, { unsigned int vq; bool active; - bool fpsimd_only; enum vec_type task_type; memset(header, 0, sizeof(*header)); @@ -745,12 +779,10 @@ static void sve_init_header_from_task(struct user_sve_header *header, case ARM64_VEC_SVE: if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; - fpsimd_only = !test_tsk_thread_flag(target, TIF_SVE); break; case ARM64_VEC_SME: if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; - fpsimd_only = false; break; default: WARN_ON_ONCE(1); @@ -758,7 +790,7 @@ static void sve_init_header_from_task(struct user_sve_header *header, } if (active) { - if (fpsimd_only) { + if (target->thread.fp_type == FP_STATE_FPSIMD) { header->flags |= SVE_PT_REGS_FPSIMD; } else { header->flags |= SVE_PT_REGS_SVE; @@ -869,7 +901,11 @@ static int sve_set_common(struct task_struct *target, if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC VL not the + * current VL: + */ vq = sve_vq_from_vl(task_get_vl(target, type)); /* Enter/exit streaming mode */ @@ -1096,7 +1132,11 @@ static int za_set(struct task_struct *target, if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC rather than + * current VL: + */ vq = sve_vq_from_vl(task_get_sme_vl(target)); /* Ensure there is some SVE storage for streaming mode */ @@ -1389,7 +1429,7 @@ static int tagged_addr_ctrl_get(struct task_struct *target, { long ctrl = get_tagged_addr_ctrl(target); - if (IS_ERR_VALUE(ctrl)) + if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) return ctrl; return membuf_write(&to, &ctrl, sizeof(ctrl)); @@ -1403,6 +1443,10 @@ static int tagged_addr_ctrl_set(struct task_struct *target, const struct int ret; long ctrl; + ctrl = get_tagged_addr_ctrl(target); + if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) + return ctrl; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); if (ret) return ret; @@ -1411,6 +1455,101 @@ static int tagged_addr_ctrl_set(struct task_struct *target, const struct } #endif +#ifdef CONFIG_ARM64_POE +static int poe_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_poe()) + return -EINVAL; + + return membuf_write(&to, &target->thread.por_el0, + sizeof(target->thread.por_el0)); +} + +static int poe_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + if (!system_supports_poe()) + return -EINVAL; + + ctrl = target->thread.por_el0; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + target->thread.por_el0 = ctrl; + + return 0; +} +#endif + +#ifdef CONFIG_ARM64_GCS +static void task_gcs_to_user(struct user_gcs *user_gcs, + const struct task_struct *target) +{ + user_gcs->features_enabled = target->thread.gcs_el0_mode; + user_gcs->features_locked = target->thread.gcs_el0_locked; + user_gcs->gcspr_el0 = target->thread.gcspr_el0; +} + +static void task_gcs_from_user(struct task_struct *target, + const struct user_gcs *user_gcs) +{ + target->thread.gcs_el0_mode = user_gcs->features_enabled; + target->thread.gcs_el0_locked = user_gcs->features_locked; + target->thread.gcspr_el0 = user_gcs->gcspr_el0; +} + +static int gcs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + if (target == current) + gcs_preserve_current_state(); + + task_gcs_to_user(&user_gcs, target); + + return membuf_write(&to, &user_gcs, sizeof(user_gcs)); +} + +static int gcs_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + task_gcs_to_user(&user_gcs, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1); + if (ret) + return ret; + + if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + task_gcs_from_user(target, &user_gcs); + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1419,6 +1558,7 @@ enum aarch64_regset { REGSET_HW_BREAK, REGSET_HW_WATCH, #endif + REGSET_FPMR, REGSET_SYSTEM_CALL, #ifdef CONFIG_ARM64_SVE REGSET_SVE, @@ -1439,6 +1579,12 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI REGSET_TAGGED_ADDR_CTRL, #endif +#ifdef CONFIG_ARM64_POE + REGSET_POE, +#endif +#ifdef CONFIG_ARM64_GCS + REGSET_GCS, +#endif }; static const struct user_regset aarch64_regsets[] = { @@ -1497,6 +1643,14 @@ static const struct user_regset aarch64_regsets[] = { .regset_get = system_call_get, .set = system_call_set, }, + [REGSET_FPMR] = { + .core_note_type = NT_ARM_FPMR, + .n = 1, + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = fpmr_get, + .set = fpmr_set, + }, #ifdef CONFIG_ARM64_SVE [REGSET_SVE] = { /* Scalable Vector Extension */ .core_note_type = NT_ARM_SVE, @@ -1590,6 +1744,26 @@ static const struct user_regset aarch64_regsets[] = { .set = tagged_addr_ctrl_set, }, #endif +#ifdef CONFIG_ARM64_POE + [REGSET_POE] = { + .core_note_type = NT_ARM_POE, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = poe_get, + .set = poe_set, + }, +#endif +#ifdef CONFIG_ARM64_GCS + [REGSET_GCS] = { + .core_note_type = NT_ARM_GCS, + .n = sizeof(struct user_gcs) / sizeof(u64), + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = gcs_get, + .set = gcs_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { @@ -1597,7 +1771,6 @@ static const struct user_regset_view user_aarch64_view = { .regsets = aarch64_regsets, .n = ARRAY_SIZE(aarch64_regsets) }; -#ifdef CONFIG_COMPAT enum compat_regset { REGSET_COMPAT_GPR, REGSET_COMPAT_VFP, @@ -1854,6 +2027,7 @@ static const struct user_regset_view user_aarch32_ptrace_view = { .regsets = aarch32_ptrace_regsets, .n = ARRAY_SIZE(aarch32_ptrace_regsets) }; +#ifdef CONFIG_COMPAT static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t __user *ret) { @@ -2115,7 +2289,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, const struct user_regset_view *task_user_regset_view(struct task_struct *task) { -#ifdef CONFIG_COMPAT /* * Core dumping of 32-bit tasks or compat ptrace requests must use the * user_aarch32_view compatible with arm32. Native ptrace requests on @@ -2126,7 +2299,7 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) return &user_aarch32_view; else if (is_compat_thread(task_thread_info(task))) return &user_aarch32_ptrace_view; -#endif + return &user_aarch64_view; } diff --git a/arch/arm64/kernel/reloc_test_core.c b/arch/arm64/kernel/reloc_test_core.c index 99f2ffe9fc05..5b0891146054 100644 --- a/arch/arm64/kernel/reloc_test_core.c +++ b/arch/arm64/kernel/reloc_test_core.c @@ -74,4 +74,5 @@ static void __exit reloc_test_exit(void) module_init(reloc_test_init); module_exit(reloc_test_exit); +MODULE_DESCRIPTION("Relocation testing module"); MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c new file mode 100644 index 000000000000..ce4778141ec7 --- /dev/null +++ b/arch/arm64/kernel/rsi.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include <linux/jump_label.h> +#include <linux/memblock.h> +#include <linux/psci.h> +#include <linux/swiotlb.h> +#include <linux/cc_platform.h> +#include <linux/platform_device.h> + +#include <asm/io.h> +#include <asm/mem_encrypt.h> +#include <asm/rsi.h> + +static struct realm_config config; + +unsigned long prot_ns_shared; +EXPORT_SYMBOL(prot_ns_shared); + +DEFINE_STATIC_KEY_FALSE_RO(rsi_present); +EXPORT_SYMBOL(rsi_present); + +bool cc_platform_has(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return is_realm_world(); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(cc_platform_has); + +static bool rsi_version_matches(void) +{ + unsigned long ver_lower, ver_higher; + unsigned long ret = rsi_request_version(RSI_ABI_VERSION, + &ver_lower, + &ver_higher); + + if (ret == SMCCC_RET_NOT_SUPPORTED) + return false; + + if (ret != RSI_SUCCESS) { + pr_err("RME: RMM doesn't support RSI version %lu.%lu. Supported range: %lu.%lu-%lu.%lu\n", + RSI_ABI_VERSION_MAJOR, RSI_ABI_VERSION_MINOR, + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower), + RSI_ABI_VERSION_GET_MAJOR(ver_higher), + RSI_ABI_VERSION_GET_MINOR(ver_higher)); + return false; + } + + pr_info("RME: Using RSI version %lu.%lu\n", + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower)); + + return true; +} + +static void __init arm64_rsi_setup_memory(void) +{ + u64 i; + phys_addr_t start, end; + + /* + * Iterate over the available memory ranges and convert the state to + * protected memory. We should take extra care to ensure that we DO NOT + * permit any "DESTROYED" pages to be converted to "RAM". + * + * panic() is used because if the attempt to switch the memory to + * protected has failed here, then future accesses to the memory are + * simply going to be reflected as a SEA (Synchronous External Abort) + * which we can't handle. Bailing out early prevents the guest limping + * on and dying later. + */ + for_each_mem_range(i, &start, &end) { + if (rsi_set_memory_range_protected_safe(start, end)) { + panic("Failed to set memory range to protected: %pa-%pa", + &start, &end); + } + } +} + +bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +{ + enum ripas ripas; + phys_addr_t end, top; + + /* Overflow ? */ + if (WARN_ON(base + size <= base)) + return false; + + end = ALIGN(base + size, RSI_GRANULE_SIZE); + base = ALIGN_DOWN(base, RSI_GRANULE_SIZE); + + while (base < end) { + if (WARN_ON(rsi_ipa_state_get(base, end, &ripas, &top))) + break; + if (WARN_ON(top <= base)) + break; + if (ripas != RSI_RIPAS_DEV) + break; + base = top; + } + + return base >= end; +} +EXPORT_SYMBOL(__arm64_is_protected_mmio); + +static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) +{ + if (__arm64_is_protected_mmio(phys, size)) + *prot = pgprot_encrypted(*prot); + else + *prot = pgprot_decrypted(*prot); + + return 0; +} + +void __init arm64_rsi_init(void) +{ + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) + return; + if (!rsi_version_matches()) + return; + if (WARN_ON(rsi_get_realm_config(&config))) + return; + prot_ns_shared = BIT(config.ipa_bits - 1); + + if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) + return; + + if (realm_register_memory_enc_ops()) + return; + + arm64_rsi_setup_memory(); + + static_branch_enable(&rsi_present); +} + +static struct platform_device rsi_dev = { + .name = RSI_PDEV_NAME, + .id = PLATFORM_DEVID_NONE +}; + +static int __init arm64_create_dummy_rsi_dev(void) +{ + if (is_realm_world() && + platform_device_register(&rsi_dev)) + pr_err("failed to register rsi platform device\n"); + return 0; +} + +arch_initcall(arm64_create_dummy_rsi_dev) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 42c690bb2d60..85104587f849 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -43,6 +43,7 @@ #include <asm/cpu_ops.h> #include <asm/kasan.h> #include <asm/numa.h> +#include <asm/rsi.h> #include <asm/scs.h> #include <asm/sections.h> #include <asm/setup.h> @@ -166,21 +167,6 @@ static void __init smp_build_mpidr_hash(void) pr_warn("Large number of MPIDR hash buckets detected\n"); } -static void *early_fdt_ptr __initdata; - -void __init *get_early_fdt_ptr(void) -{ - return early_fdt_ptr; -} - -asmlinkage void __init early_fdt_map(u64 dt_phys) -{ - int fdt_size; - - early_fixmap_init(); - early_fdt_ptr = fixmap_remap_fdt(dt_phys, &fdt_size, PAGE_KERNEL); -} - static void __init setup_machine_fdt(phys_addr_t dt_phys) { int size; @@ -190,7 +176,11 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys) if (dt_virt) memblock_reserve(dt_phys, size); - if (!dt_virt || !early_init_dt_scan(dt_virt)) { + /* + * dt_virt is a fixmap address, hence __pa(dt_virt) can't be used. + * Pass dt_phys directly. + */ + if (!early_init_dt_scan(dt_virt, dt_phys)) { pr_crit("\n" "Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n" "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n" @@ -233,9 +223,7 @@ static void __init request_standard_resources(void) num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); - standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); - if (!standard_resources) - panic("%s: Failed to allocate %zu bytes\n", __func__, res_size); + standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; @@ -298,13 +286,6 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) kaslr_init(); - /* - * If know now we are going to need KPTI then use non-global - * mappings from the start, avoiding the cost of rewriting - * everything later. - */ - arm64_use_ng_mappings = kaslr_requires_kpti(); - early_fixmap_init(); early_ioremap_init(); @@ -320,9 +301,15 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) dynamic_scs_init(); /* - * Unmask asynchronous aborts and fiq after bringing up possible - * earlycon. (Report possible System Errors once we can report this - * occurred). + * The primary CPU enters the kernel with all DAIF exceptions masked. + * + * We must unmask Debug and SError before preemption or scheduling is + * possible to ensure that these are consistently unmasked across + * threads, and we want to unmask SError as soon as possible after + * initializing earlycon so that we can report any SErrors immediately. + * + * IRQ and FIQ will be unmasked after the root irqchip has been + * detected and initialized. */ local_daif_restore(DAIF_PROCCTX_NOIRQ); @@ -367,13 +354,12 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) else psci_acpi_init(); + arm64_rsi_init(); + init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); - /* Init percpu seeds for random tags after cpus are set up. */ - kasan_init_sw_tags(); - #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Make sure init_thread_info.ttbr0 always generates translation diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 425b1bc17a3f..99ea26d400ff 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -16,15 +16,17 @@ #include <linux/uaccess.h> #include <linux/sizes.h> #include <linux/string.h> -#include <linux/resume_user_mode.h> #include <linux/ratelimit.h> +#include <linux/rseq.h> #include <linux/syscalls.h> +#include <linux/pkeys.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/elf.h> #include <asm/exception.h> #include <asm/cacheflush.h> +#include <asm/gcs.h> #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> @@ -34,6 +36,8 @@ #include <asm/traps.h> #include <asm/vdso.h> +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. */ @@ -42,11 +46,6 @@ struct rt_sigframe { struct ucontext uc; }; -struct frame_record { - u64 fp; - u64 lr; -}; - struct rt_sigframe_user_layout { struct rt_sigframe __user *sigframe; struct frame_record __user *next_frame; @@ -56,18 +55,73 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; + unsigned long gcs_offset; unsigned long sve_offset; unsigned long tpidr2_offset; unsigned long za_offset; unsigned long zt_offset; + unsigned long fpmr_offset; + unsigned long poe_offset; unsigned long extra_offset; unsigned long end_offset; }; -#define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16) +/* + * Holds any EL0-controlled state that influences unprivileged memory accesses. + * This includes both accesses done in userspace and uaccess done in the kernel. + * + * This state needs to be carefully managed to ensure that it doesn't cause + * uaccess to fail when setting up the signal frame, and the signal handler + * itself also expects a well-defined state when entered. + */ +struct user_access_state { + u64 por_el0; +}; + #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) +/* + * Save the user access state into ua_state and reset it to disable any + * restrictions. + */ +static void save_reset_user_access_state(struct user_access_state *ua_state) +{ + if (system_supports_poe()) { + u64 por_enable_all = 0; + + for (int pkey = 0; pkey < arch_max_pkey(); pkey++) + por_enable_all |= POE_RXW << (pkey * POR_BITS_PER_PKEY); + + ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); + write_sysreg_s(por_enable_all, SYS_POR_EL0); + /* Ensure that any subsequent uaccess observes the updated value */ + isb(); + } +} + +/* + * Set the user access state for invoking the signal handler. + * + * No uaccess should be done after that function is called. + */ +static void set_handler_user_access_state(void) +{ + if (system_supports_poe()) + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + +/* + * Restore the user access state to the values saved in ua_state. + * + * No uaccess should be done after that function is called. + */ +static void restore_user_access_state(const struct user_access_state *ua_state) +{ + if (system_supports_poe()) + write_sysreg_s(ua_state->por_el0, SYS_POR_EL0); +} + static void init_user_layout(struct rt_sigframe_user_layout *user) { const size_t reserved_size = @@ -182,6 +236,12 @@ struct user_ctxs { u32 za_size; struct zt_context __user *zt; u32 zt_size; + struct fpmr_context __user *fpmr; + u32 fpmr_size; + struct poe_context __user *poe; + u32 poe_size; + struct gcs_context __user *gcs; + u32 gcs_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -227,6 +287,61 @@ static int restore_fpsimd_context(struct user_ctxs *user) return err ? -EFAULT : 0; } +static int preserve_fpmr_context(struct fpmr_context __user *ctx) +{ + int err = 0; + + current->thread.uw.fpmr = read_sysreg_s(SYS_FPMR); + + __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); + + return err; +} + +static int restore_fpmr_context(struct user_ctxs *user) +{ + u64 fpmr; + int err = 0; + + if (user->fpmr_size != sizeof(*user->fpmr)) + return -EINVAL; + + __get_user_error(fpmr, &user->fpmr->fpmr, err); + if (!err) + write_sysreg_s(fpmr, SYS_FPMR); + + return err; +} + +static int preserve_poe_context(struct poe_context __user *ctx, + const struct user_access_state *ua_state) +{ + int err = 0; + + __put_user_error(POE_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(ua_state->por_el0, &ctx->por_el0, err); + + return err; +} + +static int restore_poe_context(struct user_ctxs *user, + struct user_access_state *ua_state) +{ + u64 por_el0; + int err = 0; + + if (user->poe_size != sizeof(*user->poe)) + return -EINVAL; + + __get_user_error(por_el0, &(user->poe->por_el0), err); + if (!err) + ua_state->por_el0 = por_el0; + + return err; +} #ifdef CONFIG_ARM64_SVE @@ -574,6 +689,82 @@ extern int restore_zt_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SME */ +#ifdef CONFIG_ARM64_GCS + +static int preserve_gcs_context(struct gcs_context __user *ctx) +{ + int err = 0; + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * If GCS is enabled we will add a cap token to the frame, + * include it in the GCSPR_EL0 we report to support stack + * switching via sigreturn if GCS is enabled. We do not allow + * enabling via sigreturn so the token is only relevant for + * threads with GCS enabled. + */ + if (task_gcs_el0_enabled(current)) + gcspr -= 8; + + __put_user_error(GCS_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(gcspr, &ctx->gcspr, err); + __put_user_error(0, &ctx->reserved, err); + __put_user_error(current->thread.gcs_el0_mode, + &ctx->features_enabled, err); + + return err; +} + +static int restore_gcs_context(struct user_ctxs *user) +{ + u64 gcspr, enabled; + int err = 0; + + if (user->gcs_size != sizeof(*user->gcs)) + return -EINVAL; + + __get_user_error(gcspr, &user->gcs->gcspr, err); + __get_user_error(enabled, &user->gcs->features_enabled, err); + if (err) + return err; + + /* Don't allow unknown modes */ + if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + err = gcs_check_locked(current, enabled); + if (err != 0) + return err; + + /* Don't allow enabling */ + if (!task_gcs_el0_enabled(current) && + (enabled & PR_SHADOW_STACK_ENABLE)) + return -EINVAL; + + /* If we are disabling disable everything */ + if (!(enabled & PR_SHADOW_STACK_ENABLE)) + enabled = 0; + + current->thread.gcs_el0_mode = enabled; + + /* + * We let userspace set GCSPR_EL0 to anything here, we will + * validate later in gcs_restore_signal(). + */ + write_sysreg_s(gcspr, SYS_GCSPR_EL0); + + return 0; +} + +#else /* ! CONFIG_ARM64_GCS */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_gcs_context(void __user *ctx); +extern int restore_gcs_context(struct user_ctxs *user); + +#endif /* ! CONFIG_ARM64_GCS */ + static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { @@ -590,6 +781,9 @@ static int parse_user_sigframe(struct user_ctxs *user, user->tpidr2 = NULL; user->za = NULL; user->zt = NULL; + user->fpmr = NULL; + user->poe = NULL; + user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -640,6 +834,17 @@ static int parse_user_sigframe(struct user_ctxs *user, /* ignore */ break; + case POE_MAGIC: + if (!system_supports_poe()) + goto invalid; + + if (user->poe) + goto invalid; + + user->poe = (struct poe_context __user *)head; + user->poe_size = size; + break; + case SVE_MAGIC: if (!system_supports_sve() && !system_supports_sme()) goto invalid; @@ -684,6 +889,28 @@ static int parse_user_sigframe(struct user_ctxs *user, user->zt_size = size; break; + case FPMR_MAGIC: + if (!system_supports_fpmr()) + goto invalid; + + if (user->fpmr) + goto invalid; + + user->fpmr = (struct fpmr_context __user *)head; + user->fpmr_size = size; + break; + + case GCS_MAGIC: + if (!system_supports_gcs()) + goto invalid; + + if (user->gcs) + goto invalid; + + user->gcs = (struct gcs_context __user *)head; + user->gcs_size = size; + break; + case EXTRA_MAGIC: if (have_extra_context) goto invalid; @@ -767,7 +994,8 @@ invalid: } static int restore_sigframe(struct pt_regs *regs, - struct rt_sigframe __user *sf) + struct rt_sigframe __user *sf, + struct user_access_state *ua_state) { sigset_t set; int i, err; @@ -803,22 +1031,84 @@ static int restore_sigframe(struct pt_regs *regs, err = restore_fpsimd_context(&user); } + if (err == 0 && system_supports_gcs() && user.gcs) + err = restore_gcs_context(&user); + if (err == 0 && system_supports_tpidr2() && user.tpidr2) err = restore_tpidr2_context(&user); + if (err == 0 && system_supports_fpmr() && user.fpmr) + err = restore_fpmr_context(&user); + if (err == 0 && system_supports_sme() && user.za) err = restore_za_context(&user); if (err == 0 && system_supports_sme2() && user.zt) err = restore_zt_context(&user); + if (err == 0 && system_supports_poe() && user.poe) + err = restore_poe_context(&user, ua_state); + return err; } +#ifdef CONFIG_ARM64_GCS +static int gcs_restore_signal(void) +{ + u64 gcspr_el0, cap; + int ret; + + if (!system_supports_gcs()) + return 0; + + if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) + return 0; + + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Ensure that any changes to the GCS done via GCS operations + * are visible to the normal reads we do to validate the + * token. + */ + gcsb_dsync(); + + /* + * GCSPR_EL0 should be pointing at a capped GCS, read the cap. + * We don't enforce that this is in a GCS page, if it is not + * then faults will be generated on GCS operations - the main + * concern is to protect GCS pages. + */ + ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0, + sizeof(cap)); + if (ret) + return -EFAULT; + + /* + * Check that the cap is the actual GCS before replacing it. + */ + if (cap != GCS_SIGNAL_CAP(gcspr_el0)) + return -EINVAL; + + /* Invalidate the token to prevent reuse */ + put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret); + if (ret != 0) + return -EFAULT; + + write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0); + + return 0; +} + +#else +static int gcs_restore_signal(void) { return 0; } +#endif + SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; + struct user_access_state ua_state; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; @@ -835,12 +1125,17 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(frame, sizeof (*frame))) goto badframe; - if (restore_sigframe(regs, frame)) + if (restore_sigframe(regs, frame, &ua_state)) + goto badframe; + + if (gcs_restore_signal()) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; + restore_user_access_state(&ua_state); + return regs->regs[0]; badframe: @@ -875,6 +1170,15 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } +#ifdef CONFIG_ARM64_GCS + if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) { + err = sigframe_alloc(user, &user->gcs_offset, + sizeof(struct gcs_context)); + if (err) + return err; + } +#endif + if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; @@ -928,11 +1232,26 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, } } + if (system_supports_fpmr()) { + err = sigframe_alloc(user, &user->fpmr_offset, + sizeof(struct fpmr_context)); + if (err) + return err; + } + + if (system_supports_poe()) { + err = sigframe_alloc(user, &user->poe_offset, + sizeof(struct poe_context)); + if (err) + return err; + } + return sigframe_alloc_end(user); } static int setup_sigframe(struct rt_sigframe_user_layout *user, - struct pt_regs *regs, sigset_t *set) + struct pt_regs *regs, sigset_t *set, + const struct user_access_state *ua_state) { int i, err = 0; struct rt_sigframe __user *sf = user->sigframe; @@ -968,6 +1287,12 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } + if (system_supports_gcs() && err == 0 && user->gcs_offset) { + struct gcs_context __user *gcs_ctx = + apply_user_offset(user, user->gcs_offset); + err |= preserve_gcs_context(gcs_ctx); + } + /* Scalable Vector Extension state (including streaming), if present */ if ((system_supports_sve() || system_supports_sme()) && err == 0 && user->sve_offset) { @@ -983,6 +1308,20 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= preserve_tpidr2_context(tpidr2_ctx); } + /* FPMR if supported */ + if (system_supports_fpmr() && err == 0) { + struct fpmr_context __user *fpmr_ctx = + apply_user_offset(user, user->fpmr_offset); + err |= preserve_fpmr_context(fpmr_ctx); + } + + if (system_supports_poe() && err == 0) { + struct poe_context __user *poe_ctx = + apply_user_offset(user, user->poe_offset); + + err |= preserve_poe_context(poe_ctx, ua_state); + } + /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = @@ -1071,15 +1410,81 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +#ifdef CONFIG_ARM64_GCS + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + u64 gcspr_el0; + int ret = 0; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(current)) + return 0; + + /* + * We are entering a signal handler, current register state is + * active. + */ + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Push a cap and the GCS entry for the trampoline onto the GCS. + */ + put_user_gcs((unsigned long)sigtramp, + (unsigned long __user *)(gcspr_el0 - 16), &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 8), + (unsigned long __user *)(gcspr_el0 - 8), &ret); + if (ret != 0) + return ret; + + gcspr_el0 -= 16; + write_sysreg_s(gcspr_el0, SYS_GCSPR_EL0); + + return 0; +} +#else + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + return 0; +} + +#endif + +static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; + int err; + + if (ksig->ka.sa.sa_flags & SA_RESTORER) + sigtramp = ksig->ka.sa.sa_restorer; + else + sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); + + err = gcs_signal_entry(sigtramp, ksig); + if (err) + return err; + + /* + * We must not fail from this point onwards. We are going to update + * registers, including SP, in order to invoke the signal handler. If + * we failed and attempted to deliver a nested SIGSEGV to a handler + * after that point, the subsequent sigreturn would end up restoring + * the (partial) state for the original signal handler. + */ regs->regs[0] = usig; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + regs->regs[1] = (unsigned long)&user->sigframe->info; + regs->regs[2] = (unsigned long)&user->sigframe->uc; + } regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; - regs->pc = (unsigned long)ka->sa.sa_handler; + regs->regs[30] = (unsigned long)sigtramp; + regs->pc = (unsigned long)ksig->ka.sa.sa_handler; /* * Signal delivery is a (wacky) indirect function call in @@ -1119,12 +1524,7 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, sme_smstop(); } - if (ka->sa.sa_flags & SA_RESTORER) - sigtramp = ka->sa.sa_restorer; - else - sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); - - regs->regs[30] = (unsigned long)sigtramp; + return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -1132,6 +1532,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, { struct rt_sigframe_user_layout user; struct rt_sigframe __user *frame; + struct user_access_state ua_state; int err = 0; fpsimd_signal_preserve_current_state(); @@ -1139,21 +1540,29 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, if (get_sigframe(&user, ksig, regs)) return 1; + save_reset_user_access_state(&ua_state); frame = user.sigframe; __put_user_error(0, &frame->uc.uc_flags, err); __put_user_error(NULL, &frame->uc.uc_link, err); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); - err |= setup_sigframe(&user, regs, set); - if (err == 0) { - setup_return(regs, &ksig->ka, &user, usig); - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - regs->regs[1] = (unsigned long)&frame->info; - regs->regs[2] = (unsigned long)&frame->uc; - } - } + err |= setup_sigframe(&user, regs, set, &ua_state); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + err |= copy_siginfo_to_user(&frame->info, &ksig->info); + + if (err == 0) + err = setup_return(regs, ksig, &user, usig); + + /* + * We must not fail if setup_return() succeeded - see comment at the + * beginning of setup_return(). + */ + + if (err == 0) + set_handler_user_access_state(); + else + restore_user_access_state(&ua_state); return err; } @@ -1207,7 +1616,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -static void do_signal(struct pt_regs *regs) +void do_signal(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; @@ -1278,41 +1687,6 @@ static void do_signal(struct pt_regs *regs) restore_saved_sigmask(); } -void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) -{ - do { - if (thread_flags & _TIF_NEED_RESCHED) { - /* Unmask Debug and SError for the next task */ - local_daif_restore(DAIF_PROCCTX_NOIRQ); - - schedule(); - } else { - local_daif_restore(DAIF_PROCCTX); - - if (thread_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (thread_flags & _TIF_MTE_ASYNC_FAULT) { - clear_thread_flag(TIF_MTE_ASYNC_FAULT); - send_sig_fault(SIGSEGV, SEGV_MTEAERR, - (void __user *)NULL, current); - } - - if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - do_signal(regs); - - if (thread_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); - - if (thread_flags & _TIF_FOREIGN_FPSTATE) - fpsimd_restore_current_state(); - } - - local_daif_mask(); - thread_flags = read_thread_flags(); - } while (thread_flags & _TIF_WORK_MASK); -} - unsigned long __ro_after_init signal_minsigstksz; /* diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index bbd542704730..81e798b6dada 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -17,7 +17,7 @@ #include <asm/signal32.h> #include <asm/traps.h> #include <linux/uaccess.h> -#include <asm/unistd.h> +#include <asm/unistd_compat_32.h> #include <asm/vdso.h> struct compat_vfp_sigframe { @@ -451,7 +451,7 @@ int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, void compat_setup_restart_syscall(struct pt_regs *regs) { - regs->regs[7] = __NR_compat_restart_syscall; + regs->regs[7] = __NR_compat32_restart_syscall; } /* diff --git a/arch/arm64/kernel/sigreturn32.S b/arch/arm64/kernel/sigreturn32.S index ccbd4aab4ba4..6f486b95b413 100644 --- a/arch/arm64/kernel/sigreturn32.S +++ b/arch/arm64/kernel/sigreturn32.S @@ -13,7 +13,7 @@ * need two 16-bit instructions. */ -#include <asm/unistd.h> +#include <asm/unistd_compat_32.h> .section .rodata .globl __aarch32_sigret_code_start @@ -22,26 +22,26 @@ __aarch32_sigret_code_start: /* * ARM Code */ - .byte __NR_compat_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_sigreturn + .byte __NR_compat32_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat32_sigreturn + .byte __NR_compat32_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat32_sigreturn /* * Thumb code */ - .byte __NR_compat_sigreturn, 0x27 // svc #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0xdf // mov r7, #__NR_compat_sigreturn + .byte __NR_compat32_sigreturn, 0x27 // svc #__NR_compat32_sigreturn + .byte __NR_compat32_sigreturn, 0xdf // mov r7, #__NR_compat32_sigreturn /* * ARM code */ - .byte __NR_compat_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_rt_sigreturn + .byte __NR_compat32_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat32_rt_sigreturn + .byte __NR_compat32_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat32_rt_sigreturn /* * Thumb code */ - .byte __NR_compat_rt_sigreturn, 0x27 // svc #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0xdf // mov r7, #__NR_compat_rt_sigreturn + .byte __NR_compat32_rt_sigreturn, 0x27 // svc #__NR_compat32_rt_sigreturn + .byte __NR_compat32_rt_sigreturn, 0xdf // mov r7, #__NR_compat32_rt_sigreturn .globl __aarch32_sigret_code_end __aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 2aa5129d8253..f093cdf71be1 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -102,9 +102,6 @@ SYM_CODE_START(cpu_resume) mov x0, xzr bl init_kernel_el mov x19, x0 // preserve boot mode -#if VA_BITS > 48 - ldr_l x0, vabits_actual -#endif bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S index 487381164ff6..2def9d0dd3dd 100644 --- a/arch/arm64/kernel/smccc-call.S +++ b/arch/arm64/kernel/smccc-call.S @@ -7,48 +7,19 @@ #include <asm/asm-offsets.h> #include <asm/assembler.h> -#include <asm/thread_info.h> - -/* - * If we have SMCCC v1.3 and (as is likely) no SVE state in - * the registers then set the SMCCC hint bit to say there's no - * need to preserve it. Do this by directly adjusting the SMCCC - * function value which is already stored in x0 ready to be called. - */ -SYM_FUNC_START(__arm_smccc_sve_check) - - ldr_l x16, smccc_has_sve_hint - cbz x16, 2f - - get_current_task x16 - ldr x16, [x16, #TSK_TI_FLAGS] - tbnz x16, #TIF_FOREIGN_FPSTATE, 1f // Any live FP state? - tbnz x16, #TIF_SVE, 2f // Does that state include SVE? - -1: orr x0, x0, ARM_SMCCC_1_3_SVE_HINT - -2: ret -SYM_FUNC_END(__arm_smccc_sve_check) -EXPORT_SYMBOL(__arm_smccc_sve_check) .macro SMCCC instr - stp x29, x30, [sp, #-16]! - mov x29, sp -alternative_if ARM64_SVE - bl __arm_smccc_sve_check -alternative_else_nop_endif \instr #0 - ldr x4, [sp, #16] + ldr x4, [sp] stp x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS] stp x2, x3, [x4, #ARM_SMCCC_RES_X2_OFFS] - ldr x4, [sp, #24] + ldr x4, [sp, #8] cbz x4, 1f /* no quirk structure */ ldr x9, [x4, #ARM_SMCCC_QUIRK_ID_OFFS] cmp x9, #ARM_SMCCC_QUIRK_QCOM_A6 b.ne 1f str x6, [x4, ARM_SMCCC_QUIRK_STATE_OFFS] -1: ldp x29, x30, [sp], #16 - ret +1: ret .endm /* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 4ced34f62dab..3b3f6b56e733 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -55,9 +55,6 @@ #include <trace/events/ipi.h> -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); - /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -71,7 +68,7 @@ enum ipi_msg_type { IPI_RESCHEDULE, IPI_CALL_FUNC, IPI_CPU_STOP, - IPI_CPU_CRASH_STOP, + IPI_CPU_STOP_NMI, IPI_TIMER, IPI_IRQ_WORK, NR_IPI, @@ -88,6 +85,8 @@ static int ipi_irq_base __ro_after_init; static int nr_ipi __ro_after_init = NR_IPI; static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init; +static bool crash_stop; + static void ipi_setup(int cpu); #ifdef CONFIG_HOTPLUG_CPU @@ -132,7 +131,8 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) /* Now bring the CPU into our world */ ret = boot_secondary(cpu, idle); if (ret) { - pr_err("CPU%u: failed to boot: %d\n", cpu, ret); + if (ret != -EPERM) + pr_err("CPU%u: failed to boot: %d\n", cpu, ret); return ret; } @@ -264,6 +264,13 @@ asmlinkage notrace void secondary_start_kernel(void) set_cpu_online(cpu, true); complete(&cpu_running); + /* + * Secondary CPUs enter the kernel with all DAIF exceptions masked. + * + * As with setup_arch() we must unmask Debug and SError exceptions, and + * as the root irqchip has already been detected and initialized we can + * unmask IRQ and FIQ at the same time. + */ local_daif_restore(DAIF_PROCCTX); /* @@ -462,6 +469,8 @@ void __init smp_prepare_boot_cpu(void) init_gic_priority_masking(); kasan_init_hw_tags(); + /* Init percpu seeds for random tags after cpus are set up. */ + kasan_init_sw_tags(); } /* @@ -503,6 +512,59 @@ static int __init smp_cpu_setup(int cpu) static bool bootcpu_valid __initdata; static unsigned int cpu_count = 1; +int arch_register_cpu(int cpu) +{ + acpi_handle acpi_handle = acpi_get_processor_handle(cpu); + struct cpu *c = &per_cpu(cpu_devices, cpu); + + if (!acpi_disabled && !acpi_handle && + IS_ENABLED(CONFIG_ACPI_HOTPLUG_CPU)) + return -EPROBE_DEFER; + +#ifdef CONFIG_ACPI_HOTPLUG_CPU + /* For now block anything that looks like physical CPU Hotplug */ + if (invalid_logical_cpuid(cpu) || !cpu_present(cpu)) { + pr_err_once("Changing CPU present bit is not supported\n"); + return -ENODEV; + } +#endif + + /* + * Availability of the acpi handle is sufficient to establish + * that _STA has aleady been checked. No need to recheck here. + */ + c->hotpluggable = arch_cpu_is_hotpluggable(cpu); + + return register_cpu(c, cpu); +} + +#ifdef CONFIG_ACPI_HOTPLUG_CPU +void arch_unregister_cpu(int cpu) +{ + acpi_handle acpi_handle = acpi_get_processor_handle(cpu); + struct cpu *c = &per_cpu(cpu_devices, cpu); + acpi_status status; + unsigned long long sta; + + if (!acpi_handle) { + pr_err_once("Removing a CPU without associated ACPI handle\n"); + return; + } + + status = acpi_evaluate_integer(acpi_handle, "_STA", NULL, &sta); + if (ACPI_FAILURE(status)) + return; + + /* For now do not allow anything that looks like physical CPU HP */ + if (cpu_present(cpu) && !(sta & ACPI_STA_DEVICE_PRESENT)) { + pr_err_once("Changing CPU present bit is not supported\n"); + return; + } + + unregister_cpu(c); +} +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + #ifdef CONFIG_ACPI static struct acpi_madt_generic_interrupt cpu_madt_gicc[NR_CPUS]; @@ -523,7 +585,8 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) { u64 hwid = processor->arm_mpidr; - if (!acpi_gicc_is_usable(processor)) { + if (!(processor->flags & + (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE))) { pr_debug("skipping disabled CPU entry with 0x%llx MPIDR\n", hwid); return; } @@ -742,8 +805,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) */ for_each_possible_cpu(cpu) { - per_cpu(cpu_number, cpu) = cpu; - if (cpu == smp_processor_id()) continue; @@ -760,13 +821,15 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } } -static const char *ipi_types[NR_IPI] __tracepoint_string = { +static const char *ipi_types[MAX_IPI] __tracepoint_string = { [IPI_RESCHEDULE] = "Rescheduling interrupts", [IPI_CALL_FUNC] = "Function call interrupts", [IPI_CPU_STOP] = "CPU stop interrupts", - [IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts", + [IPI_CPU_STOP_NMI] = "CPU stop NMIs", [IPI_TIMER] = "Timer broadcast interrupts", [IPI_IRQ_WORK] = "IRQ work interrupts", + [IPI_CPU_BACKTRACE] = "CPU backtrace interrupts", + [IPI_KGDB_ROUNDUP] = "KGDB roundup interrupts", }; static void smp_cross_call(const struct cpumask *target, unsigned int ipinr); @@ -777,7 +840,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) { unsigned int cpu, i; - for (i = 0; i < NR_IPI; i++) { + for (i = 0; i < MAX_IPI; i++) { seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i, prec >= 4 ? " " : ""); for_each_online_cpu(cpu) @@ -806,9 +869,9 @@ void arch_irq_work_raise(void) } #endif -static void __noreturn local_cpu_stop(void) +static void __noreturn local_cpu_stop(unsigned int cpu) { - set_cpu_online(smp_processor_id(), false); + set_cpu_online(cpu, false); local_daif_mask(); sdei_mask_local_cpu(); @@ -822,21 +885,26 @@ static void __noreturn local_cpu_stop(void) */ void __noreturn panic_smp_self_stop(void) { - local_cpu_stop(); + local_cpu_stop(smp_processor_id()); } -#ifdef CONFIG_KEXEC_CORE -static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0); -#endif - static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) { #ifdef CONFIG_KEXEC_CORE + /* + * Use local_daif_mask() instead of local_irq_disable() to make sure + * that pseudo-NMIs are disabled. The "crash stop" code starts with + * an IRQ and falls back to NMI (which might be pseudo). If the IRQ + * finally goes through right as we're timing out then the NMI could + * interrupt us. It's better to prevent the NMI and let the IRQ + * finish since the pt_regs will be better. + */ + local_daif_mask(); + crash_save_cpu(regs, cpu); - atomic_dec(&waiting_for_crash_ipi); + set_cpu_online(cpu, false); - local_irq_disable(); sdei_mask_local_cpu(); if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -901,14 +969,12 @@ static void do_handle_IPI(int ipinr) break; case IPI_CPU_STOP: - local_cpu_stop(); - break; - - case IPI_CPU_CRASH_STOP: - if (IS_ENABLED(CONFIG_KEXEC_CORE)) { + case IPI_CPU_STOP_NMI: + if (IS_ENABLED(CONFIG_KEXEC_CORE) && crash_stop) { ipi_cpu_crash_stop(cpu, get_irq_regs()); - unreachable(); + } else { + local_cpu_stop(cpu); } break; @@ -963,8 +1029,7 @@ static bool ipi_should_be_nmi(enum ipi_msg_type ipi) return false; switch (ipi) { - case IPI_CPU_STOP: - case IPI_CPU_CRASH_STOP: + case IPI_CPU_STOP_NMI: case IPI_CPU_BACKTRACE: case IPI_KGDB_ROUNDUP: return true; @@ -1021,12 +1086,12 @@ void __init set_smp_ipi_range(int ipi_base, int n) if (ipi_should_be_nmi(i)) { err = request_percpu_nmi(ipi_base + i, ipi_handler, - "IPI", &cpu_number); + "IPI", &irq_stat); WARN(err, "Could not request IPI %d as NMI, err=%d\n", i, err); } else { err = request_percpu_irq(ipi_base + i, ipi_handler, - "IPI", &cpu_number); + "IPI", &irq_stat); WARN(err, "Could not request IPI %d as IRQ, err=%d\n", i, err); } @@ -1077,79 +1142,109 @@ static inline unsigned int num_other_online_cpus(void) void smp_send_stop(void) { + static unsigned long stop_in_progress; + cpumask_t mask; unsigned long timeout; - if (num_other_online_cpus()) { - cpumask_t mask; + /* + * If this cpu is the only one alive at this point in time, online or + * not, there are no stop messages to be sent around, so just back out. + */ + if (num_other_online_cpus() == 0) + goto skip_ipi; - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); + /* Only proceed if this is the first CPU to reach this code */ + if (test_and_set_bit(0, &stop_in_progress)) + return; - if (system_state <= SYSTEM_RUNNING) - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_STOP); - } + /* + * Send an IPI to all currently online CPUs except the CPU running + * this code. + * + * NOTE: we don't do anything here to prevent other CPUs from coming + * online after we snapshot `cpu_online_mask`. Ideally, the calling code + * should do something to prevent other CPUs from coming up. This code + * can be called in the panic path and thus it doesn't seem wise to + * grab the CPU hotplug mutex ourselves. Worst case: + * - If a CPU comes online as we're running, we'll likely notice it + * during the 1 second wait below and then we'll catch it when we try + * with an NMI (assuming NMIs are enabled) since we re-snapshot the + * mask before sending an NMI. + * - If we leave the function and see that CPUs are still online we'll + * at least print a warning. Especially without NMIs this function + * isn't foolproof anyway so calling code will just have to accept + * the fact that there could be cases where a CPU can't be stopped. + */ + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + if (system_state <= SYSTEM_RUNNING) + pr_crit("SMP: stopping secondary CPUs\n"); - /* Wait up to one second for other CPUs to stop */ + /* + * Start with a normal IPI and wait up to one second for other CPUs to + * stop. We do this first because it gives other processors a chance + * to exit critical sections / drop locks and makes the rest of the + * stop process (especially console flush) more robust. + */ + smp_cross_call(&mask, IPI_CPU_STOP); timeout = USEC_PER_SEC; while (num_other_online_cpus() && timeout--) udelay(1); - if (num_other_online_cpus()) + /* + * If CPUs are still online, try an NMI. There's no excuse for this to + * be slow, so we only give them an extra 10 ms to respond. + */ + if (num_other_online_cpus() && ipi_should_be_nmi(IPI_CPU_STOP_NMI)) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + pr_info("SMP: retry stop with NMI for CPUs %*pbl\n", + cpumask_pr_args(&mask)); + + smp_cross_call(&mask, IPI_CPU_STOP_NMI); + timeout = USEC_PER_MSEC * 10; + while (num_other_online_cpus() && timeout--) + udelay(1); + } + + if (num_other_online_cpus()) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(cpu_online_mask)); + cpumask_pr_args(&mask)); + } +skip_ipi: sdei_mask_local_cpu(); } #ifdef CONFIG_KEXEC_CORE void crash_smp_send_stop(void) { - static int cpus_stopped; - cpumask_t mask; - unsigned long timeout; - /* * This function can be called twice in panic path, but obviously * we execute this only once. + * + * We use this same boolean to tell whether the IPI we send was a + * stop or a "crash stop". */ - if (cpus_stopped) + if (crash_stop) return; + crash_stop = 1; - cpus_stopped = 1; - - /* - * If this cpu is the only one alive at this point in time, online or - * not, there are no stop messages to be sent around, so just back out. - */ - if (num_other_online_cpus() == 0) - goto skip_ipi; - - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); - - atomic_set(&waiting_for_crash_ipi, num_other_online_cpus()); + smp_send_stop(); - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_CRASH_STOP); - - /* Wait up to one second for other CPUs to stop */ - timeout = USEC_PER_SEC; - while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--) - udelay(1); - - if (atomic_read(&waiting_for_crash_ipi) > 0) - pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(&mask)); - -skip_ipi: - sdei_mask_local_cpu(); sdei_handler_abort(); } bool smp_crash_stop_failed(void) { - return (atomic_read(&waiting_for_crash_ipi) > 0); + return num_other_online_cpus() != 0; } #endif diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b2a60e0bcfd2..1d9d51d7627f 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/efi.h> #include <linux/export.h> +#include <linux/filter.h> #include <linux/ftrace.h> #include <linux/kprobes.h> #include <linux/sched.h> @@ -19,11 +20,28 @@ #include <asm/stack_pointer.h> #include <asm/stacktrace.h> +enum kunwind_source { + KUNWIND_SOURCE_UNKNOWN, + KUNWIND_SOURCE_FRAME, + KUNWIND_SOURCE_CALLER, + KUNWIND_SOURCE_TASK, + KUNWIND_SOURCE_REGS_PC, +}; + +union unwind_flags { + unsigned long all; + struct { + unsigned long fgraph : 1, + kretprobe : 1; + }; +}; + /* * Kernel unwind state * * @common: Common unwind state. * @task: The task being unwound. + * @graph_idx: Used by ftrace_graph_ret_addr() for optimized stack unwinding. * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance * associated with the most recently encountered replacement lr * value. @@ -31,9 +49,13 @@ struct kunwind_state { struct unwind_state common; struct task_struct *task; + int graph_idx; #ifdef CONFIG_KRETPROBES struct llist_node *kr_cur; #endif + enum kunwind_source source; + union unwind_flags flags; + struct pt_regs *regs; }; static __always_inline void @@ -42,6 +64,9 @@ kunwind_init(struct kunwind_state *state, { unwind_init_common(&state->common); state->task = task; + state->source = KUNWIND_SOURCE_UNKNOWN; + state->flags.all = 0; + state->regs = NULL; } /* @@ -57,8 +82,10 @@ kunwind_init_from_regs(struct kunwind_state *state, { kunwind_init(state, current); + state->regs = regs; state->common.fp = regs->regs[29]; state->common.pc = regs->pc; + state->source = KUNWIND_SOURCE_REGS_PC; } /* @@ -76,6 +103,7 @@ kunwind_init_from_caller(struct kunwind_state *state) state->common.fp = (unsigned long)__builtin_frame_address(1); state->common.pc = (unsigned long)__builtin_return_address(0); + state->source = KUNWIND_SOURCE_CALLER; } /* @@ -96,6 +124,7 @@ kunwind_init_from_task(struct kunwind_state *state, state->common.fp = thread_saved_fp(task); state->common.pc = thread_saved_pc(task); + state->source = KUNWIND_SOURCE_TASK; } static __always_inline int @@ -105,12 +134,15 @@ kunwind_recover_return_address(struct kunwind_state *state) if (state->task->ret_stack && (state->common.pc == (unsigned long)return_to_handler)) { unsigned long orig_pc; - orig_pc = ftrace_graph_ret_addr(state->task, NULL, + orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, state->common.pc, (void *)state->common.fp); - if (WARN_ON_ONCE(state->common.pc == orig_pc)) + if (state->common.pc == orig_pc) { + WARN_ON_ONCE(state->task == current); return -EINVAL; + } state->common.pc = orig_pc; + state->flags.fgraph = 1; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -121,12 +153,95 @@ kunwind_recover_return_address(struct kunwind_state *state) (void *)state->common.fp, &state->kr_cur); state->common.pc = orig_pc; + state->flags.kretprobe = 1; } #endif /* CONFIG_KRETPROBES */ return 0; } +static __always_inline +int kunwind_next_regs_pc(struct kunwind_state *state) +{ + struct stack_info *info; + unsigned long fp = state->common.fp; + struct pt_regs *regs; + + regs = container_of((u64 *)fp, struct pt_regs, stackframe.record.fp); + + info = unwind_find_stack(&state->common, (unsigned long)regs, sizeof(*regs)); + if (!info) + return -EINVAL; + + unwind_consume_stack(&state->common, info, (unsigned long)regs, + sizeof(*regs)); + + state->regs = regs; + state->common.pc = regs->pc; + state->common.fp = regs->regs[29]; + state->regs = NULL; + state->source = KUNWIND_SOURCE_REGS_PC; + return 0; +} + +static __always_inline int +kunwind_next_frame_record_meta(struct kunwind_state *state) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->common.fp; + struct frame_record_meta *meta; + struct stack_info *info; + + info = unwind_find_stack(&state->common, fp, sizeof(*meta)); + if (!info) + return -EINVAL; + + meta = (struct frame_record_meta *)fp; + switch (READ_ONCE(meta->type)) { + case FRAME_META_TYPE_FINAL: + if (meta == &task_pt_regs(tsk)->stackframe) + return -ENOENT; + WARN_ON_ONCE(tsk == current); + return -EINVAL; + case FRAME_META_TYPE_PT_REGS: + return kunwind_next_regs_pc(state); + default: + WARN_ON_ONCE(tsk == current); + return -EINVAL; + } +} + +static __always_inline int +kunwind_next_frame_record(struct kunwind_state *state) +{ + unsigned long fp = state->common.fp; + struct frame_record *record; + struct stack_info *info; + unsigned long new_fp, new_pc; + + if (fp & 0x7) + return -EINVAL; + + info = unwind_find_stack(&state->common, fp, sizeof(*record)); + if (!info) + return -EINVAL; + + record = (struct frame_record *)fp; + new_fp = READ_ONCE(record->fp); + new_pc = READ_ONCE(record->lr); + + if (!new_fp && !new_pc) + return kunwind_next_frame_record_meta(state); + + unwind_consume_stack(&state->common, info, fp, sizeof(*record)); + + state->common.fp = new_fp; + state->common.pc = new_pc; + state->source = KUNWIND_SOURCE_FRAME; + + return 0; +} + /* * Unwind from one frame record (A) to the next frame record (B). * @@ -137,15 +252,21 @@ kunwind_recover_return_address(struct kunwind_state *state) static __always_inline int kunwind_next(struct kunwind_state *state) { - struct task_struct *tsk = state->task; - unsigned long fp = state->common.fp; int err; - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) - return -ENOENT; + state->flags.all = 0; + + switch (state->source) { + case KUNWIND_SOURCE_FRAME: + case KUNWIND_SOURCE_CALLER: + case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_PC: + err = kunwind_next_frame_record(state); + break; + default: + err = -EINVAL; + } - err = unwind_next_frame_record(&state->common); if (err) return err; @@ -266,10 +387,57 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, kunwind_stack_walk(arch_kunwind_consume_entry, &data, task, regs); } -static bool dump_backtrace_entry(void *arg, unsigned long where) +struct bpf_unwind_consume_entry_data { + bool (*consume_entry)(void *cookie, u64 ip, u64 sp, u64 fp); + void *cookie; +}; + +static bool +arch_bpf_unwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + struct bpf_unwind_consume_entry_data *data = cookie; + + return data->consume_entry(data->cookie, state->common.pc, 0, + state->common.fp); +} + +noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u64 ip, u64 sp, + u64 fp), void *cookie) { + struct bpf_unwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); +} + +static const char *state_source_string(const struct kunwind_state *state) +{ + switch (state->source) { + case KUNWIND_SOURCE_FRAME: return NULL; + case KUNWIND_SOURCE_CALLER: return "C"; + case KUNWIND_SOURCE_TASK: return "T"; + case KUNWIND_SOURCE_REGS_PC: return "P"; + default: return "U"; + } +} + +static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) +{ + const char *source = state_source_string(state); + union unwind_flags flags = state->flags; + bool has_info = source || flags.all; char *loglvl = arg; - printk("%s %pSb\n", loglvl, (void *)where); + + printk("%s %pSb%s%s%s%s%s\n", loglvl, + (void *)state->common.pc, + has_info ? " (" : "", + source ? source : "", + flags.fgraph ? "F" : "", + flags.kretprobe ? "K" : "", + has_info ? ")" : ""); + return true; } @@ -288,7 +456,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, return; printk("%sCall trace:\n", loglvl); - arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); + kunwind_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); put_task_stack(tsk); } @@ -298,3 +466,123 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) dump_backtrace(NULL, tsk, loglvl); barrier(); } + +/* + * The struct defined for userspace stack frame in AARCH64 mode. + */ +struct frame_tail { + struct frame_tail __user *fp; + unsigned long lr; +} __attribute__((packed)); + +/* + * Get the return address for a single stackframe and return a pointer to the + * next frame tail. + */ +static struct frame_tail __user * +unwind_user_frame(struct frame_tail __user *tail, void *cookie, + stack_trace_consume_fn consume_entry) +{ + struct frame_tail buftail; + unsigned long err; + unsigned long lr; + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(tail, sizeof(buftail))) + return NULL; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); + pagefault_enable(); + + if (err) + return NULL; + + lr = ptrauth_strip_user_insn_pac(buftail.lr); + + if (!consume_entry(cookie, lr)) + return NULL; + + /* + * Frame pointers should strictly progress back up the stack + * (towards higher addresses). + */ + if (tail >= buftail.fp) + return NULL; + + return buftail.fp; +} + +#ifdef CONFIG_COMPAT +/* + * The registers we're interested in are at the end of the variable + * length saved register structure. The fp points at the end of this + * structure so the address of this struct is: + * (struct compat_frame_tail *)(xxx->fp)-1 + * + * This code has been adapted from the ARM OProfile support. + */ +struct compat_frame_tail { + compat_uptr_t fp; /* a (struct compat_frame_tail *) in compat mode */ + u32 sp; + u32 lr; +} __attribute__((packed)); + +static struct compat_frame_tail __user * +unwind_compat_user_frame(struct compat_frame_tail __user *tail, void *cookie, + stack_trace_consume_fn consume_entry) +{ + struct compat_frame_tail buftail; + unsigned long err; + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(tail, sizeof(buftail))) + return NULL; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); + pagefault_enable(); + + if (err) + return NULL; + + if (!consume_entry(cookie, buftail.lr)) + return NULL; + + /* + * Frame pointers should strictly progress back up the stack + * (towards higher addresses). + */ + if (tail + 1 >= (struct compat_frame_tail __user *) + compat_ptr(buftail.fp)) + return NULL; + + return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; +} +#endif /* CONFIG_COMPAT */ + + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + if (!consume_entry(cookie, regs->pc)) + return; + + if (!compat_user_mode(regs)) { + /* AARCH64 mode */ + struct frame_tail __user *tail; + + tail = (struct frame_tail __user *)regs->regs[29]; + while (tail && !((unsigned long)tail & 0x7)) + tail = unwind_user_frame(tail, cookie, consume_entry); + } else { +#ifdef CONFIG_COMPAT + /* AARCH32 compat mode */ + struct compat_frame_tail __user *tail; + + tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; + while (tail && !((unsigned long)tail & 0x3)) + tail = unwind_compat_user_frame(tail, cookie, consume_entry); +#endif + } +} diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c index d5ffaaab31a7..f08408b6e826 100644 --- a/arch/arm64/kernel/sys.c +++ b/arch/arm64/kernel/sys.c @@ -48,14 +48,16 @@ asmlinkage long __arm64_sys_ni_syscall(const struct pt_regs *__unused) */ #define __arm64_sys_personality __arm64_sys_arm64_personality +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) + #undef __SYSCALL #define __SYSCALL(nr, sym) asmlinkage long __arm64_##sym(const struct pt_regs *); -#include <asm/unistd.h> +#include <asm/syscall_table_64.h> #undef __SYSCALL #define __SYSCALL(nr, sym) [nr] = __arm64_##sym, const syscall_fn_t sys_call_table[__NR_syscalls] = { [0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall, -#include <asm/unistd.h> +#include <asm/syscall_table_64.h> }; diff --git a/arch/arm64/kernel/sys32.c b/arch/arm64/kernel/sys32.c index fc40386afb1b..96bcfb907443 100644 --- a/arch/arm64/kernel/sys32.c +++ b/arch/arm64/kernel/sys32.c @@ -5,17 +5,12 @@ * Copyright (C) 2015 ARM Ltd. */ -/* - * Needed to avoid conflicting __NR_* macros between uapi/asm/unistd.h and - * asm/unistd32.h. - */ -#define __COMPAT_SYSCALL_NR - #include <linux/compat.h> #include <linux/compiler.h> #include <linux/syscalls.h> #include <asm/syscall.h> +#include <asm/unistd_compat_32.h> asmlinkage long compat_sys_sigreturn(void); asmlinkage long compat_sys_rt_sigreturn(void); @@ -122,14 +117,16 @@ COMPAT_SYSCALL_DEFINE6(aarch32_fallocate, int, fd, int, mode, return ksys_fallocate(fd, mode, arg_u64(offset), arg_u64(len)); } +#define __SYSCALL_WITH_COMPAT(nr, sym, compat) __SYSCALL(nr, compat) + #undef __SYSCALL #define __SYSCALL(nr, sym) asmlinkage long __arm64_##sym(const struct pt_regs *); -#include <asm/unistd32.h> +#include <asm/syscall_table_32.h> #undef __SYSCALL #define __SYSCALL(nr, sym) [nr] = __arm64_##sym, -const syscall_fn_t compat_sys_call_table[__NR_compat_syscalls] = { - [0 ... __NR_compat_syscalls - 1] = __arm64_sys_ni_syscall, -#include <asm/unistd32.h> +const syscall_fn_t compat_sys_call_table[__NR_compat32_syscalls] = { + [0 ... __NR_compat32_syscalls - 1] = __arm64_sys_ni_syscall, +#include <asm/syscall_table_32.h> }; diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 9a70d9746b66..c442fcec6b9e 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -14,20 +14,18 @@ #include <asm/syscall.h> #include <asm/thread_info.h> #include <asm/unistd.h> +#include <asm/unistd_compat_32.h> long compat_arm_syscall(struct pt_regs *regs, int scno); long sys_ni_syscall(void); static long do_ni_syscall(struct pt_regs *regs, int scno) { -#ifdef CONFIG_COMPAT - long ret; if (is_compat_task()) { - ret = compat_arm_syscall(regs, scno); + long ret = compat_arm_syscall(regs, scno); if (ret != -ENOSYS) return ret; } -#endif return sys_ni_syscall(); } @@ -56,17 +54,15 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, syscall_set_return_value(current, regs, 0, ret); /* - * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), - * but not enough for arm64 stack utilization comfort. To keep - * reasonable stack head room, reduce the maximum offset to 9 bits. - * - * The actual entropy will be further reduced by the compiler when - * applying stack alignment constraints: the AAPCS mandates a - * 16-byte (i.e. 4-bit) aligned SP at function boundaries. + * This value will get limited by KSTACK_OFFSET_MAX(), which is 10 + * bits. The actual entropy will be further reduced by the compiler + * when applying stack alignment constraints: the AAPCS mandates a + * 16-byte aligned SP at function boundaries, which will remove the + * 4 low bits from any entropy chosen here. * - * The resulting 5 bits of entropy is seen in SP[8:4]. + * The resulting 6 bits of entropy is seen in SP[9:4]. */ - choose_random_kstack_offset(get_random_u16() & 0x1FF); + choose_random_kstack_offset(get_random_u16()); } static inline bool has_syscall_work(unsigned long flags) @@ -158,7 +154,7 @@ void do_el0_svc(struct pt_regs *regs) #ifdef CONFIG_COMPAT void do_el0_svc_compat(struct pt_regs *regs) { - el0_svc_common(regs, regs->regs[7], __NR_compat_syscalls, + el0_svc_common(regs, regs->regs[7], __NR_compat32_syscalls, compat_sys_call_table); } #endif diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 1a2c72f3e7f8..cb180684d10d 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -194,12 +194,19 @@ static void amu_fie_setup(const struct cpumask *cpus) int cpu; /* We are already set since the last insmod of cpufreq driver */ - if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + if (cpumask_available(amu_fie_cpus) && + unlikely(cpumask_subset(cpus, amu_fie_cpus))) return; - for_each_cpu(cpu, cpus) { + for_each_cpu(cpu, cpus) if (!freq_counters_valid(cpu)) return; + + if (!cpumask_available(amu_fie_cpus) && + !zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { + WARN_ONCE(1, "Failed to allocate FIE cpumask for CPUs[%*pbl]\n", + cpumask_pr_args(cpus)); + return; } cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); @@ -237,17 +244,8 @@ static struct notifier_block init_amu_fie_notifier = { static int __init init_amu_fie(void) { - int ret; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) - return -ENOMEM; - - ret = cpufreq_register_notifier(&init_amu_fie_notifier, + return cpufreq_register_notifier(&init_amu_fie_notifier, CPUFREQ_POLICY_NOTIFIER); - if (ret) - free_cpumask_var(amu_fie_cpus); - - return ret; } core_initcall(init_amu_fie); diff --git a/arch/arm64/kernel/trace-events-emulation.h b/arch/arm64/kernel/trace-events-emulation.h index 6c40f58b844a..c51b547b583e 100644 --- a/arch/arm64/kernel/trace-events-emulation.h +++ b/arch/arm64/kernel/trace-events-emulation.h @@ -18,7 +18,7 @@ TRACE_EVENT(instruction_emulation, ), TP_fast_assign( - __assign_str(instr, instr); + __assign_str(instr); __entry->addr = addr; ), diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 215e6d7f2df8..4e26bd356a48 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -41,7 +41,7 @@ #include <asm/extable.h> #include <asm/insn.h> #include <asm/kprobes.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/traps.h> #include <asm/smp.h> #include <asm/stack_pointer.h> @@ -273,6 +273,12 @@ void arm64_force_sig_fault(int signo, int code, unsigned long far, force_sig_fault(signo, code, (void __user *)far); } +void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey) +{ + arm64_show_signal(SIGSEGV, str); + force_sig_pkuerr((void __user *)far, pkey); +} + void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { @@ -500,6 +506,16 @@ void do_el1_bti(struct pt_regs *regs, unsigned long esr) die("Oops - BTI", regs, esr); } +void do_el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0); +} + +void do_el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + die("Oops - GCS", regs, esr); +} + void do_el0_fpac(struct pt_regs *regs, unsigned long esr) { force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); @@ -525,6 +541,13 @@ void do_el0_mops(struct pt_regs *regs, unsigned long esr) user_fastforward_single_step(current); } +void do_el1_mops(struct pt_regs *regs, unsigned long esr) +{ + arm64_mops_reset_regs(®s->user_regs, esr); + + kernel_fastforward_single_step(regs); +} + #define __user_cache_maint(insn, address, res) \ if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ @@ -601,18 +624,26 @@ static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_read_counter()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_read_counter()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_get_rate()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_get_rate()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } static void mrs_handler(unsigned long esr, struct pt_regs *regs) @@ -838,6 +869,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_MOPS] = "MOPS", [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)", [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)", + [ESR_ELx_EC_GCS] = "Guarded Control Stack", [ESR_ELx_EC_SERROR] = "SError", [ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)", [ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)", @@ -1105,8 +1137,6 @@ static struct break_hook ubsan_break_hook = { }; #endif -#define esr_comment(esr) ((esr) & ESR_ELx_BRK64_ISS_COMMENT_MASK) - /* * Initial handler for AArch64 BRK exceptions * This handler only used until debug_traps_init(). @@ -1115,15 +1145,15 @@ int __init early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs) { #ifdef CONFIG_CFI_CLANG - if ((esr_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE) + if (esr_is_cfi_brk(esr)) return cfi_handler(regs, esr) != DBG_HOOK_HANDLED; #endif #ifdef CONFIG_KASAN_SW_TAGS - if ((esr_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) + if ((esr_brk_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif #ifdef CONFIG_UBSAN_TRAP - if ((esr_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM) + if ((esr_brk_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM) return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif return bug_handler(regs, esr) != DBG_HOOK_HANDLED; diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 5562daf38a22..e8ed8e5b713b 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -19,7 +19,6 @@ #include <linux/signal.h> #include <linux/slab.h> #include <linux/time_namespace.h> -#include <linux/timekeeper_internal.h> #include <linux/vmalloc.h> #include <vdso/datapage.h> #include <vdso/helpers.h> @@ -34,19 +33,11 @@ enum vdso_abi { VDSO_ABI_AA32, }; -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - struct vdso_abi_info { const char *name; const char *vdso_code_start; const char *vdso_code_end; unsigned long vdso_pages; - /* Data Mapping */ - struct vm_special_mapping *dm; /* Code Mapping */ struct vm_special_mapping *cm; }; @@ -69,10 +60,7 @@ static struct vdso_abi_info vdso_info[] __ro_after_init = { /* * The vDSO data page. */ -static union { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; +static union vdso_data_store vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; static int vdso_mremap(const struct vm_special_mapping *sm, @@ -122,6 +110,8 @@ struct vdso_data *arch_get_vdso_data(void *vvar_page) return (struct vdso_data *)(vvar_page); } +static const struct vm_special_mapping vvar_map; + /* * The vvar mapping contains data for a specific time namespace, so when a task * changes namespace we must unmap its vvar data for the old namespace. @@ -138,12 +128,8 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) + if (vma_is_special_mapping(vma, &vvar_map)) zap_vma_pages(vma); -#ifdef CONFIG_COMPAT_VDSO - if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm)) - zap_vma_pages(vma); -#endif } mmap_read_unlock(mm); @@ -185,6 +171,11 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, return vmf_insert_pfn(vma, vmf->address, pfn); } +static const struct vm_special_mapping vvar_map = { + .name = "[vvar]", + .fault = vvar_fault, +}; + static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -208,7 +199,7 @@ static int __setup_additional_pages(enum vdso_abi abi, ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_PFNMAP, - vdso_info[abi].dm); + &vvar_map); if (IS_ERR(ret)) goto up_fail; @@ -238,7 +229,6 @@ up_fail: enum aarch32_map { AA32_MAP_VECTORS, /* kuser helpers */ AA32_MAP_SIGPAGE, - AA32_MAP_VVAR, AA32_MAP_VDSO, }; @@ -263,10 +253,6 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { .pages = &aarch32_sig_page, .mremap = aarch32_sigpage_mremap, }, - [AA32_MAP_VVAR] = { - .name = "[vvar]", - .fault = vvar_fault, - }, [AA32_MAP_VDSO] = { .name = "[vdso]", .mremap = vdso_mremap, @@ -316,7 +302,6 @@ static int __init __aarch32_alloc_vdso_pages(void) if (!IS_ENABLED(CONFIG_COMPAT_VDSO)) return 0; - vdso_info[VDSO_ABI_AA32].dm = &aarch32_vdso_maps[AA32_MAP_VVAR]; vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO]; return __vdso_init(VDSO_ABI_AA32); @@ -411,26 +396,14 @@ out: } #endif /* CONFIG_COMPAT */ -enum aarch64_map { - AA64_MAP_VVAR, - AA64_MAP_VDSO, -}; - -static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { - [AA64_MAP_VVAR] = { - .name = "[vvar]", - .fault = vvar_fault, - }, - [AA64_MAP_VDSO] = { - .name = "[vdso]", - .mremap = vdso_mremap, - }, +static struct vm_special_mapping aarch64_vdso_map __ro_after_init = { + .name = "[vdso]", + .mremap = vdso_mremap, }; static int __init vdso_init(void) { - vdso_info[VDSO_ABI_AA64].dm = &aarch64_vdso_maps[AA64_MAP_VVAR]; - vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_maps[AA64_MAP_VDSO]; + vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_map; return __vdso_init(VDSO_ABI_AA64); } diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 8818287f1095..35685c036044 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -9,7 +9,7 @@ # Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile -obj-vdso := vgettimeofday.o note.o sigreturn.o +obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg @@ -21,7 +21,7 @@ btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti # potential future proofing if we end up with internal calls to the exported # routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so # preparation in build-time C")). -ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \ +ldflags-y := -shared -soname=linux-vdso.so.1 \ -Bsymbolic --build-id=sha1 -n $(btildflags-y) ifdef CONFIG_LD_ORPHAN_WARN @@ -34,26 +34,27 @@ ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO # -Wmissing-prototypes and -Wmissing-declarations are removed from -# the CFLAGS of vgettimeofday.c to make possible to build the -# kernel with CONFIG_WERROR enabled. -CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ - $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \ - $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ - -Wmissing-prototypes -Wmissing-declarations -KASAN_SANITIZE := n -KCSAN_SANITIZE := n -UBSAN_SANITIZE := n -OBJECT_FILES_NON_STANDARD := y -KCOV_INSTRUMENT := n - -CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny -fasynchronous-unwind-tables +# the CFLAGS to make possible to build the kernel with CONFIG_WERROR enabled. +CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ + $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \ + $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ + -Wmissing-prototypes -Wmissing-declarations + +CC_FLAGS_ADD_VDSO := -O2 -mcmodel=tiny -fasynchronous-unwind-tables + +CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_REMOVE_VDSO) +CFLAGS_REMOVE_vgetrandom.o = $(CC_FLAGS_REMOVE_VDSO) + +CFLAGS_vgettimeofday.o = $(CC_FLAGS_ADD_VDSO) +CFLAGS_vgetrandom.o = $(CC_FLAGS_ADD_VDSO) ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y) endif -# Disable gcov profiling for VDSO code -GCOV_PROFILE := n +ifneq ($(c-getrandom-y),) + CFLAGS_vgetrandom.o += -include $(c-getrandom-y) +endif targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) @@ -68,7 +69,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) # Generate VDSO offsets using helper script -gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +gen-vdsosym := $(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 45354f2ddf70..47ad6944f9f0 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -11,7 +11,9 @@ #include <linux/const.h> #include <asm/page.h> #include <asm/vdso.h> +#include <asm/vdso/vsyscall.h> #include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") OUTPUT_ARCH(aarch64) @@ -19,10 +21,11 @@ OUTPUT_ARCH(aarch64) SECTIONS { PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); + PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); #ifdef CONFIG_TIME_NS PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); #endif - . = VDSO_LBASE + SIZEOF_HEADERS; + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -38,6 +41,7 @@ SECTIONS */ /DISCARD/ : { *(.note.GNU-stack .note.gnu.property) + *(.ARM.attributes) } .note : { *(.note.*) } :text :note @@ -102,6 +106,7 @@ VERSION __kernel_gettimeofday; __kernel_clock_gettime; __kernel_clock_getres; + __kernel_getrandom; local: *; }; } diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..67890b445309 --- /dev/null +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/linkage.h> +#include <asm/cache.h> +#include <asm/assembler.h> + + .text + +#define state0 v0 +#define state1 v1 +#define state2 v2 +#define state3 v3 +#define copy0 v4 +#define copy0_q q4 +#define copy1 v5 +#define copy2 v6 +#define copy3 v7 +#define copy3_d d7 +#define one_d d16 +#define one_q q16 +#define one_v v16 +#define tmp v17 +#define rot8 v18 + +/* + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive + * number of blocks of output with nonce 0, taking an input key and 8-bytes + * counter. Importantly does not spill to the stack. + * + * This implementation avoids d8-d15 because they are callee-save in user + * space. + * + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, + * const uint8_t *key, + * uint32_t *counter, + * size_t nblocks) + * + * x0: output bytes + * x1: 32-byte key input + * x2: 8-byte counter input/output + * x3: number of 64-byte block to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + + /* copy0 = "expand 32-byte k" */ + mov_q x8, 0x3320646e61707865 + mov_q x9, 0x6b20657479622d32 + mov copy0.d[0], x8 + mov copy0.d[1], x9 + + /* copy1,copy2 = key */ + ld1 { copy1.4s, copy2.4s }, [x1] + /* copy3 = counter || zero nonce */ + ld1 { copy3.2s }, [x2] + + movi one_v.2s, #1 + uzp1 one_v.4s, one_v.4s, one_v.4s + +.Lblock: + /* copy state to auxiliary vectors for the final add after the permute. */ + mov state0.16b, copy0.16b + mov state1.16b, copy1.16b + mov state2.16b, copy2.16b + mov state3.16b, copy3.16b + + mov w4, 20 +.Lpermute: + /* + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers state0-state3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + */ + +.Ldoubleround: + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + add state0.4s, state0.4s, state1.4s + eor state3.16b, state3.16b, state0.16b + rev32 state3.8h, state3.8h + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #12 + sri state1.4s, tmp.4s, #20 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + add state0.4s, state0.4s, state1.4s + eor tmp.16b, state3.16b, state0.16b + shl state3.4s, tmp.4s, #8 + sri state3.4s, tmp.4s, #24 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #7 + sri state1.4s, tmp.4s, #25 + + /* state1[0,1,2,3] = state1[1,2,3,0] */ + ext state1.16b, state1.16b, state1.16b, #4 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + ext state2.16b, state2.16b, state2.16b, #8 + /* state3[0,1,2,3] = state3[1,2,3,0] */ + ext state3.16b, state3.16b, state3.16b, #12 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + add state0.4s, state0.4s, state1.4s + eor state3.16b, state3.16b, state0.16b + rev32 state3.8h, state3.8h + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #12 + sri state1.4s, tmp.4s, #20 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + add state0.4s, state0.4s, state1.4s + eor tmp.16b, state3.16b, state0.16b + shl state3.4s, tmp.4s, #8 + sri state3.4s, tmp.4s, #24 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #7 + sri state1.4s, tmp.4s, #25 + + /* state1[0,1,2,3] = state1[3,0,1,2] */ + ext state1.16b, state1.16b, state1.16b, #12 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + ext state2.16b, state2.16b, state2.16b, #8 + /* state3[0,1,2,3] = state3[1,2,3,0] */ + ext state3.16b, state3.16b, state3.16b, #4 + + subs w4, w4, #2 + b.ne .Ldoubleround + + /* output0 = state0 + state0 */ + add state0.4s, state0.4s, copy0.4s + /* output1 = state1 + state1 */ + add state1.4s, state1.4s, copy1.4s + /* output2 = state2 + state2 */ + add state2.4s, state2.4s, copy2.4s + /* output2 = state3 + state3 */ + add state3.4s, state3.4s, copy3.4s + st1 { state0.16b - state3.16b }, [x0] + + /* + * ++copy3.counter, the 'add' clears the upper half of the SIMD register + * which is the expected behaviour here. + */ + add copy3_d, copy3_d, one_d + + /* output += 64, --nblocks */ + add x0, x0, 64 + subs x3, x3, #1 + b.ne .Lblock + + /* counter = copy3.counter */ + st1 { copy3.2s }, [x2] + + /* Zero out the potentially sensitive regs, in case nothing uses these again. */ + movi state0.16b, #0 + movi state1.16b, #0 + movi state2.16b, #0 + movi state3.16b, #0 + movi copy1.16b, #0 + movi copy2.16b, #0 + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/vgetrandom.c b/arch/arm64/kernel/vdso/vgetrandom.c new file mode 100644 index 000000000000..832fe195292b --- /dev/null +++ b/arch/arm64/kernel/vdso/vgetrandom.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <uapi/asm-generic/errno.h> + +typeof(__cvdso_getrandom) __kernel_getrandom; + +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + if (alternative_has_cap_likely(ARM64_HAS_FPSIMD)) + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); + + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) + return -ENOSYS; + return getrandom_syscall(buffer, len, flags); +} diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index f5f80fdce0fe..25a2cb6317f3 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -98,7 +98,7 @@ VDSO_AFLAGS += -D__ASSEMBLY__ # From arm vDSO Makefile VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1 VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096 -VDSO_LDFLAGS += -shared --hash-style=sysv --build-id=sha1 +VDSO_LDFLAGS += -shared --build-id=sha1 VDSO_LDFLAGS += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) @@ -136,7 +136,7 @@ $(obj)/vdso32.so.dbg: $(obj)/vdso.so.raw $(obj)/$(munge) FORCE $(call if_changed,vdsomunge) # Link rule for the .so file, .lds has to be first -$(obj)/vdso.so.raw: $(src)/vdso.lds $(obj-vdso) FORCE +$(obj)/vdso.so.raw: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold_and_vdso_check) # Compilation rules for the vDSO sources diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S index 8d95d7d35057..732702a187e9 100644 --- a/arch/arm64/kernel/vdso32/vdso.lds.S +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -22,7 +22,7 @@ SECTIONS #ifdef CONFIG_TIME_NS PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE); #endif - . = VDSO_LBASE + SIZEOF_HEADERS; + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/vmcore_info.c index 66cde752cd74..b19d5d6cb8b3 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/vmcore_info.c @@ -4,7 +4,7 @@ * Copyright (C) Huawei Futurewei Technologies. */ -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> #include <asm/cpufeature.h> #include <asm/memory.h> #include <asm/pgtable-hwdef.h> @@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void) /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 3cd7e76cc562..e73326bd3ff7 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -126,9 +126,9 @@ jiffies = jiffies_64; #ifdef CONFIG_UNWIND_TABLES #define UNWIND_DATA_SECTIONS \ .eh_frame : { \ - __eh_frame_start = .; \ + __pi___eh_frame_start = .; \ *(.eh_frame) \ - __eh_frame_end = .; \ + __pi___eh_frame_end = .; \ } #else #define UNWIND_DATA_SECTIONS @@ -162,6 +162,7 @@ SECTIONS /DISCARD/ : { *(.interp .dynamic) *(.dynsym .dynstr .hash .gnu.hash) + *(.ARM.attributes) } . = KIMAGE_VADDR; @@ -264,27 +265,32 @@ SECTIONS EXIT_DATA } + RUNTIME_CONST_VARIABLES + PERCPU_SECTION(L1_CACHE_BYTES) HYPERVISOR_PERCPU_SECTION HYPERVISOR_RELOC_SECTION .rela.dyn : ALIGN(8) { - __rela_start = .; + __pi_rela_start = .; *(.rela .rela*) - __rela_end = .; + __pi_rela_end = .; } .relr.dyn : ALIGN(8) { - __relr_start = .; + __pi_relr_start = .; *(.relr.dyn) - __relr_end = .; + __pi_relr_end = .; } . = ALIGN(SEGMENT_ALIGN); __initdata_end = .; __init_end = .; + .data.rel.ro : { *(.data.rel.ro) } + ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") + _data = .; _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) @@ -311,12 +317,17 @@ SECTIONS __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin); _edata = .; + /* start of zero-init region */ BSS_SECTION(SBSS_ALIGN, 0, 0) . = ALIGN(PAGE_SIZE); init_pg_dir = .; . += INIT_DIR_SIZE; init_pg_end = .; + /* end of zero-init region */ + + . += SZ_4K; /* stack for the early C runtime */ + early_init_stack = .; . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); @@ -336,9 +347,6 @@ SECTIONS *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) } ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") - - .data.rel.ro : { *(.data.rel.ro) } - ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") } #include "image-vars.h" |