Diffstat (limited to 'arch/arm64/kernel')
129 files changed, 21576 insertions, 12839 deletions
diff --git a/arch/arm64/kernel/.gitignore b/arch/arm64/kernel/.gitignore index c5f676c3c224..bbb90f92d051 100644 --- a/arch/arm64/kernel/.gitignore +++ b/arch/arm64/kernel/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vmlinux.lds diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index cd434d0719c1..76f32e424065 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -3,44 +3,52 @@ # Makefile for the linux kernel. # -CPPFLAGS_vmlinux.lds := -DTEXT_OFFSET=$(TEXT_OFFSET) -AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) CFLAGS_armv8_deprecated.o := -I$(src) -CFLAGS_REMOVE_ftrace.o = -pg -CFLAGS_REMOVE_insn.o = -pg -CFLAGS_REMOVE_return_address.o = -pg +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_insn.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) + +# Remove stack protector to avoid triggering unneeded stack canary +# checks due to randomize_kstack_offset. +CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong +CFLAGS_syscall.o += -fno-stack-protector + +# When KASAN is enabled, a stack trace is recorded for every alloc/free, which +# can significantly impact performance. Avoid instrumenting the stack trace +# collection code to minimize this impact. +KASAN_SANITIZE_stacktrace.o := n + +# It's not safe to invoke KCOV when portions of the kernel environment aren't +# available or are out-of-sync with HW state. Since `noinstr` doesn't always +# inhibit KCOV instrumentation, disable it for the entire compilation unit. +KCOV_INSTRUMENT_entry-common.o := n +KCOV_INSTRUMENT_idle.o := n # Object file lists. obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ - entry-fpsimd.o process.o ptrace.o setup.o signal.o \ - sys.o stacktrace.o time.o traps.o io.o vdso.o \ - hyp-stub.o psci.o cpu_ops.o insn.o \ + entry-common.o entry-fpsimd.o process.o ptrace.o \ + setup.o signal.o sys.o stacktrace.o time.o traps.o \ + io.o vdso.o hyp-stub.o psci.o cpu_ops.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o - -extra-$(CONFIG_EFI) := efi-entry.o + syscall.o proton-pack.o idle.o patching.o pi/ \ + rsi.o jump_label.o -OBJCOPYFLAGS := --prefix-symbols=__efistub_ -$(obj)/%.stub.o: $(obj)/%.o FORCE - $(call if_changed,objcopy) - -obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ +obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o +obj-$(CONFIG_COMPAT) += sigreturn32.o +obj-$(CONFIG_COMPAT_ALIGNMENT_FIXUPS) += compat_alignment.o +obj-$(CONFIG_KUSER_HELPERS) += kuser32.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o -obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o +obj-$(CONFIG_MODULES) += module.o module-plts.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o -obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_CPU_PM) += sleep.o suspend.o -obj-$(CONFIG_CPU_IDLE) += cpuidle.o -obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o \ - efi-rt-wrapper.o +obj-$(CONFIG_EFI) += efi.o efi-rt-wrapper.o obj-$(CONFIG_PCI) += pci.o obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o obj-$(CONFIG_ACPI) += acpi.o @@ -49,21 +57,31 @@ obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o 
obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o +obj-$(CONFIG_ELF_CORE) += elfcore.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ cpu-reset.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o -obj-$(CONFIG_ARM64_SSBD) += ssbd.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_ARM64_MTE) += mte.o +obj-y += vdso-wrap.o +obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o + +# Force dependency (vdso*-wrap.S includes vdso.so through incbin) +$(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so +$(obj)/vdso32-wrap.o: $(obj)/vdso32/vdso.so -obj-y += vdso/ probes/ -head-y := head.o -extra-y += $(head-y) vmlinux.lds +obj-y += probes/ +obj-y += head.o +always-$(KBUILD_BUILTIN) += vmlinux.lds ifeq ($(CONFIG_DEBUG_EFI),y) -AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\"" +AFLAGS_head.o += -DVMLINUX_PATH="\"$(abspath vmlinux)\"" endif + +# for cleaning +subdir- += vdso vdso32 diff --git a/arch/arm64/kernel/Makefile.syscalls b/arch/arm64/kernel/Makefile.syscalls new file mode 100644 index 000000000000..0542a718871a --- /dev/null +++ b/arch/arm64/kernel/Makefile.syscalls @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 + +syscall_abis_32 += +syscall_abis_64 += renameat rlimit memfd_secret + +syscalltbl = arch/arm64/tools/syscall_%.tbl diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 44e3c351e1ea..af90128cfed5 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * ARM64 Specific Low-Level ACPI Boot Support * @@ -7,29 +8,32 @@ * Author: Hanjun Guo <hanjun.guo@linaro.org> * Author: Tomasz Nowicki <tomasz.nowicki@linaro.org> * Author: Naresh Bhat <naresh.bhat@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/acpi.h> +#include <linux/arm-smccc.h> #include <linux/cpumask.h> #include <linux/efi.h> #include <linux/efi-bgrt.h> #include <linux/init.h> #include <linux/irq.h> #include <linux/irqdomain.h> +#include <linux/irq_work.h> #include <linux/memblock.h> #include <linux/of_fdt.h> +#include <linux/libfdt.h> #include <linux/smp.h> #include <linux/serial_core.h> +#include <linux/suspend.h> +#include <linux/pgtable.h> +#include <acpi/ghes.h> +#include <acpi/processor.h> #include <asm/cputype.h> #include <asm/cpu_ops.h> -#include <asm/pgtable.h> +#include <asm/daifflags.h> #include <asm/smp_plat.h> int acpi_noirq = 1; /* skip ACPI IRQ initialization */ @@ -42,6 +46,7 @@ EXPORT_SYMBOL(acpi_pci_disabled); static bool param_acpi_off __initdata; static bool param_acpi_on __initdata; static bool param_acpi_force __initdata; +static bool param_acpi_nospcr __initdata; static int __init parse_acpi(char *arg) { @@ -55,6 +60,8 @@ static int __init parse_acpi(char *arg) param_acpi_on = true; else if (strcmp(arg, "force") == 0) /* force ACPI to be enabled */ param_acpi_force = true; + else if (strcmp(arg, "nospcr") == 0) /* disable SPCR as default console */ + param_acpi_nospcr = true; else return -EINVAL; /* Core will print when we return error */ @@ -62,29 +69,22 @@ static int __init parse_acpi(char *arg) } early_param("acpi", parse_acpi); -static int __init dt_scan_depth1_nodes(unsigned long node, - const char *uname, int depth, - void *data) +static bool __init dt_is_stub(void) { - /* - * Ignore anything not directly under the root node; we'll - * catch its parent instead. - */ - if (depth != 1) - return 0; + int node; - if (strcmp(uname, "chosen") == 0) - return 0; + fdt_for_each_subnode(node, initial_boot_params, 0) { + const char *name = fdt_get_name(initial_boot_params, node, NULL); + if (strcmp(name, "chosen") == 0) + continue; + if (strcmp(name, "hypervisor") == 0 && + of_flat_dt_is_compatible(node, "xen,xen")) + continue; - if (strcmp(uname, "hypervisor") == 0 && - of_flat_dt_is_compatible(node, "xen,xen")) - return 0; + return false; + } - /* - * This node at depth 1 is neither a chosen node nor a xen node, - * which we do not expect. 
- */ - return 1; + return true; } /* @@ -133,7 +133,7 @@ static int __init acpi_fadt_sanity_check(void) /* * FADT is required on arm64; retrieve it to check its presence - * and carry out revision and ACPI HW reduced compliancy tests + * and carry out revision and ACPI HW reduced compliance tests */ status = acpi_get_table(ACPI_SIG_FADT, 0, &table); if (ACPI_FAILURE(status)) { @@ -153,10 +153,14 @@ static int __init acpi_fadt_sanity_check(void) */ if (table->revision < 5 || (table->revision == 5 && fadt->minor_revision < 1)) { - pr_err("Unsupported FADT revision %d.%d, should be 5.1+\n", + pr_err(FW_BUG "Unsupported FADT revision %d.%d, should be 5.1+\n", table->revision, fadt->minor_revision); - ret = -EINVAL; - goto out; + + if (!fadt->arm_boot_flags) { + ret = -EINVAL; + goto out; + } + pr_err("FADT has ARM boot flags set, assuming 5.1\n"); } if (!(fadt->flags & ACPI_FADT_HW_REDUCED)) { @@ -201,8 +205,7 @@ void __init acpi_boot_table_init(void) * and ACPI has not been [force] enabled (acpi=on|force) */ if (param_acpi_off || - (!param_acpi_on && !param_acpi_force && - of_scan_flat_dt(dt_scan_depth1_nodes, NULL))) + (!param_acpi_on && !param_acpi_force && !dt_is_stub())) goto done; /* @@ -229,12 +232,44 @@ done: if (earlycon_acpi_spcr_enable) early_init_dt_scan_chosen_stdout(); } else { - acpi_parse_spcr(earlycon_acpi_spcr_enable, true); +#ifdef CONFIG_HIBERNATION + struct acpi_table_header *facs = NULL; + acpi_get_table(ACPI_SIG_FACS, 1, &facs); + if (facs) { + swsusp_hardware_signature = + ((struct acpi_table_facs *)facs)->hardware_signature; + acpi_put_table(facs); + } +#endif + + /* + * For varying privacy and security reasons, sometimes need + * to completely silence the serial console output, and only + * enable it when needed. + * But there are many existing systems that depend on this + * behaviour, use acpi=nospcr to disable console in ACPI SPCR + * table as default serial console. + */ + acpi_parse_spcr(earlycon_acpi_spcr_enable, + !param_acpi_nospcr); + if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); } } +static pgprot_t __acpi_get_writethrough_mem_attribute(void) +{ + /* + * Although UEFI specifies the use of Normal Write-through for + * EFI_MEMORY_WT, it is seldom used in practice and not implemented + * by most (all?) CPUs. Rather than allocate a MAIR just for this + * purpose, emit a warning and use Normal Non-cacheable instead. + */ + pr_warn_once("No MAIR allocation for EFI_MEMORY_WT; treating as Normal Non-cacheable\n"); + return __pgprot(PROT_NORMAL_NC); +} + pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) { /* @@ -242,7 +277,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) * types" of UEFI 2.5 section 2.3.6.1, each EFI memory type is * mapped to a corresponding MAIR attribute encoding. * The EFI memory attribute advises all possible capabilities - * of a memory region. We use the most efficient capability. + * of a memory region. 
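As a side note on the attribute precedence used in __acpi_get_mem_attribute() below, here is a minimal userspace sketch. The EFI_MEMORY_* bit values follow the UEFI spec, but pick_mapping() and the printed strings are illustrative stand-ins for the kernel's pgprot selection, not kernel API.

/* Illustrative only: mirrors the WB -> WC -> WT -> Device fallback order. */
#include <stdio.h>
#include <stdint.h>

#define EFI_MEMORY_UC 0x1ULL
#define EFI_MEMORY_WC 0x2ULL
#define EFI_MEMORY_WT 0x4ULL
#define EFI_MEMORY_WB 0x8ULL

static const char *pick_mapping(uint64_t attr)
{
	/* Same precedence as __acpi_get_mem_attribute(): prefer the most
	 * capable cacheable type, fall back to Device-nGnRnE. */
	if (attr & EFI_MEMORY_WB)
		return "Normal Write-Back (PAGE_KERNEL)";
	if (attr & EFI_MEMORY_WC)
		return "Normal Non-cacheable (PROT_NORMAL_NC)";
	if (attr & EFI_MEMORY_WT)
		return "Normal Non-cacheable (WT demoted, see the helper above)";
	return "Device-nGnRnE";
}

int main(void)
{
	uint64_t attr = EFI_MEMORY_WB | EFI_MEMORY_WC;	/* hypothetical region */

	printf("%s\n", pick_mapping(attr));
	return 0;
}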
*/ u64 attr; @@ -250,9 +285,176 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) attr = efi_mem_attributes(addr); if (attr & EFI_MEMORY_WB) return PAGE_KERNEL; - if (attr & EFI_MEMORY_WT) - return __pgprot(PROT_NORMAL_WT); if (attr & EFI_MEMORY_WC) return __pgprot(PROT_NORMAL_NC); + if (attr & EFI_MEMORY_WT) + return __acpi_get_writethrough_mem_attribute(); return __pgprot(PROT_DEVICE_nGnRnE); } + +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + efi_memory_desc_t *md, *region = NULL; + pgprot_t prot; + + if (WARN_ON_ONCE(!efi_enabled(EFI_MEMMAP))) + return NULL; + + for_each_efi_memory_desc(md) { + u64 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + + if (phys < md->phys_addr || phys >= end) + continue; + + if (phys + size > end) { + pr_warn(FW_BUG "requested region covers multiple EFI memory regions\n"); + return NULL; + } + region = md; + break; + } + + /* + * It is fine for AML to remap regions that are not represented in the + * EFI memory map at all, as it only describes normal memory, and MMIO + * regions that require a virtual mapping to make them accessible to + * the EFI runtime services. + */ + prot = __pgprot(PROT_DEVICE_nGnRnE); + if (region) { + switch (region->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: + if (memblock_is_map_memory(phys) || + !memblock_is_region_memory(phys, size)) { + pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys); + return NULL; + } + /* + * Mapping kernel memory is permitted if the region in + * question is covered by a single memblock with the + * NOMAP attribute set: this enables the use of ACPI + * table overrides passed via initramfs, which are + * reserved in memory using arch_reserve_mem_area() + * below. As this particular use case only requires + * read access, fall through to the R/O mapping case. + */ + fallthrough; + + case EFI_RUNTIME_SERVICES_CODE: + /* + * This would be unusual, but not problematic per se, + * as long as we take care not to create a writable + * mapping for executable code. + */ + prot = PAGE_KERNEL_RO; + break; + + case EFI_ACPI_RECLAIM_MEMORY: + /* + * ACPI reclaim memory is used to pass firmware tables + * and other data that is intended for consumption by + * the OS only, which may decide it wants to reclaim + * that memory and use it for something else. We never + * do that, but we usually add it to the linear map + * anyway, in which case we should use the existing + * mapping. + */ + if (memblock_is_map_memory(phys)) + return (void __iomem *)__phys_to_virt(phys); + fallthrough; + + default: + if (region->attribute & EFI_MEMORY_WB) + prot = PAGE_KERNEL; + else if (region->attribute & EFI_MEMORY_WC) + prot = __pgprot(PROT_NORMAL_NC); + else if (region->attribute & EFI_MEMORY_WT) + prot = __acpi_get_writethrough_mem_attribute(); + } + } + return ioremap_prot(phys, size, prot); +} + +/* + * Claim Synchronous External Aborts as a firmware first notification. + * + * Used by KVM and the arch do_sea handler. + * @regs may be NULL when called from process context. 
+ */ +int apei_claim_sea(struct pt_regs *regs) +{ + int err = -ENOENT; + bool return_to_irqs_enabled; + unsigned long current_flags; + + if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES)) + return err; + + current_flags = local_daif_save_flags(); + + /* current_flags isn't useful here as daif doesn't tell us about pNMI */ + return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags()); + + if (regs) + return_to_irqs_enabled = !regs_irqs_disabled(regs); + + /* + * SEA can interrupt SError, mask it and describe this as an NMI so + * that APEI defers the handling. + */ + local_daif_restore(DAIF_ERRCTX); + nmi_enter(); + err = ghes_notify_sea(); + nmi_exit(); + + /* + * APEI NMI-like notifications are deferred to irq_work. Unless + * we interrupted irqs-masked code, we can do that now. + */ + if (!err) { + if (return_to_irqs_enabled) { + local_daif_restore(DAIF_PROCCTX_NOIRQ); + __irq_enter(); + irq_work_run(); + __irq_exit(); + } else { + pr_warn_ratelimited("APEI work queued but not completed\n"); + err = -EINPROGRESS; + } + } + + local_daif_restore(current_flags); + + return err; +} + +void arch_reserve_mem_area(acpi_physical_address addr, size_t size) +{ + memblock_mark_nomap(addr, size); +} + +#ifdef CONFIG_ACPI_HOTPLUG_CPU +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 apci_id, + int *pcpu) +{ + /* If an error code is passed in this stub can't fix it */ + if (*pcpu < 0) { + pr_warn_once("Unable to map CPU to valid ID\n"); + return *pcpu; + } + + return 0; +} +EXPORT_SYMBOL(acpi_map_cpu); + +int acpi_unmap_cpu(int cpu) +{ + return 0; +} +EXPORT_SYMBOL(acpi_unmap_cpu); +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index eac1d0cc595c..2465f291c7e1 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -27,25 +27,14 @@ #include <asm/numa.h> -static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE }; +static int acpi_early_node_map[NR_CPUS] __initdata = { [0 ... NR_CPUS - 1] = NUMA_NO_NODE }; int __init acpi_numa_get_nid(unsigned int cpu) { return acpi_early_node_map[cpu]; } -static inline int get_cpu_for_acpi_id(u32 uid) -{ - int cpu; - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) - if (uid == get_acpi_id_for_cpu(cpu)) - return cpu; - - return -EINVAL; -} - -static int __init acpi_parse_gicc_pxm(struct acpi_subtable_header *header, +static int __init acpi_parse_gicc_pxm(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_srat_gicc_affinity *pa; @@ -109,7 +98,7 @@ void __init acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) pxm = pa->proximity_domain; node = acpi_map_pxm_to_node(pxm); - if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { + if (node == NUMA_NO_NODE) { pr_err("SRAT: Too many proximity domains %d\n", pxm); bad_srat(); return; @@ -118,15 +107,3 @@ void __init acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) node_set(node, numa_nodes_parsed); } -int __init arm64_acpi_numa_init(void) -{ - int ret; - - ret = acpi_numa_init(); - if (ret) { - pr_info("Failed to initialise from firmware\n"); - return ret; - } - - return srat_disabled() ? 
-EINVAL : 0; -} diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c index 98a20e58758b..e1be29e608b7 100644 --- a/arch/arm64/kernel/acpi_parking_protocol.c +++ b/arch/arm64/kernel/acpi_parking_protocol.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * ARM64 ACPI Parking Protocol implementation * * Authors: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> * Mark Salter <msalter@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/acpi.h> #include <linux/mm.h> @@ -110,10 +99,11 @@ static int acpi_parking_protocol_cpu_boot(unsigned int cpu) * that read this address need to convert this address to the * Boot-Loader's endianness before jumping. */ - writeq_relaxed(__pa_symbol(secondary_entry), &mailbox->entry_point); + writeq_relaxed(__pa_symbol(secondary_entry), + &mailbox->entry_point); writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id); - arch_send_wakeup_ipi_mask(cpumask_of(cpu)); + arch_send_wakeup_ipi(cpu); return 0; } diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index b5d603992d40..f5ec7e7c1d3f 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -1,68 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * alternative runtime patching * inspired by the x86 version * * Copyright (C) 2014 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #define pr_fmt(fmt) "alternatives: " fmt #include <linux/init.h> #include <linux/cpu.h> +#include <linux/elf.h> #include <asm/cacheflush.h> #include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/insn.h> +#include <asm/module.h> #include <asm/sections.h> +#include <asm/vdso.h> #include <linux/stop_machine.h> -#define __ALT_PTR(a,f) ((void *)&(a)->f + (a)->f) +#define __ALT_PTR(a, f) ((void *)&(a)->f + (a)->f) #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) #define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) -int alternatives_applied; +#define ALT_CAP(a) ((a)->cpucap & ~ARM64_CB_BIT) +#define ALT_HAS_CB(a) ((a)->cpucap & ARM64_CB_BIT) + +/* Volatile, as we may be patching the guts of READ_ONCE() */ +static volatile int all_alternatives_applied; + +static DECLARE_BITMAP(applied_alternatives, ARM64_NCAPS); struct alt_region { struct alt_instr *begin; struct alt_instr *end; }; +bool alternative_is_applied(u16 cpucap) +{ + if (WARN_ON(cpucap >= ARM64_NCAPS)) + return false; + + return test_bit(cpucap, applied_alternatives); +} + /* * Check if the target PC is within an alternative block. */ -static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc) +static __always_inline bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc) { - unsigned long replptr; - - if (kernel_text_address(pc)) - return true; - - replptr = (unsigned long)ALT_REPL_PTR(alt); - if (pc >= replptr && pc <= (replptr + alt->alt_len)) - return false; - - /* - * Branching into *another* alternate sequence is doomed, and - * we're not even trying to fix it up. - */ - BUG(); + unsigned long replptr = (unsigned long)ALT_REPL_PTR(alt); + return !(pc >= replptr && pc <= (replptr + alt->alt_len)); } #define align_down(x, a) ((unsigned long)(x) & ~(((unsigned long)(a)) - 1)) -static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr) +static __always_inline u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr) { u32 insn; @@ -107,7 +101,7 @@ static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnp return insn; } -static void patch_alternative(struct alt_instr *alt, +static noinstr void patch_alternative(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst) { __le32 *replptr; @@ -127,13 +121,13 @@ static void patch_alternative(struct alt_instr *alt, * accidentally call into the cache.S code, which is patched by us at * runtime. 
*/ -static void clean_dcache_range_nopatch(u64 start, u64 end) +static noinstr void clean_dcache_range_nopatch(u64 start, u64 end) { u64 cur, d_size, ctr_el0; - ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); + ctr_el0 = arm64_ftr_reg_ctrel0.sys_val; d_size = 4 << cpuid_feature_extract_unsigned_field(ctr_el0, - CTR_DMINLINE_SHIFT); + CTR_EL0_DminLine_SHIFT); cur = start & ~(d_size - 1); do { /* @@ -145,36 +139,40 @@ static void clean_dcache_range_nopatch(u64 start, u64 end) } while (cur += d_size, cur < end); } -static void __apply_alternatives(void *alt_region, bool is_module) +static int __apply_alternatives(const struct alt_region *region, + bool is_module, + unsigned long *cpucap_mask) { struct alt_instr *alt; - struct alt_region *region = alt_region; __le32 *origptr, *updptr; alternative_cb_t alt_cb; for (alt = region->begin; alt < region->end; alt++) { int nr_inst; + int cap = ALT_CAP(alt); - /* Use ARM64_CB_PATCH as an unconditional patch */ - if (alt->cpufeature < ARM64_CB_PATCH && - !cpus_have_cap(alt->cpufeature)) + if (!test_bit(cap, cpucap_mask)) continue; - if (alt->cpufeature == ARM64_CB_PATCH) + if (!cpus_have_cap(cap)) + continue; + + if (ALT_HAS_CB(alt)) BUG_ON(alt->alt_len != 0); else BUG_ON(alt->alt_len != alt->orig_len); - pr_info_once("patching kernel code\n"); - origptr = ALT_ORIG_PTR(alt); updptr = is_module ? origptr : lm_alias(origptr); nr_inst = alt->orig_len / AARCH64_INSN_SIZE; - if (alt->cpufeature < ARM64_CB_PATCH) - alt_cb = patch_alternative; - else + if (ALT_HAS_CB(alt)) { alt_cb = ALT_REPL_PTR(alt); + if (is_module && !core_kernel_text((unsigned long)alt_cb)) + return -ENOEXEC; + } else { + alt_cb = patch_alternative; + } alt_cb(alt, origptr, updptr, nr_inst); @@ -190,32 +188,69 @@ static void __apply_alternatives(void *alt_region, bool is_module) */ if (!is_module) { dsb(ish); - __flush_icache_all(); + icache_inval_all_pou(); isb(); + + bitmap_or(applied_alternatives, applied_alternatives, + cpucap_mask, ARM64_NCAPS); + bitmap_and(applied_alternatives, applied_alternatives, + system_cpucaps, ARM64_NCAPS); } + + return 0; } +static void __init apply_alternatives_vdso(void) +{ + struct alt_region region; + const struct elf64_hdr *hdr; + const struct elf64_shdr *shdr; + const struct elf64_shdr *alt; + DECLARE_BITMAP(all_capabilities, ARM64_NCAPS); + + bitmap_fill(all_capabilities, ARM64_NCAPS); + + hdr = (struct elf64_hdr *)vdso_start; + shdr = (void *)hdr + hdr->e_shoff; + alt = find_section(hdr, shdr, ".altinstructions"); + if (!alt) + return; + + region = (struct alt_region){ + .begin = (void *)hdr + alt->sh_offset, + .end = (void *)hdr + alt->sh_offset + alt->sh_size, + }; + + __apply_alternatives(®ion, false, &all_capabilities[0]); +} + +static const struct alt_region kernel_alternatives __initconst = { + .begin = (struct alt_instr *)__alt_instructions, + .end = (struct alt_instr *)__alt_instructions_end, +}; + /* * We might be patching the stop_machine state machine, so implement a * really simple polling protocol here. 
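A rough userspace model of that polling protocol follows: one thread stands in for a secondary CPU spinning on a flag while the "boot CPU" patches and then releases it. The names and the use of pthreads/stdatomic are illustrative only; the kernel relies on stop_machine() and the cache maintenance noted above for ordering.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int patches_applied;	/* stands in for all_alternatives_applied */

static void *secondary_cpu(void *arg)
{
	/* Secondaries spin (cpu_relax() in the kernel) until the boot CPU
	 * reports that the text has been patched, then continue. */
	while (!atomic_load(&patches_applied))
		;
	printf("cpu %ld: running patched text\n", (long)arg);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, secondary_cpu, (void *)1L);

	/* "CPU 0" does all the patching, then releases the spinners. */
	printf("cpu 0: applying alternatives\n");
	atomic_store(&patches_applied, 1);

	pthread_join(t, NULL);
	return 0;
}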
*/ -static int __apply_alternatives_multi_stop(void *unused) +static int __init __apply_alternatives_multi_stop(void *unused) { - struct alt_region region = { - .begin = (struct alt_instr *)__alt_instructions, - .end = (struct alt_instr *)__alt_instructions_end, - }; - /* We always have a CPU 0 at this point (__init) */ if (smp_processor_id()) { - while (!READ_ONCE(alternatives_applied)) + while (!all_alternatives_applied) cpu_relax(); isb(); } else { - BUG_ON(alternatives_applied); - __apply_alternatives(®ion, false); + DECLARE_BITMAP(remaining_capabilities, ARM64_NCAPS); + + bitmap_complement(remaining_capabilities, boot_cpucaps, + ARM64_NCAPS); + + BUG_ON(all_alternatives_applied); + __apply_alternatives(&kernel_alternatives, false, + remaining_capabilities); /* Barriers provided by the cache flushing */ - WRITE_ONCE(alternatives_applied, 1); + all_alternatives_applied = 1; } return 0; @@ -223,18 +258,48 @@ static int __apply_alternatives_multi_stop(void *unused) void __init apply_alternatives_all(void) { + pr_info("applying system-wide alternatives\n"); + + apply_alternatives_vdso(); /* better not try code patching on a live SMP system */ stop_machine(__apply_alternatives_multi_stop, NULL, cpu_online_mask); } +/* + * This is called very early in the boot process (directly after we run + * a feature detect on the boot CPU). No need to worry about other CPUs + * here. + */ +void __init apply_boot_alternatives(void) +{ + /* If called on non-boot cpu things could go wrong */ + WARN_ON(smp_processor_id() != 0); + + pr_info("applying boot alternatives\n"); + + __apply_alternatives(&kernel_alternatives, false, + &boot_cpucaps[0]); +} + #ifdef CONFIG_MODULES -void apply_alternatives_module(void *start, size_t length) +int apply_alternatives_module(void *start, size_t length) { struct alt_region region = { .begin = start, .end = start + length, }; + DECLARE_BITMAP(all_capabilities, ARM64_NCAPS); + + bitmap_fill(all_capabilities, ARM64_NCAPS); - __apply_alternatives(®ion, true); + return __apply_alternatives(®ion, true, &all_capabilities[0]); } #endif + +noinstr void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + for (int i = 0; i < nr_inst; i++) + updptr[i] = cpu_to_le32(aarch64_insn_gen_nop()); +} +EXPORT_SYMBOL(alt_cb_patch_nops); diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index e52e7280884a..e737c6295ec7 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2014 ARM Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/cpu.h> @@ -20,7 +17,6 @@ #include <asm/sysreg.h> #include <asm/system_misc.h> #include <asm/traps.h> -#include <asm/kprobes.h> #define CREATE_TRACE_POINTS #include "trace-events-emulation.h" @@ -42,220 +38,44 @@ enum insn_emulation_mode { enum legacy_insn_status { INSN_DEPRECATED, INSN_OBSOLETE, -}; - -struct insn_emulation_ops { - const char *name; - enum legacy_insn_status status; - struct undef_hook *hooks; - int (*set_hw_mode)(bool enable); + INSN_UNAVAILABLE, }; struct insn_emulation { - struct list_head node; - struct insn_emulation_ops *ops; + const char *name; + enum legacy_insn_status status; + bool (*try_emulate)(struct pt_regs *regs, + u32 insn); + int (*set_hw_mode)(bool enable); + int current_mode; int min; int max; -}; - -static LIST_HEAD(insn_emulation); -static int nr_insn_emulated __initdata; -static DEFINE_RAW_SPINLOCK(insn_emulation_lock); - -static void register_emulation_hooks(struct insn_emulation_ops *ops) -{ - struct undef_hook *hook; - - BUG_ON(!ops->hooks); - - for (hook = ops->hooks; hook->instr_mask; hook++) - register_undef_hook(hook); - - pr_notice("Registered %s emulation handler\n", ops->name); -} - -static void remove_emulation_hooks(struct insn_emulation_ops *ops) -{ - struct undef_hook *hook; - - BUG_ON(!ops->hooks); - - for (hook = ops->hooks; hook->instr_mask; hook++) - unregister_undef_hook(hook); - - pr_notice("Removed %s emulation handler\n", ops->name); -} - -static void enable_insn_hw_mode(void *data) -{ - struct insn_emulation *insn = (struct insn_emulation *)data; - if (insn->ops->set_hw_mode) - insn->ops->set_hw_mode(true); -} - -static void disable_insn_hw_mode(void *data) -{ - struct insn_emulation *insn = (struct insn_emulation *)data; - if (insn->ops->set_hw_mode) - insn->ops->set_hw_mode(false); -} - -/* Run set_hw_mode(mode) on all active CPUs */ -static int run_all_cpu_set_hw_mode(struct insn_emulation *insn, bool enable) -{ - if (!insn->ops->set_hw_mode) - return -EINVAL; - if (enable) - on_each_cpu(enable_insn_hw_mode, (void *)insn, true); - else - on_each_cpu(disable_insn_hw_mode, (void *)insn, true); - return 0; -} - -/* - * Run set_hw_mode for all insns on a starting CPU. - * Returns: - * 0 - If all the hooks ran successfully. - * -EINVAL - At least one hook is not supported by the CPU. 
- */ -static int run_all_insn_set_hw_mode(unsigned int cpu) -{ - int rc = 0; - unsigned long flags; - struct insn_emulation *insn; - - raw_spin_lock_irqsave(&insn_emulation_lock, flags); - list_for_each_entry(insn, &insn_emulation, node) { - bool enable = (insn->current_mode == INSN_HW); - if (insn->ops->set_hw_mode && insn->ops->set_hw_mode(enable)) { - pr_warn("CPU[%u] cannot support the emulation of %s", - cpu, insn->ops->name); - rc = -EINVAL; - } - } - raw_spin_unlock_irqrestore(&insn_emulation_lock, flags); - return rc; -} - -static int update_insn_emulation_mode(struct insn_emulation *insn, - enum insn_emulation_mode prev) -{ - int ret = 0; - - switch (prev) { - case INSN_UNDEF: /* Nothing to be done */ - break; - case INSN_EMULATE: - remove_emulation_hooks(insn->ops); - break; - case INSN_HW: - if (!run_all_cpu_set_hw_mode(insn, false)) - pr_notice("Disabled %s support\n", insn->ops->name); - break; - } - - switch (insn->current_mode) { - case INSN_UNDEF: - break; - case INSN_EMULATE: - register_emulation_hooks(insn->ops); - break; - case INSN_HW: - ret = run_all_cpu_set_hw_mode(insn, true); - if (!ret) - pr_notice("Enabled %s support\n", insn->ops->name); - break; - } - - return ret; -} - -static void __init register_insn_emulation(struct insn_emulation_ops *ops) -{ - unsigned long flags; - struct insn_emulation *insn; - - insn = kzalloc(sizeof(*insn), GFP_KERNEL); - insn->ops = ops; - insn->min = INSN_UNDEF; - switch (ops->status) { - case INSN_DEPRECATED: - insn->current_mode = INSN_EMULATE; - /* Disable the HW mode if it was turned on at early boot time */ - run_all_cpu_set_hw_mode(insn, false); - insn->max = INSN_HW; - break; - case INSN_OBSOLETE: - insn->current_mode = INSN_UNDEF; - insn->max = INSN_EMULATE; - break; - } - - raw_spin_lock_irqsave(&insn_emulation_lock, flags); - list_add(&insn->node, &insn_emulation); - nr_insn_emulated++; - raw_spin_unlock_irqrestore(&insn_emulation_lock, flags); - - /* Register any handlers if required */ - update_insn_emulation_mode(insn, INSN_UNDEF); -} - -static int emulation_proc_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret = 0; - struct insn_emulation *insn = (struct insn_emulation *) table->data; - enum insn_emulation_mode prev_mode = insn->current_mode; - - table->data = &insn->current_mode; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + /* sysctl for this emulation */ + struct ctl_table sysctl; +}; - if (ret || !write || prev_mode == insn->current_mode) - goto ret; +#define ARM_OPCODE_CONDTEST_FAIL 0 +#define ARM_OPCODE_CONDTEST_PASS 1 +#define ARM_OPCODE_CONDTEST_UNCOND 2 - ret = update_insn_emulation_mode(insn, prev_mode); - if (ret) { - /* Mode change failed, revert to previous mode. 
*/ - insn->current_mode = prev_mode; - update_insn_emulation_mode(insn, INSN_UNDEF); - } -ret: - table->data = insn; - return ret; -} +#define ARM_OPCODE_CONDITION_UNCOND 0xf -static void __init register_insn_emulation_sysctl(void) +static unsigned int __maybe_unused aarch32_check_condition(u32 opcode, u32 psr) { - unsigned long flags; - int i = 0; - struct insn_emulation *insn; - struct ctl_table *insns_sysctl, *sysctl; - - insns_sysctl = kcalloc(nr_insn_emulated + 1, sizeof(*sysctl), - GFP_KERNEL); - - raw_spin_lock_irqsave(&insn_emulation_lock, flags); - list_for_each_entry(insn, &insn_emulation, node) { - sysctl = &insns_sysctl[i]; - - sysctl->mode = 0644; - sysctl->maxlen = sizeof(int); + u32 cc_bits = opcode >> 28; - sysctl->procname = insn->ops->name; - sysctl->data = insn; - sysctl->extra1 = &insn->min; - sysctl->extra2 = &insn->max; - sysctl->proc_handler = emulation_proc_handler; - i++; + if (cc_bits != ARM_OPCODE_CONDITION_UNCOND) { + if ((*aarch32_opcode_cond_checks[cc_bits])(psr)) + return ARM_OPCODE_CONDTEST_PASS; + else + return ARM_OPCODE_CONDTEST_FAIL; } - raw_spin_unlock_irqrestore(&insn_emulation_lock, flags); - - register_sysctl("abi", insns_sysctl); + return ARM_OPCODE_CONDTEST_UNCOND; } +#ifdef CONFIG_SWP_EMULATION /* * Implement emulation of the SWP/SWPB instructions using load-exclusive and * store-exclusive. @@ -275,9 +95,9 @@ static void __init register_insn_emulation_sysctl(void) #define __user_swpX_asm(data, addr, res, temp, temp2, B) \ do { \ - uaccess_enable(); \ + uaccess_enable_privileged(); \ __asm__ __volatile__( \ - " mov %w3, %w7\n" \ + " mov %w3, %w6\n" \ "0: ldxr"B" %w2, [%4]\n" \ "1: stxr"B" %w0, %w1, [%4]\n" \ " cbz %w0, 2f\n" \ @@ -288,19 +108,13 @@ do { \ "2:\n" \ " mov %w1, %w2\n" \ "3:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "4: mov %w0, %w6\n" \ - " b 3b\n" \ - " .popsection" \ - _ASM_EXTABLE(0b, 4b) \ - _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE_UACCESS_ERR(0b, 3b, %w0) \ + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) \ : "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2) \ : "r" ((unsigned long)addr), "i" (-EAGAIN), \ - "i" (-EFAULT), \ "i" (__SWP_LL_SC_LOOPS) \ : "memory"); \ - uaccess_disable(); \ + uaccess_disable_privileged(); \ } while (0) #define __user_swp_asm(data, addr, res, temp, temp2) \ @@ -342,25 +156,6 @@ static int emulate_swpX(unsigned int address, unsigned int *data, return res; } -#define ARM_OPCODE_CONDTEST_FAIL 0 -#define ARM_OPCODE_CONDTEST_PASS 1 -#define ARM_OPCODE_CONDTEST_UNCOND 2 - -#define ARM_OPCODE_CONDITION_UNCOND 0xf - -static unsigned int __kprobes aarch32_check_condition(u32 opcode, u32 psr) -{ - u32 cc_bits = opcode >> 28; - - if (cc_bits != ARM_OPCODE_CONDITION_UNCOND) { - if ((*aarch32_opcode_cond_checks[cc_bits])(psr)) - return ARM_OPCODE_CONDTEST_PASS; - else - return ARM_OPCODE_CONDTEST_FAIL; - } - return ARM_OPCODE_CONDTEST_UNCOND; -} - /* * swp_handler logs the id of calling process, dissects the instruction, sanity * checks the memory location, calls emulate_swpX for the actual operation and @@ -433,28 +228,27 @@ fault: return 0; } -/* - * Only emulate SWP/SWPB executed in ARM state/User mode. - * The kernel must be SWP free and SWP{B} does not exist in Thumb. 
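For context, SWP{B} atomically exchanges a register with memory. A hedged userspace sketch of those semantics follows, using the GCC/Clang __atomic_exchange_n builtin in place of the kernel's bounded ldxr/stxr retry loop over the user address; it shows what is being emulated, not how the uaccess fixups work.

#include <stdio.h>

int main(void)
{
	unsigned int mem = 0xdeadbeef;
	unsigned int newval = 0x12345678;

	/* SWP semantics: atomically store newval and return the old contents.
	 * The kernel's emulation performs this with a load-exclusive/
	 * store-exclusive retry loop; the compiler builtin stands in here. */
	unsigned int old = __atomic_exchange_n(&mem, newval, __ATOMIC_SEQ_CST);

	printf("old=%#x new=%#x\n", old, mem);
	return 0;
}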
- */ -static struct undef_hook swp_hooks[] = { - { - .instr_mask = 0x0fb00ff0, - .instr_val = 0x01000090, - .pstate_mask = PSR_AA32_MODE_MASK, - .pstate_val = PSR_AA32_MODE_USR, - .fn = swp_handler - }, - { } -}; +static bool try_emulate_swp(struct pt_regs *regs, u32 insn) +{ + /* SWP{B} only exists in ARM state and does not exist in Thumb */ + if (!compat_user_mode(regs) || compat_thumb_mode(regs)) + return false; + + if ((insn & 0x0fb00ff0) != 0x01000090) + return false; -static struct insn_emulation_ops swp_ops = { + return swp_handler(regs, insn) == 0; +} + +static struct insn_emulation insn_swp = { .name = "swp", .status = INSN_OBSOLETE, - .hooks = swp_hooks, + .try_emulate = try_emulate_swp, .set_hw_mode = NULL, }; +#endif /* CONFIG_SWP_EMULATION */ +#ifdef CONFIG_CP15_BARRIER_EMULATION static int cp15barrier_handler(struct pt_regs *regs, u32 instr) { perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->pc); @@ -517,31 +311,29 @@ static int cp15_barrier_set_hw_mode(bool enable) return 0; } -static struct undef_hook cp15_barrier_hooks[] = { - { - .instr_mask = 0x0fff0fdf, - .instr_val = 0x0e070f9a, - .pstate_mask = PSR_AA32_MODE_MASK, - .pstate_val = PSR_AA32_MODE_USR, - .fn = cp15barrier_handler, - }, - { - .instr_mask = 0x0fff0fff, - .instr_val = 0x0e070f95, - .pstate_mask = PSR_AA32_MODE_MASK, - .pstate_val = PSR_AA32_MODE_USR, - .fn = cp15barrier_handler, - }, - { } -}; +static bool try_emulate_cp15_barrier(struct pt_regs *regs, u32 insn) +{ + if (!compat_user_mode(regs) || compat_thumb_mode(regs)) + return false; + + if ((insn & 0x0fff0fdf) == 0x0e070f9a) + return cp15barrier_handler(regs, insn) == 0; -static struct insn_emulation_ops cp15_barrier_ops = { + if ((insn & 0x0fff0fff) == 0x0e070f95) + return cp15barrier_handler(regs, insn) == 0; + + return false; +} + +static struct insn_emulation insn_cp15_barrier = { .name = "cp15_barrier", .status = INSN_DEPRECATED, - .hooks = cp15_barrier_hooks, + .try_emulate = try_emulate_cp15_barrier, .set_hw_mode = cp15_barrier_set_hw_mode, }; +#endif /* CONFIG_CP15_BARRIER_EMULATION */ +#ifdef CONFIG_SETEND_EMULATION static int setend_set_hw_mode(bool enable) { if (!cpu_supports_mixed_endian_el0()) @@ -589,55 +381,247 @@ static int t16_setend_handler(struct pt_regs *regs, u32 instr) return rc; } -static struct undef_hook setend_hooks[] = { - { - .instr_mask = 0xfffffdff, - .instr_val = 0xf1010000, - .pstate_mask = PSR_AA32_MODE_MASK, - .pstate_val = PSR_AA32_MODE_USR, - .fn = a32_setend_handler, - }, - { - /* Thumb mode */ - .instr_mask = 0x0000fff7, - .instr_val = 0x0000b650, - .pstate_mask = (PSR_AA32_T_BIT | PSR_AA32_MODE_MASK), - .pstate_val = (PSR_AA32_T_BIT | PSR_AA32_MODE_USR), - .fn = t16_setend_handler, - }, - {} -}; +static bool try_emulate_setend(struct pt_regs *regs, u32 insn) +{ + if (compat_thumb_mode(regs) && + (insn & 0xfffffff7) == 0x0000b650) + return t16_setend_handler(regs, insn) == 0; -static struct insn_emulation_ops setend_ops = { + if (compat_user_mode(regs) && + (insn & 0xfffffdff) == 0xf1010000) + return a32_setend_handler(regs, insn) == 0; + + return false; +} + +static struct insn_emulation insn_setend = { .name = "setend", .status = INSN_DEPRECATED, - .hooks = setend_hooks, + .try_emulate = try_emulate_setend, .set_hw_mode = setend_set_hw_mode, }; +#endif /* CONFIG_SETEND_EMULATION */ + +static struct insn_emulation *insn_emulations[] = { +#ifdef CONFIG_SWP_EMULATION + &insn_swp, +#endif +#ifdef CONFIG_CP15_BARRIER_EMULATION + &insn_cp15_barrier, +#endif +#ifdef CONFIG_SETEND_EMULATION + &insn_setend, 
+#endif +}; + +static DEFINE_MUTEX(insn_emulation_mutex); + +static void enable_insn_hw_mode(void *data) +{ + struct insn_emulation *insn = data; + if (insn->set_hw_mode) + insn->set_hw_mode(true); +} + +static void disable_insn_hw_mode(void *data) +{ + struct insn_emulation *insn = data; + if (insn->set_hw_mode) + insn->set_hw_mode(false); +} + +/* Run set_hw_mode(mode) on all active CPUs */ +static int run_all_cpu_set_hw_mode(struct insn_emulation *insn, bool enable) +{ + if (!insn->set_hw_mode) + return -EINVAL; + if (enable) + on_each_cpu(enable_insn_hw_mode, (void *)insn, true); + else + on_each_cpu(disable_insn_hw_mode, (void *)insn, true); + return 0; +} + +/* + * Run set_hw_mode for all insns on a starting CPU. + * Returns: + * 0 - If all the hooks ran successfully. + * -EINVAL - At least one hook is not supported by the CPU. + */ +static int run_all_insn_set_hw_mode(unsigned int cpu) +{ + int rc = 0; + unsigned long flags; + + /* + * Disable IRQs to serialize against an IPI from + * run_all_cpu_set_hw_mode(), ensuring the HW is programmed to the most + * recent enablement state if the two race with one another. + */ + local_irq_save(flags); + for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { + struct insn_emulation *insn = insn_emulations[i]; + bool enable = READ_ONCE(insn->current_mode) == INSN_HW; + if (insn->status == INSN_UNAVAILABLE) + continue; + + if (insn->set_hw_mode && insn->set_hw_mode(enable)) { + pr_warn("CPU[%u] cannot support the emulation of %s", + cpu, insn->name); + rc = -EINVAL; + } + } + local_irq_restore(flags); + + return rc; +} + +static int update_insn_emulation_mode(struct insn_emulation *insn, + enum insn_emulation_mode prev) +{ + int ret = 0; + + switch (prev) { + case INSN_UNDEF: /* Nothing to be done */ + break; + case INSN_EMULATE: + break; + case INSN_HW: + if (!run_all_cpu_set_hw_mode(insn, false)) + pr_notice("Disabled %s support\n", insn->name); + break; + } + + switch (insn->current_mode) { + case INSN_UNDEF: + break; + case INSN_EMULATE: + break; + case INSN_HW: + ret = run_all_cpu_set_hw_mode(insn, true); + if (!ret) + pr_notice("Enabled %s support\n", insn->name); + break; + } + + return ret; +} + +static int emulation_proc_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = 0; + struct insn_emulation *insn = container_of(table->data, struct insn_emulation, current_mode); + enum insn_emulation_mode prev_mode = insn->current_mode; + + mutex_lock(&insn_emulation_mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write || prev_mode == insn->current_mode) + goto ret; + + ret = update_insn_emulation_mode(insn, prev_mode); + if (ret) { + /* Mode change failed, revert to previous mode. 
*/ + WRITE_ONCE(insn->current_mode, prev_mode); + update_insn_emulation_mode(insn, INSN_UNDEF); + } +ret: + mutex_unlock(&insn_emulation_mutex); + return ret; +} + +static void __init register_insn_emulation(struct insn_emulation *insn) +{ + struct ctl_table *sysctl; + + insn->min = INSN_UNDEF; + + switch (insn->status) { + case INSN_DEPRECATED: + insn->current_mode = INSN_EMULATE; + /* Disable the HW mode if it was turned on at early boot time */ + run_all_cpu_set_hw_mode(insn, false); + insn->max = INSN_HW; + break; + case INSN_OBSOLETE: + insn->current_mode = INSN_UNDEF; + insn->max = INSN_EMULATE; + break; + case INSN_UNAVAILABLE: + insn->current_mode = INSN_UNDEF; + insn->max = INSN_UNDEF; + break; + } + + /* Program the HW if required */ + update_insn_emulation_mode(insn, INSN_UNDEF); + + if (insn->status != INSN_UNAVAILABLE) { + sysctl = &insn->sysctl; + + sysctl->mode = 0644; + sysctl->maxlen = sizeof(int); + + sysctl->procname = insn->name; + sysctl->data = &insn->current_mode; + sysctl->extra1 = &insn->min; + sysctl->extra2 = &insn->max; + sysctl->proc_handler = emulation_proc_handler; + + register_sysctl_sz("abi", sysctl, 1); + } +} + +bool try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn) +{ + for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { + struct insn_emulation *ie = insn_emulations[i]; + + if (ie->status == INSN_UNAVAILABLE) + continue; + + /* + * A trap may race with the mode being changed + * INSN_EMULATE<->INSN_HW. Try to emulate the instruction to + * avoid a spurious UNDEF. + */ + if (READ_ONCE(ie->current_mode) == INSN_UNDEF) + continue; + + if (ie->try_emulate(regs, insn)) + return true; + } + + return false; +} /* - * Invoked as late_initcall, since not needed before init spawned. + * Invoked as core_initcall, which guarantees that the instruction + * emulation is ready for userspace. */ static int __init armv8_deprecated_init(void) { - if (IS_ENABLED(CONFIG_SWP_EMULATION)) - register_insn_emulation(&swp_ops); +#ifdef CONFIG_SETEND_EMULATION + if (!system_supports_mixed_endian_el0()) { + insn_setend.status = INSN_UNAVAILABLE; + pr_info("setend instruction emulation is not supported on this system\n"); + } - if (IS_ENABLED(CONFIG_CP15_BARRIER_EMULATION)) - register_insn_emulation(&cp15_barrier_ops); +#endif + for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { + struct insn_emulation *ie = insn_emulations[i]; - if (IS_ENABLED(CONFIG_SETEND_EMULATION)) { - if(system_supports_mixed_endian_el0()) - register_insn_emulation(&setend_ops); - else - pr_info("setend instruction emulation is not supported on this system\n"); + if (ie->status == INSN_UNAVAILABLE) + continue; + + register_insn_emulation(ie); } cpuhp_setup_state_nocalls(CPUHP_AP_ARM64_ISNDEP_STARTING, "arm64/isndep:starting", run_all_insn_set_hw_mode, NULL); - register_insn_emulation_sysctl(); - return 0; } diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 65b8afc84466..b6367ff3a49c 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -1,29 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/asm-offsets.c * * Copyright (C) 1995-2003 Russell King * 2001-2002 Keith Owens * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#define COMPILE_OFFSETS #include <linux/arm_sdei.h> #include <linux/sched.h> +#include <linux/ftrace.h> +#include <linux/kexec.h> #include <linux/mm.h> -#include <linux/dma-mapping.h> #include <linux/kvm_host.h> -#include <linux/preempt.h> #include <linux/suspend.h> #include <asm/cpufeature.h> #include <asm/fixmap.h> @@ -31,35 +21,42 @@ #include <asm/memory.h> #include <asm/smp_plat.h> #include <asm/suspend.h> -#include <asm/vdso_datapage.h> #include <linux/kbuild.h> #include <linux/arm-smccc.h> int main(void) { - DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); - BLANK(); + DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); - DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif +#ifdef CONFIG_SHADOW_CALL_STACK + DEFINE(TSK_TI_SCS_BASE, offsetof(struct task_struct, thread_info.scs_base)); + DEFINE(TSK_TI_SCS_SP, offsetof(struct task_struct, thread_info.scs_sp)); +#endif DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); #ifdef CONFIG_STACKPROTECTOR DEFINE(TSK_STACK_CANARY, offsetof(struct task_struct, stack_canary)); #endif BLANK(); DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context)); + DEFINE(THREAD_SCTLR_USER, offsetof(struct task_struct, thread.sctlr_user)); +#ifdef CONFIG_ARM64_PTR_AUTH + DEFINE(THREAD_KEYS_USER, offsetof(struct task_struct, thread.keys_user)); +#endif +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL + DEFINE(THREAD_KEYS_KERNEL, offsetof(struct task_struct, thread.keys_kernel)); +#endif +#ifdef CONFIG_ARM64_MTE + DEFINE(THREAD_MTE_CTRL, offsetof(struct task_struct, thread.mte_ctrl)); +#endif BLANK(); DEFINE(S_X0, offsetof(struct pt_regs, regs[0])); - DEFINE(S_X1, offsetof(struct pt_regs, regs[1])); DEFINE(S_X2, offsetof(struct pt_regs, regs[2])); - DEFINE(S_X3, offsetof(struct pt_regs, regs[3])); DEFINE(S_X4, offsetof(struct pt_regs, regs[4])); - DEFINE(S_X5, offsetof(struct pt_regs, regs[5])); DEFINE(S_X6, offsetof(struct pt_regs, regs[6])); - DEFINE(S_X7, offsetof(struct pt_regs, regs[7])); DEFINE(S_X8, offsetof(struct pt_regs, regs[8])); DEFINE(S_X10, offsetof(struct pt_regs, regs[10])); DEFINE(S_X12, offsetof(struct pt_regs, regs[12])); @@ -71,84 +68,65 @@ int main(void) DEFINE(S_X24, offsetof(struct pt_regs, regs[24])); DEFINE(S_X26, offsetof(struct pt_regs, regs[26])); DEFINE(S_X28, offsetof(struct pt_regs, regs[28])); + DEFINE(S_FP, offsetof(struct pt_regs, regs[29])); DEFINE(S_LR, offsetof(struct pt_regs, regs[30])); DEFINE(S_SP, offsetof(struct pt_regs, sp)); -#ifdef CONFIG_COMPAT - DEFINE(S_COMPAT_SP, offsetof(struct pt_regs, compat_sp)); -#endif - DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_PC, offsetof(struct pt_regs, pc)); - DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); + DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); - 
DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); + DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); + DEFINE(S_PMR, offsetof(struct pt_regs, pmr)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); - DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); - BLANK(); - DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); - BLANK(); - DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); - DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags)); - BLANK(); - DEFINE(VM_EXEC, VM_EXEC); - BLANK(); - DEFINE(PAGE_SZ, PAGE_SIZE); - BLANK(); - DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL); - DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); - DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); - BLANK(); - DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); - BLANK(); - DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); - DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); - DEFINE(CLOCK_MONOTONIC_RAW, CLOCK_MONOTONIC_RAW); - DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); - DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); - DEFINE(CLOCK_MONOTONIC_COARSE,CLOCK_MONOTONIC_COARSE); - DEFINE(CLOCK_COARSE_RES, LOW_RES_NSEC); - DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); - BLANK(); - DEFINE(VDSO_CS_CYCLE_LAST, offsetof(struct vdso_data, cs_cycle_last)); - DEFINE(VDSO_RAW_TIME_SEC, offsetof(struct vdso_data, raw_time_sec)); - DEFINE(VDSO_RAW_TIME_NSEC, offsetof(struct vdso_data, raw_time_nsec)); - DEFINE(VDSO_XTIME_CLK_SEC, offsetof(struct vdso_data, xtime_clock_sec)); - DEFINE(VDSO_XTIME_CLK_NSEC, offsetof(struct vdso_data, xtime_clock_nsec)); - DEFINE(VDSO_XTIME_CRS_SEC, offsetof(struct vdso_data, xtime_coarse_sec)); - DEFINE(VDSO_XTIME_CRS_NSEC, offsetof(struct vdso_data, xtime_coarse_nsec)); - DEFINE(VDSO_WTM_CLK_SEC, offsetof(struct vdso_data, wtm_clock_sec)); - DEFINE(VDSO_WTM_CLK_NSEC, offsetof(struct vdso_data, wtm_clock_nsec)); - DEFINE(VDSO_TB_SEQ_COUNT, offsetof(struct vdso_data, tb_seq_count)); - DEFINE(VDSO_CS_MONO_MULT, offsetof(struct vdso_data, cs_mono_mult)); - DEFINE(VDSO_CS_RAW_MULT, offsetof(struct vdso_data, cs_raw_mult)); - DEFINE(VDSO_CS_SHIFT, offsetof(struct vdso_data, cs_shift)); - DEFINE(VDSO_TZ_MINWEST, offsetof(struct vdso_data, tz_minuteswest)); - DEFINE(VDSO_TZ_DSTTIME, offsetof(struct vdso_data, tz_dsttime)); - DEFINE(VDSO_USE_SYSCALL, offsetof(struct vdso_data, use_syscall)); - BLANK(); - DEFINE(TVAL_TV_SEC, offsetof(struct timeval, tv_sec)); - DEFINE(TVAL_TV_USEC, offsetof(struct timeval, tv_usec)); - DEFINE(TSPEC_TV_SEC, offsetof(struct timespec, tv_sec)); - DEFINE(TSPEC_TV_NSEC, offsetof(struct timespec, tv_nsec)); - BLANK(); - DEFINE(TZ_MINWEST, offsetof(struct timezone, tz_minuteswest)); - DEFINE(TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); + DEFINE(S_STACKFRAME_TYPE, offsetof(struct pt_regs, stackframe.type)); + DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); + BLANK(); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS + DEFINE(FREGS_X0, offsetof(struct __arch_ftrace_regs, regs[0])); + DEFINE(FREGS_X2, offsetof(struct __arch_ftrace_regs, regs[2])); + DEFINE(FREGS_X4, offsetof(struct __arch_ftrace_regs, regs[4])); + DEFINE(FREGS_X6, offsetof(struct __arch_ftrace_regs, regs[6])); + DEFINE(FREGS_X8, offsetof(struct __arch_ftrace_regs, regs[8])); + DEFINE(FREGS_FP, offsetof(struct __arch_ftrace_regs, fp)); + DEFINE(FREGS_LR, offsetof(struct __arch_ftrace_regs, lr)); + DEFINE(FREGS_SP, offsetof(struct __arch_ftrace_regs, sp)); + DEFINE(FREGS_PC, offsetof(struct __arch_ftrace_regs, pc)); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + 
DEFINE(FREGS_DIRECT_TRAMP, offsetof(struct __arch_ftrace_regs, direct_tramp)); +#endif + DEFINE(FREGS_SIZE, sizeof(struct __arch_ftrace_regs)); BLANK(); - DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); +#endif DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); -#ifdef CONFIG_KVM_ARM_HOST + DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val)); + DEFINE(FTR_OVR_MASK_OFFSET, offsetof(struct arm64_ftr_override, mask)); + BLANK(); +#ifdef CONFIG_KVM DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); - DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags)); - DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); - DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_regs, regs)); - DEFINE(CPU_FP_REGS, offsetof(struct kvm_regs, fp_regs)); - DEFINE(VCPU_FPEXC32_EL2, offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2])); - DEFINE(VCPU_HOST_CONTEXT, offsetof(struct kvm_vcpu, arch.host_cpu_context)); + DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2)); + DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_cpu_context, regs)); + DEFINE(CPU_ELR_EL2, offsetof(struct kvm_cpu_context, sys_regs[ELR_EL2])); + DEFINE(CPU_RGSR_EL1, offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1])); + DEFINE(CPU_GCR_EL1, offsetof(struct kvm_cpu_context, sys_regs[GCR_EL1])); + DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1])); + DEFINE(CPU_APIBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIBKEYLO_EL1])); + DEFINE(CPU_APDAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDAKEYLO_EL1])); + DEFINE(CPU_APDBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDBKEYLO_EL1])); + DEFINE(CPU_APGAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1])); DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu)); + DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt)); + DEFINE(NVHE_INIT_MAIR_EL2, offsetof(struct kvm_nvhe_init_params, mair_el2)); + DEFINE(NVHE_INIT_TCR_EL2, offsetof(struct kvm_nvhe_init_params, tcr_el2)); + DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2)); + DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, stack_hyp_va)); + DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa)); + DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2)); + DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr)); + DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr)); + DEFINE(NVHE_INIT_TMP, offsetof(struct kvm_nvhe_init_params, tmp)); #endif #ifdef CONFIG_CPU_PM - DEFINE(CPU_SUSPEND_SZ, sizeof(struct cpu_suspend_ctx)); DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); DEFINE(MPIDR_HASH_MASK, offsetof(struct mpidr_hash, mask)); DEFINE(MPIDR_HASH_SHIFTS, offsetof(struct mpidr_hash, shift_aff)); @@ -159,6 +137,15 @@ int main(void) DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); + DEFINE(ARM_SMCCC_1_2_REGS_X0_OFFS, offsetof(struct arm_smccc_1_2_regs, a0)); + DEFINE(ARM_SMCCC_1_2_REGS_X2_OFFS, offsetof(struct arm_smccc_1_2_regs, a2)); + DEFINE(ARM_SMCCC_1_2_REGS_X4_OFFS, offsetof(struct arm_smccc_1_2_regs, a4)); + DEFINE(ARM_SMCCC_1_2_REGS_X6_OFFS, offsetof(struct arm_smccc_1_2_regs, a6)); 
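As a reminder of what this file is for: asm-offsets.c is compiled only so its DEFINE() output can be scraped into generated header constants that assembly code uses. A small standalone sketch follows; demo_regs is a made-up structure (the real layouts such as pt_regs and arm_smccc_1_2_regs come from kernel headers).

#include <stdio.h>
#include <stddef.h>

/* Stand-in structure; illustrative only. */
struct demo_regs {
	unsigned long regs[31];
	unsigned long sp;
	unsigned long pc;
};

int main(void)
{
	/* asm-offsets emits constants like these so assembly can use
	 * offsets such as "ldr x0, [sp, #S_PC]" without knowing the
	 * C structure layout directly. */
	printf("#define S_SP %zu\n", offsetof(struct demo_regs, sp));
	printf("#define S_PC %zu\n", offsetof(struct demo_regs, pc));
	return 0;
}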
+ DEFINE(ARM_SMCCC_1_2_REGS_X8_OFFS, offsetof(struct arm_smccc_1_2_regs, a8)); + DEFINE(ARM_SMCCC_1_2_REGS_X10_OFFS, offsetof(struct arm_smccc_1_2_regs, a10)); + DEFINE(ARM_SMCCC_1_2_REGS_X12_OFFS, offsetof(struct arm_smccc_1_2_regs, a12)); + DEFINE(ARM_SMCCC_1_2_REGS_X14_OFFS, offsetof(struct arm_smccc_1_2_regs, a14)); + DEFINE(ARM_SMCCC_1_2_REGS_X16_OFFS, offsetof(struct arm_smccc_1_2_regs, a16)); BLANK(); DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); @@ -172,5 +159,31 @@ int main(void) DEFINE(SDEI_EVENT_INTREGS, offsetof(struct sdei_registered_event, interrupted_regs)); DEFINE(SDEI_EVENT_PRIORITY, offsetof(struct sdei_registered_event, priority)); #endif +#ifdef CONFIG_ARM64_PTR_AUTH + DEFINE(PTRAUTH_USER_KEY_APIA, offsetof(struct ptrauth_keys_user, apia)); +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL + DEFINE(PTRAUTH_KERNEL_KEY_APIA, offsetof(struct ptrauth_keys_kernel, apia)); +#endif + BLANK(); +#endif +#ifdef CONFIG_KEXEC_CORE + DEFINE(KIMAGE_ARCH_DTB_MEM, offsetof(struct kimage, arch.dtb_mem)); + DEFINE(KIMAGE_ARCH_EL2_VECTORS, offsetof(struct kimage, arch.el2_vectors)); + DEFINE(KIMAGE_ARCH_ZERO_PAGE, offsetof(struct kimage, arch.zero_page)); + DEFINE(KIMAGE_ARCH_PHYS_OFFSET, offsetof(struct kimage, arch.phys_offset)); + DEFINE(KIMAGE_ARCH_TTBR1, offsetof(struct kimage, arch.ttbr1)); + DEFINE(KIMAGE_HEAD, offsetof(struct kimage, head)); + DEFINE(KIMAGE_START, offsetof(struct kimage, start)); + BLANK(); +#endif +#ifdef CONFIG_FUNCTION_TRACER + DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func)); +#endif + BLANK(); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); +#endif + DEFINE(PIE_E0_ASM, PIE_E0); + DEFINE(PIE_E1_ASM, PIE_E1); return 0; } diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index 0bf0a835122f..309942b06c5b 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * ARM64 cacheinfo support * * Copyright (C) 2015 ARM Ltd. * All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed "as is" WITHOUT ANY WARRANTY of any - * kind, whether express or implied; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/acpi.h> @@ -22,11 +11,15 @@ #include <linux/of.h> #define MAX_CACHE_LEVEL 7 /* Max 7 level supported */ -/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ -#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) -#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) -#define CLIDR_CTYPE(clidr, level) \ - (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +int cache_line_size(void) +{ + if (coherency_max_size != 0) + return coherency_max_size; + + return cache_line_size_of_cpu(); +} +EXPORT_SYMBOL_GPL(cache_line_size); static inline enum cache_type get_cache_type(int level) { @@ -45,10 +38,9 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->type = type; } -static int __init_cache_level(unsigned int cpu) +static void detect_cache_level(unsigned int *level_p, unsigned int *leaves_p) { - unsigned int ctype, level, leaves, fw_level; - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + unsigned int ctype, level, leaves; for (level = 1, leaves = 0; level <= MAX_CACHE_LEVEL; level++) { ctype = get_cache_type(level); @@ -60,10 +52,34 @@ static int __init_cache_level(unsigned int cpu) leaves += (ctype == CACHE_TYPE_SEPARATE) ? 2 : 1; } - if (acpi_disabled) + *level_p = level; + *leaves_p = leaves; +} + +int early_cache_level(unsigned int cpu) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + + detect_cache_level(&this_cpu_ci->num_levels, &this_cpu_ci->num_leaves); + + return 0; +} + +int init_cache_level(unsigned int cpu) +{ + unsigned int level, leaves; + int fw_level, ret; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + + detect_cache_level(&level, &leaves); + + if (acpi_disabled) { fw_level = of_find_last_cache_level(cpu); - else - fw_level = acpi_find_last_cache_level(cpu); + } else { + ret = acpi_get_cache_info(cpu, &fw_level, NULL); + if (ret < 0) + fw_level = 0; + } if (level < fw_level) { /* @@ -80,25 +96,24 @@ static int __init_cache_level(unsigned int cpu) return 0; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { unsigned int level, idx; enum cache_type type; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct cacheinfo *infos = this_cpu_ci->info_list; for (idx = 0, level = 1; level <= this_cpu_ci->num_levels && - idx < this_cpu_ci->num_leaves; idx++, level++) { + idx < this_cpu_ci->num_leaves; level++) { type = get_cache_type(level); if (type == CACHE_TYPE_SEPARATE) { - ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); - ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + if (idx + 1 >= this_cpu_ci->num_leaves) + break; + ci_leaf_init(&infos[idx++], CACHE_TYPE_DATA, level); + ci_leaf_init(&infos[idx++], CACHE_TYPE_INST, level); } else { - ci_leaf_init(this_leaf++, type, level); + ci_leaf_init(&infos[idx++], type, level); } } return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c new file mode 100644 index 000000000000..b68e1d328d4c --- /dev/null +++ b/arch/arm64/kernel/compat_alignment.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0-only +// based on arch/arm/mm/alignment.c + +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/perf_event.h> +#include <linux/uaccess.h> + +#include <asm/exception.h> +#include 
<asm/ptrace.h> +#include <asm/traps.h> + +/* + * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998 + * + * Speed optimisations and better fault handling by Russell King. + */ +#define CODING_BITS(i) (i & 0x0e000000) + +#define LDST_P_BIT(i) (i & (1 << 24)) /* Preindex */ +#define LDST_U_BIT(i) (i & (1 << 23)) /* Add offset */ +#define LDST_W_BIT(i) (i & (1 << 21)) /* Writeback */ +#define LDST_L_BIT(i) (i & (1 << 20)) /* Load */ + +#define LDST_P_EQ_U(i) ((((i) ^ ((i) >> 1)) & (1 << 23)) == 0) + +#define LDSTHD_I_BIT(i) (i & (1 << 22)) /* double/half-word immed */ + +#define RN_BITS(i) ((i >> 16) & 15) /* Rn */ +#define RD_BITS(i) ((i >> 12) & 15) /* Rd */ +#define RM_BITS(i) (i & 15) /* Rm */ + +#define REGMASK_BITS(i) (i & 0xffff) + +#define BAD_INSTR 0xdeadc0de + +/* Thumb-2 32 bit format per ARMv7 DDI0406A A6.3, either f800h,e800h,f800h */ +#define IS_T32(hi16) \ + (((hi16) & 0xe000) == 0xe000 && ((hi16) & 0x1800)) + +union offset_union { + unsigned long un; + signed long sn; +}; + +#define TYPE_ERROR 0 +#define TYPE_FAULT 1 +#define TYPE_LDST 2 +#define TYPE_DONE 3 + +static void +do_alignment_finish_ldst(unsigned long addr, u32 instr, struct pt_regs *regs, + union offset_union offset) +{ + if (!LDST_U_BIT(instr)) + offset.un = -offset.un; + + if (!LDST_P_BIT(instr)) + addr += offset.un; + + if (!LDST_P_BIT(instr) || LDST_W_BIT(instr)) + regs->regs[RN_BITS(instr)] = addr; +} + +static int +do_alignment_ldrdstrd(unsigned long addr, u32 instr, struct pt_regs *regs) +{ + unsigned int rd = RD_BITS(instr); + unsigned int rd2; + int load; + + if ((instr & 0xfe000000) == 0xe8000000) { + /* ARMv7 Thumb-2 32-bit LDRD/STRD */ + rd2 = (instr >> 8) & 0xf; + load = !!(LDST_L_BIT(instr)); + } else if (((rd & 1) == 1) || (rd == 14)) { + return TYPE_ERROR; + } else { + load = ((instr & 0xf0) == 0xd0); + rd2 = rd + 1; + } + + if (load) { + unsigned int val, val2; + + if (get_user(val, (u32 __user *)addr) || + get_user(val2, (u32 __user *)(addr + 4))) + return TYPE_FAULT; + regs->regs[rd] = val; + regs->regs[rd2] = val2; + } else { + if (put_user(regs->regs[rd], (u32 __user *)addr) || + put_user(regs->regs[rd2], (u32 __user *)(addr + 4))) + return TYPE_FAULT; + } + return TYPE_LDST; +} + +/* + * LDM/STM alignment handler. + * + * There are 4 variants of this instruction: + * + * B = rn pointer before instruction, A = rn pointer after instruction + * ------ increasing address -----> + * | | r0 | r1 | ... | rx | | + * PU = 01 B A + * PU = 11 B A + * PU = 00 A B + * PU = 10 A B + */ +static int +do_alignment_ldmstm(unsigned long addr, u32 instr, struct pt_regs *regs) +{ + unsigned int rd, rn, nr_regs, regbits; + unsigned long eaddr, newaddr; + unsigned int val; + + /* count the number of registers in the mask to be transferred */ + nr_regs = hweight16(REGMASK_BITS(instr)) * 4; + + rn = RN_BITS(instr); + newaddr = eaddr = regs->regs[rn]; + + if (!LDST_U_BIT(instr)) + nr_regs = -nr_regs; + newaddr += nr_regs; + if (!LDST_U_BIT(instr)) + eaddr = newaddr; + + if (LDST_P_EQ_U(instr)) /* U = P */ + eaddr += 4; + + for (regbits = REGMASK_BITS(instr), rd = 0; regbits; + regbits >>= 1, rd += 1) + if (regbits & 1) { + if (LDST_L_BIT(instr)) { + if (get_user(val, (u32 __user *)eaddr)) + return TYPE_FAULT; + if (rd < 15) + regs->regs[rd] = val; + else + regs->pc = val; + } else { + /* + * The PC register has a bias of +8 in ARM mode + * and +4 in Thumb mode. This means that a read + * of the value of PC should account for this. 
+ * Since Thumb does not permit STM instructions + * to refer to PC, just add 8 here. + */ + val = (rd < 15) ? regs->regs[rd] : regs->pc + 8; + if (put_user(val, (u32 __user *)eaddr)) + return TYPE_FAULT; + } + eaddr += 4; + } + + if (LDST_W_BIT(instr)) + regs->regs[rn] = newaddr; + + return TYPE_DONE; +} + +/* + * Convert Thumb multi-word load/store instruction forms to equivalent ARM + * instructions so we can reuse ARM userland alignment fault fixups for Thumb. + * + * This implementation was initially based on the algorithm found in + * gdb/sim/arm/thumbemu.c. It is basically just a code reduction of same + * to convert only Thumb ld/st instruction forms to equivalent ARM forms. + * + * NOTES: + * 1. Comments below refer to ARM ARM DDI0100E Thumb Instruction sections. + * 2. If for some reason we're passed an non-ld/st Thumb instruction to + * decode, we return 0xdeadc0de. This should never happen under normal + * circumstances but if it does, we've got other problems to deal with + * elsewhere and we obviously can't fix those problems here. + */ + +static unsigned long thumb2arm(u16 tinstr) +{ + u32 L = (tinstr & (1<<11)) >> 11; + + switch ((tinstr & 0xf800) >> 11) { + /* 6.6.1 Format 1: */ + case 0xc000 >> 11: /* 7.1.51 STMIA */ + case 0xc800 >> 11: /* 7.1.25 LDMIA */ + { + u32 Rn = (tinstr & (7<<8)) >> 8; + u32 W = ((L<<Rn) & (tinstr&255)) ? 0 : 1<<21; + + return 0xe8800000 | W | (L<<20) | (Rn<<16) | + (tinstr&255); + } + + /* 6.6.1 Format 2: */ + case 0xb000 >> 11: /* 7.1.48 PUSH */ + case 0xb800 >> 11: /* 7.1.47 POP */ + if ((tinstr & (3 << 9)) == 0x0400) { + static const u32 subset[4] = { + 0xe92d0000, /* STMDB sp!,{registers} */ + 0xe92d4000, /* STMDB sp!,{registers,lr} */ + 0xe8bd0000, /* LDMIA sp!,{registers} */ + 0xe8bd8000 /* LDMIA sp!,{registers,pc} */ + }; + return subset[(L<<1) | ((tinstr & (1<<8)) >> 8)] | + (tinstr & 255); /* register_list */ + } + fallthrough; /* for illegal instruction case */ + + default: + return BAD_INSTR; + } +} + +/* + * Convert Thumb-2 32 bit LDM, STM, LDRD, STRD to equivalent instruction + * handlable by ARM alignment handler, also find the corresponding handler, + * so that we can reuse ARM userland alignment fault fixups for Thumb. + * + * @pinstr: original Thumb-2 instruction; returns new handlable instruction + * @regs: register context. + * @poffset: return offset from faulted addr for later writeback + * + * NOTES: + * 1. Comments below refer to ARMv7 DDI0406A Thumb Instruction sections. + * 2. 
Register name Rt from ARMv7 is same as Rd from ARMv6 (Rd is Rt) + */ +static void * +do_alignment_t32_to_handler(u32 *pinstr, struct pt_regs *regs, + union offset_union *poffset) +{ + u32 instr = *pinstr; + u16 tinst1 = (instr >> 16) & 0xffff; + u16 tinst2 = instr & 0xffff; + + switch (tinst1 & 0xffe0) { + /* A6.3.5 Load/Store multiple */ + case 0xe880: /* STM/STMIA/STMEA,LDM/LDMIA, PUSH/POP T2 */ + case 0xe8a0: /* ...above writeback version */ + case 0xe900: /* STMDB/STMFD, LDMDB/LDMEA */ + case 0xe920: /* ...above writeback version */ + /* no need offset decision since handler calculates it */ + return do_alignment_ldmstm; + + case 0xf840: /* POP/PUSH T3 (single register) */ + if (RN_BITS(instr) == 13 && (tinst2 & 0x09ff) == 0x0904) { + u32 L = !!(LDST_L_BIT(instr)); + const u32 subset[2] = { + 0xe92d0000, /* STMDB sp!,{registers} */ + 0xe8bd0000, /* LDMIA sp!,{registers} */ + }; + *pinstr = subset[L] | (1<<RD_BITS(instr)); + return do_alignment_ldmstm; + } + /* Else fall through for illegal instruction case */ + break; + + /* A6.3.6 Load/store double, STRD/LDRD(immed, lit, reg) */ + case 0xe860: + case 0xe960: + case 0xe8e0: + case 0xe9e0: + poffset->un = (tinst2 & 0xff) << 2; + fallthrough; + + case 0xe940: + case 0xe9c0: + return do_alignment_ldrdstrd; + + /* + * No need to handle load/store instructions up to word size + * since ARMv6 and later CPUs can perform unaligned accesses. + */ + default: + break; + } + return NULL; +} + +static int alignment_get_arm(struct pt_regs *regs, __le32 __user *ip, u32 *inst) +{ + __le32 instr = 0; + int fault; + + fault = get_user(instr, ip); + if (fault) + return fault; + + *inst = __le32_to_cpu(instr); + return 0; +} + +static int alignment_get_thumb(struct pt_regs *regs, __le16 __user *ip, u16 *inst) +{ + __le16 instr = 0; + int fault; + + fault = get_user(instr, ip); + if (fault) + return fault; + + *inst = __le16_to_cpu(instr); + return 0; +} + +int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) +{ + union offset_union offset; + unsigned long instrptr; + int (*handler)(unsigned long addr, u32 instr, struct pt_regs *regs); + unsigned int type; + u32 instr = 0; + int isize = 4; + int thumb2_32b = 0; + + instrptr = instruction_pointer(regs); + + if (compat_thumb_mode(regs)) { + __le16 __user *ptr = (__le16 __user *)(instrptr & ~1); + u16 tinstr, tinst2; + + if (alignment_get_thumb(regs, ptr, &tinstr)) + return 1; + + if (IS_T32(tinstr)) { /* Thumb-2 32-bit */ + if (alignment_get_thumb(regs, ptr + 1, &tinst2)) + return 1; + instr = ((u32)tinstr << 16) | tinst2; + thumb2_32b = 1; + } else { + isize = 2; + instr = thumb2arm(tinstr); + } + } else { + if (alignment_get_arm(regs, (__le32 __user *)instrptr, &instr)) + return 1; + } + + switch (CODING_BITS(instr)) { + case 0x00000000: /* 3.13.4 load/store instruction extensions */ + if (LDSTHD_I_BIT(instr)) + offset.un = (instr & 0xf00) >> 4 | (instr & 15); + else + offset.un = regs->regs[RM_BITS(instr)]; + + if ((instr & 0x001000f0) == 0x000000d0 || /* LDRD */ + (instr & 0x001000f0) == 0x000000f0) /* STRD */ + handler = do_alignment_ldrdstrd; + else + return 1; + break; + + case 0x08000000: /* ldm or stm, or thumb-2 32bit instruction */ + if (thumb2_32b) { + offset.un = 0; + handler = do_alignment_t32_to_handler(&instr, regs, &offset); + } else { + offset.un = 0; + handler = do_alignment_ldmstm; + } + break; + + default: + return 1; + } + + if (!handler) + return 1; + type = handler(addr, instr, regs); + + if (type == TYPE_ERROR || type == TYPE_FAULT) + return 1; + + if (type == 
TYPE_LDST) + do_alignment_finish_ldst(addr, instr, regs, offset); + + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, regs->pc); + arm64_skip_faulting_instruction(regs, isize); + + return 0; +} diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index a2be30275a73..c87445dde674 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -1,26 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * CPU reset routines * * Copyright (C) 2001 Deep Blue Solutions Ltd. * Copyright (C) 2012 ARM Ltd. * Copyright (C) 2015 Huawei Futurewei Technologies. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/assembler.h> #include <asm/sysreg.h> #include <asm/virt.h> .text -.pushsection .idmap.text, "awx" +.pushsection .idmap.text, "a" /* - * __cpu_soft_restart(el2_switch, entry, arg0, arg1, arg2) - Helper for - * cpu_soft_restart. + * cpu_soft_restart(el2_switch, entry, arg0, arg1, arg2) * * @el2_switch: Flag to indicate a switch to EL2 is needed. * @entry: Location to jump to for soft reset. @@ -32,12 +29,13 @@ * branch to what would be the reset vector. It must be executed with the * flat identity mapping. */ -ENTRY(__cpu_soft_restart) - /* Clear sctlr_el1 flags. */ - mrs x12, sctlr_el1 - ldr x13, =SCTLR_ELx_FLAGS - bic x12, x12, x13 +SYM_TYPED_FUNC_START(cpu_soft_restart) + mov_q x12, INIT_SCTLR_EL1_MMU_OFF pre_disable_mmu_workaround + /* + * either disable EL1&0 translation regime or disable EL2&0 translation + * regime if HCR_EL2.E2H == 1 + */ msr sctlr_el1, x12 isb @@ -45,11 +43,11 @@ ENTRY(__cpu_soft_restart) mov x0, #HVC_SOFT_RESTART hvc #0 // no return -1: mov x18, x1 // entry +1: mov x8, x1 // entry mov x0, x2 // arg0 mov x1, x3 // arg1 mov x2, x4 // arg2 - br x18 -ENDPROC(__cpu_soft_restart) + br x8 +SYM_FUNC_END(cpu_soft_restart) .popsection diff --git a/arch/arm64/kernel/cpu-reset.h b/arch/arm64/kernel/cpu-reset.h deleted file mode 100644 index fad90e4935fb..000000000000 --- a/arch/arm64/kernel/cpu-reset.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * CPU reset routines - * - * Copyright (C) 2015 Huawei Futurewei Technologies. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _ARM64_CPU_RESET_H -#define _ARM64_CPU_RESET_H - -#include <asm/virt.h> - -void __cpu_soft_restart(unsigned long el2_switch, unsigned long entry, - unsigned long arg0, unsigned long arg1, unsigned long arg2); - -static inline void __noreturn cpu_soft_restart(unsigned long entry, - unsigned long arg0, - unsigned long arg1, - unsigned long arg2) -{ - typeof(__cpu_soft_restart) *restart; - - unsigned long el2_switch = !is_kernel_in_hyp_mode() && - is_hyp_mode_available(); - restart = (void *)__pa_symbol(__cpu_soft_restart); - - cpu_install_idmap(); - restart(el2_switch, entry, arg0, arg1, arg2); - unreachable(); -} - -#endif diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 9950bb0cbd52..8cb3b575a031 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -1,53 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Contains CPU specific errata definitions * * Copyright (C) 2014 ARM Ltd. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/arm-smccc.h> -#include <linux/psci.h> #include <linux/types.h> +#include <linux/cpu.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/cpufeature.h> +#include <asm/kvm_asm.h> +#include <asm/smp_plat.h> + +static u64 target_impl_cpu_num; +static struct target_impl_cpu *target_impl_cpus; + +bool cpu_errata_set_target_impl(u64 num, void *impl_cpus) +{ + if (target_impl_cpu_num || !num || !impl_cpus) + return false; + + target_impl_cpu_num = num; + target_impl_cpus = impl_cpus; + return true; +} + +static inline bool is_midr_in_range(struct midr_range const *range) +{ + int i; + + if (!target_impl_cpu_num) + return midr_is_cpu_model_range(read_cpuid_id(), range->model, + range->rv_min, range->rv_max); + + for (i = 0; i < target_impl_cpu_num; i++) { + if (midr_is_cpu_model_range(target_impl_cpus[i].midr, + range->model, + range->rv_min, range->rv_max)) + return true; + } + return false; +} + +bool is_midr_in_range_list(struct midr_range const *ranges) +{ + while (ranges->model) + if (is_midr_in_range(ranges++)) + return true; + return false; +} +EXPORT_SYMBOL_GPL(is_midr_in_range_list); static bool __maybe_unused -is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) +__is_affected_midr_range(const struct arm64_cpu_capabilities *entry, + u32 midr, u32 revidr) { const struct arm64_midr_revidr *fix; - u32 midr = read_cpuid_id(), revidr; - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - if (!is_midr_in_range(midr, &entry->midr_range)) + if (!is_midr_in_range(&entry->midr_range)) return false; midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK; - revidr = read_cpuid(REVIDR_EL1); for (fix = entry->fixed_revs; fix && fix->revidr_mask; fix++) if (midr == fix->midr_rv && (revidr & fix->revidr_mask)) return false; - return true; } static bool __maybe_unused +is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) +{ + int i; + + if (!target_impl_cpu_num) { + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return __is_affected_midr_range(entry, read_cpuid_id(), + read_cpuid(REVIDR_EL1)); + } + + for (i = 0; i < target_impl_cpu_num; i++) { + if (__is_affected_midr_range(entry, target_impl_cpus[i].midr, + target_impl_cpus[i].midr)) + return true; + } + return false; +} + +static bool __maybe_unused is_affected_midr_range_list(const struct arm64_cpu_capabilities *entry, int scope) { WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - return is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list); + return is_midr_in_range_list(entry->midr_range_list); } static bool __maybe_unused @@ -97,372 +142,32 @@ has_mismatched_cache_type(const struct arm64_cpu_capabilities *entry, } static void -cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *__unused) +cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap) { u64 mask = arm64_ftr_reg_ctrel0.strict_mask; + bool enable_uct_trap = false; /* Trap CTR_EL0 access on 
this CPU, only if it has a mismatch */ if ((read_cpuid_cachetype() & mask) != (arm64_ftr_reg_ctrel0.sys_val & mask)) - sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); -} - -atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1); - -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR -#include <asm/mmu_context.h> -#include <asm/cacheflush.h> - -DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); - -#ifdef CONFIG_KVM_INDIRECT_VECTORS -extern char __smccc_workaround_1_smc_start[]; -extern char __smccc_workaround_1_smc_end[]; - -static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - void *dst = lm_alias(__bp_harden_hyp_vecs_start + slot * SZ_2K); - int i; - - for (i = 0; i < SZ_2K; i += 0x80) - memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start); - - __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); -} - -static void __install_bp_hardening_cb(bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - static DEFINE_RAW_SPINLOCK(bp_lock); - int cpu, slot = -1; - - /* - * enable_smccc_arch_workaround_1() passes NULL for the hyp_vecs - * start/end if we're a guest. Skip the hyp-vectors work. - */ - if (!hyp_vecs_start) { - __this_cpu_write(bp_hardening_data.fn, fn); - return; - } - - raw_spin_lock(&bp_lock); - for_each_possible_cpu(cpu) { - if (per_cpu(bp_hardening_data.fn, cpu) == fn) { - slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); - break; - } - } - - if (slot == -1) { - slot = atomic_inc_return(&arm64_el2_vector_last_slot); - BUG_ON(slot >= BP_HARDEN_EL2_SLOTS); - __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); - } - - __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); - __this_cpu_write(bp_hardening_data.fn, fn); - raw_spin_unlock(&bp_lock); -} -#else -#define __smccc_workaround_1_smc_start NULL -#define __smccc_workaround_1_smc_end NULL - -static void __install_bp_hardening_cb(bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - __this_cpu_write(bp_hardening_data.fn, fn); -} -#endif /* CONFIG_KVM_INDIRECT_VECTORS */ - -static void install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry, - bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - u64 pfr0; - - if (!entry->matches(entry, SCOPE_LOCAL_CPU)) - return; - - pfr0 = read_cpuid(ID_AA64PFR0_EL1); - if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT)) - return; - - __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end); -} - -#include <uapi/linux/psci.h> -#include <linux/arm-smccc.h> -#include <linux/psci.h> - -static void call_smc_arch_workaround_1(void) -{ - arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); -} - -static void call_hvc_arch_workaround_1(void) -{ - arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); -} - -static void qcom_link_stack_sanitization(void) -{ - u64 tmp; - - asm volatile("mov %0, x30 \n" - ".rept 16 \n" - "bl . 
+ 4 \n" - ".endr \n" - "mov x30, %0 \n" - : "=&r" (tmp)); -} - -static void -enable_smccc_arch_workaround_1(const struct arm64_cpu_capabilities *entry) -{ - bp_hardening_cb_t cb; - void *smccc_start, *smccc_end; - struct arm_smccc_res res; - u32 midr = read_cpuid_id(); - - if (!entry->matches(entry, SCOPE_LOCAL_CPU)) - return; - - if (psci_ops.smccc_version == SMCCC_VERSION_1_0) - return; - - switch (psci_ops.conduit) { - case PSCI_CONDUIT_HVC: - arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, - ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if ((int)res.a0 < 0) - return; - cb = call_hvc_arch_workaround_1; - /* This is a guest, no need to patch KVM vectors */ - smccc_start = NULL; - smccc_end = NULL; - break; - - case PSCI_CONDUIT_SMC: - arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, - ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if ((int)res.a0 < 0) - return; - cb = call_smc_arch_workaround_1; - smccc_start = __smccc_workaround_1_smc_start; - smccc_end = __smccc_workaround_1_smc_end; - break; - - default: - return; - } - - if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) || - ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)) - cb = qcom_link_stack_sanitization; - - install_bp_hardening_cb(entry, cb, smccc_start, smccc_end); - - return; -} -#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ - -#ifdef CONFIG_ARM64_SSBD -DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); - -int ssbd_state __read_mostly = ARM64_SSBD_KERNEL; - -static const struct ssbd_options { - const char *str; - int state; -} ssbd_options[] = { - { "force-on", ARM64_SSBD_FORCE_ENABLE, }, - { "force-off", ARM64_SSBD_FORCE_DISABLE, }, - { "kernel", ARM64_SSBD_KERNEL, }, -}; - -static int __init ssbd_cfg(char *buf) -{ - int i; - - if (!buf || !buf[0]) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(ssbd_options); i++) { - int len = strlen(ssbd_options[i].str); - - if (strncmp(buf, ssbd_options[i].str, len)) - continue; - - ssbd_state = ssbd_options[i].state; - return 0; - } - - return -EINVAL; -} -early_param("ssbd", ssbd_cfg); + enable_uct_trap = true; -void __init arm64_update_smccc_conduit(struct alt_instr *alt, - __le32 *origptr, __le32 *updptr, - int nr_inst) -{ - u32 insn; - - BUG_ON(nr_inst != 1); - - switch (psci_ops.conduit) { - case PSCI_CONDUIT_HVC: - insn = aarch64_insn_get_hvc_value(); - break; - case PSCI_CONDUIT_SMC: - insn = aarch64_insn_get_smc_value(); - break; - default: - return; - } + /* ... or if the system is affected by an erratum */ + if (cap->capability == ARM64_WORKAROUND_1542419) + enable_uct_trap = true; - *updptr = cpu_to_le32(insn); -} - -void __init arm64_enable_wa2_handling(struct alt_instr *alt, - __le32 *origptr, __le32 *updptr, - int nr_inst) -{ - BUG_ON(nr_inst != 1); - /* - * Only allow mitigation on EL1 entry/exit and guest - * ARCH_WORKAROUND_2 handling if the SSBD state allows it to - * be flipped. 
- */ - if (arm64_get_ssbd_state() == ARM64_SSBD_KERNEL) - *updptr = cpu_to_le32(aarch64_insn_gen_nop()); + if (enable_uct_trap) + sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } -void arm64_set_ssbd_mitigation(bool state) +#ifdef CONFIG_ARM64_ERRATUM_1463225 +static bool +has_cortex_a76_erratum_1463225(const struct arm64_cpu_capabilities *entry, + int scope) { - if (this_cpu_has_cap(ARM64_SSBS)) { - if (state) - asm volatile(SET_PSTATE_SSBS(0)); - else - asm volatile(SET_PSTATE_SSBS(1)); - return; - } - - switch (psci_ops.conduit) { - case PSCI_CONDUIT_HVC: - arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_2, state, NULL); - break; - - case PSCI_CONDUIT_SMC: - arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, state, NULL); - break; - - default: - WARN_ON_ONCE(1); - break; - } + return is_affected_midr_range_list(entry, scope) && is_kernel_in_hyp_mode(); } - -static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, - int scope) -{ - struct arm_smccc_res res; - bool required = true; - s32 val; - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - - if (this_cpu_has_cap(ARM64_SSBS)) { - required = false; - goto out_printmsg; - } - - if (psci_ops.smccc_version == SMCCC_VERSION_1_0) { - ssbd_state = ARM64_SSBD_UNKNOWN; - return false; - } - - switch (psci_ops.conduit) { - case PSCI_CONDUIT_HVC: - arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, - ARM_SMCCC_ARCH_WORKAROUND_2, &res); - break; - - case PSCI_CONDUIT_SMC: - arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, - ARM_SMCCC_ARCH_WORKAROUND_2, &res); - break; - - default: - ssbd_state = ARM64_SSBD_UNKNOWN; - return false; - } - - val = (s32)res.a0; - - switch (val) { - case SMCCC_RET_NOT_SUPPORTED: - ssbd_state = ARM64_SSBD_UNKNOWN; - return false; - - case SMCCC_RET_NOT_REQUIRED: - pr_info_once("%s mitigation not required\n", entry->desc); - ssbd_state = ARM64_SSBD_MITIGATED; - return false; - - case SMCCC_RET_SUCCESS: - required = true; - break; - - case 1: /* Mitigation not required on this CPU */ - required = false; - break; - - default: - WARN_ON(1); - return false; - } - - switch (ssbd_state) { - case ARM64_SSBD_FORCE_DISABLE: - arm64_set_ssbd_mitigation(false); - required = false; - break; - - case ARM64_SSBD_KERNEL: - if (required) { - __this_cpu_write(arm64_ssbd_callback_required, 1); - arm64_set_ssbd_mitigation(true); - } - break; - - case ARM64_SSBD_FORCE_ENABLE: - arm64_set_ssbd_mitigation(true); - required = true; - break; - - default: - WARN_ON(1); - break; - } - -out_printmsg: - switch (ssbd_state) { - case ARM64_SSBD_FORCE_DISABLE: - pr_info_once("%s disabled from command-line\n", entry->desc); - break; - - case ARM64_SSBD_FORCE_ENABLE: - pr_info_once("%s forced from command-line\n", entry->desc); - break; - } - - return required; -} -#endif /* CONFIG_ARM64_SSBD */ +#endif static void __maybe_unused cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) @@ -507,53 +212,130 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \ CAP_MIDR_RANGE_LIST(midr_list) -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR - -/* - * List of CPUs where we need to issue a psci call to - * harden the branch predictor. 
- */ -static const struct midr_range arm64_bp_harden_smccc_cpus[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), +static const __maybe_unused struct midr_range tx2_family_cpus[] = { MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), - MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), - MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR), - MIDR_ALL_VERSIONS(MIDR_NVIDIA_DENVER), {}, }; -#endif +static bool __maybe_unused +needs_tx2_tvm_workaround(const struct arm64_cpu_capabilities *entry, + int scope) +{ + int i; -#ifdef CONFIG_HARDEN_EL2_VECTORS + if (!is_affected_midr_range_list(entry, scope) || + !is_hyp_mode_available()) + return false; -static const struct midr_range arm64_harden_el2_vectors[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + for_each_possible_cpu(i) { + if (MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0) != 0) + return true; + } + + return false; +} + +static bool __maybe_unused +has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry, + int scope) +{ + bool has_dic = read_cpuid_cachetype() & BIT(CTR_EL0_DIC_SHIFT); + const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1); + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return is_midr_in_range(&range) && has_dic; +} + +static const struct midr_range impdef_pmuv3_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), {}, }; -#endif +static bool has_impdef_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + unsigned int pmuver; -#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI + if (!is_kernel_in_hyp_mode()) + return false; + + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return false; -static const struct midr_range arm64_repeat_tlbi_cpus[] = { + return is_midr_in_range_list(impdef_pmuv3_cpus); +} + +static void cpu_enable_impdef_pmuv3_traps(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set_s(SYS_HACR_EL2, 0, BIT(56)); +} + +#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI +static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009 - MIDR_RANGE(MIDR_QCOM_FALKOR_V1, 0, 0, 0, 0), + { + ERRATA_MIDR_REV(MIDR_QCOM_FALKOR_V1, 0, 0) + }, + { + .midr_range.model = MIDR_QCOM_KRYO, + .matches = is_kryo_midr, + }, #endif #ifdef CONFIG_ARM64_ERRATUM_1286807 - MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 0), + { + ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 0), + }, + { + /* Kryo4xx Gold (rcpe to rfpe) => (r0p0 to r3p0) */ + ERRATA_MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xe), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2441007 + { + ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2441009 + { + /* Cortex-A510 r0p0 -> r1p1. 
Fixed in r1p2 */ + ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1), + }, #endif {}, }; +#endif +#ifdef CONFIG_CAVIUM_ERRATUM_23154 +static const struct midr_range cavium_erratum_23154_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_THUNDERX), + MIDR_ALL_VERSIONS(MIDR_THUNDERX_81XX), + MIDR_ALL_VERSIONS(MIDR_THUNDERX_83XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_98XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_96XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XX), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XXN), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XXMM), + MIDR_ALL_VERSIONS(MIDR_OCTX2_95XXO), + {}, +}; #endif #ifdef CONFIG_CAVIUM_ERRATUM_27456 -const struct midr_range cavium_erratum_27456_cpus[] = { +static const struct midr_range cavium_erratum_27456_cpus[] = { /* Cavium ThunderX, T88 pass 1.x - 2.1 */ MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), /* Cavium ThunderX, T81 pass 1.0 */ @@ -603,10 +385,191 @@ static const struct midr_range workaround_clean_cache[] = { }; #endif +#ifdef CONFIG_ARM64_ERRATUM_1418040 +/* + * - 1188873 affects r0p0 to r2p0 + * - 1418040 affects r0p0 to r3p1 + */ +static const struct midr_range erratum_1418040_list[] = { + /* Cortex-A76 r0p0 to r3p1 */ + MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 1), + /* Neoverse-N1 r0p0 to r3p1 */ + MIDR_RANGE(MIDR_NEOVERSE_N1, 0, 0, 3, 1), + /* Kryo4xx Gold (rcpe to rfpf) => (r0p0 to r3p1) */ + MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xf), + {}, +}; +#endif + +#ifdef CONFIG_ARM64_ERRATUM_845719 +static const struct midr_range erratum_845719_list[] = { + /* Cortex-A53 r0p[01234] */ + MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 4), + /* Brahma-B53 r0p[0] */ + MIDR_REV(MIDR_BRAHMA_B53, 0, 0), + /* Kryo2XX Silver rAp4 */ + MIDR_REV(MIDR_QCOM_KRYO_2XX_SILVER, 0xa, 0x4), + {}, +}; +#endif + +#ifdef CONFIG_ARM64_ERRATUM_843419 +static const struct arm64_cpu_capabilities erratum_843419_list[] = { + { + /* Cortex-A53 r0p[01234] */ + .matches = is_affected_midr_range, + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 4), + MIDR_FIXED(0x4, BIT(8)), + }, + { + /* Brahma-B53 r0p[0] */ + .matches = is_affected_midr_range, + ERRATA_MIDR_REV(MIDR_BRAHMA_B53, 0, 0), + }, + {}, +}; +#endif + +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT +static const struct midr_range erratum_speculative_at_list[] = { +#ifdef CONFIG_ARM64_ERRATUM_1165522 + /* Cortex A76 r0p0 to r2p0 */ + MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), +#endif +#ifdef CONFIG_ARM64_ERRATUM_1319367 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), +#endif +#ifdef CONFIG_ARM64_ERRATUM_1530923 + /* Cortex A55 r0p0 to r2p0 */ + MIDR_RANGE(MIDR_CORTEX_A55, 0, 0, 2, 0), + /* Kryo4xx Silver (rdpe => r1p0) */ + MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe), +#endif + {}, +}; +#endif + +#ifdef CONFIG_ARM64_ERRATUM_1463225 +static const struct midr_range erratum_1463225[] = { + /* Cortex-A76 r0p0 - r3p1 */ + MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 1), + /* Kryo4xx Gold (rcpe to rfpf) => (r0p0 to r3p1) */ + MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xf), + {}, +}; +#endif + +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE +static const struct midr_range trbe_overwrite_fill_mode_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2139208 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2119858 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_RANGE(MIDR_CORTEX_X2, 0, 0, 2, 0), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE */ + +#ifdef CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE +static const struct midr_range tsb_flush_fail_cpus[] 
= { +#ifdef CONFIG_ARM64_ERRATUM_2067961 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2054223 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE */ + +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE +static struct midr_range trbe_write_out_of_range_cpus[] = { +#ifdef CONFIG_ARM64_ERRATUM_2253138 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2224489 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_RANGE(MIDR_CORTEX_X2, 0, 0, 2, 0), +#endif + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */ + +#ifdef CONFIG_ARM64_ERRATUM_1742098 +static struct midr_range broken_aarch32_aes[] = { + MIDR_RANGE(MIDR_CORTEX_A57, 0, 1, 0xf, 0xf), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + {}, +}; +#endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */ + +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD +static const struct midr_range erratum_spec_unpriv_load_list[] = { +#ifdef CONFIG_ARM64_ERRATUM_3117295 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2966298 + /* Cortex-A520 r0p0 to r0p1 */ + MIDR_REV_RANGE(MIDR_CORTEX_A520, 0, 0, 1), +#endif + {}, +}; +#endif + +#ifdef CONFIG_ARM64_ERRATUM_3194386 +static const struct midr_range erratum_spec_ssbs_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X4), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X925), + MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3AE), + {} +}; +#endif + +#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 +static const struct midr_range erratum_ac03_cpu_38_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + +#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23 +static const struct midr_range erratum_ac04_cpu_23_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { - .desc = "ARM errata 826319, 827319, 824069, 819472", + .desc = "ARM errata 826319, 827319, 824069, or 819472", .capability = ARM64_WORKAROUND_CLEAN_CACHE, ERRATA_MIDR_RANGE_LIST(workaround_clean_cache), .cpu_enable = cpu_enable_cache_maint_trap, @@ -634,27 +597,26 @@ const struct arm64_cpu_capabilities arm64_errata[] = { #endif #ifdef CONFIG_ARM64_ERRATUM_843419 { - /* Cortex-A53 r0p[01234] */ .desc = "ARM erratum 843419", .capability = ARM64_WORKAROUND_843419, - ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 4), - MIDR_FIXED(0x4, BIT(8)), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = cpucap_multi_entry_cap_matches, + .match_list = erratum_843419_list, }, #endif #ifdef CONFIG_ARM64_ERRATUM_845719 { - /* 
Cortex-A53 r0p[01234] */ .desc = "ARM erratum 845719", .capability = ARM64_WORKAROUND_845719, - ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 4), + ERRATA_MIDR_RANGE_LIST(erratum_845719_list), }, #endif #ifdef CONFIG_CAVIUM_ERRATUM_23154 { - /* Cavium ThunderX, pass 1.x */ - .desc = "Cavium erratum 23154", + .desc = "Cavium errata 23154 and 38545", .capability = ARM64_WORKAROUND_CAVIUM_23154, - ERRATA_MIDR_REV_RANGE(MIDR_THUNDERX, 0, 0, 1), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + ERRATA_MIDR_RANGE_LIST(cavium_erratum_23154_cpus), }, #endif #ifdef CONFIG_CAVIUM_ERRATUM_27456 @@ -682,15 +644,18 @@ const struct arm64_cpu_capabilities arm64_errata[] = { { .desc = "Qualcomm Technologies Falkor/Kryo erratum 1003", .capability = ARM64_WORKAROUND_QCOM_FALKOR_E1003, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = cpucap_multi_entry_cap_matches, .match_list = qcom_erratum_1003_list, }, #endif #ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI { - .desc = "Qualcomm erratum 1009, ARM erratum 1286807", + .desc = "Qualcomm erratum 1009, or ARM erratum 1286807, 2441009", .capability = ARM64_WORKAROUND_REPEAT_TLBI, - ERRATA_MIDR_RANGE_LIST(arm64_repeat_tlbi_cpus), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = cpucap_multi_entry_cap_matches, + .match_list = arm64_repeat_tlbi_list, }, #endif #ifdef CONFIG_ARM64_ERRATUM_858921 @@ -701,44 +666,247 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), }, #endif -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR { - .capability = ARM64_HARDEN_BRANCH_PREDICTOR, - .cpu_enable = enable_smccc_arch_workaround_1, - ERRATA_MIDR_RANGE_LIST(arm64_bp_harden_smccc_cpus), + .desc = "Spectre-v2", + .capability = ARM64_SPECTRE_V2, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_spectre_v2, + .cpu_enable = spectre_v2_enable_mitigation, + }, +#ifdef CONFIG_RANDOMIZE_BASE + { + /* Must come after the Spectre-v2 entry */ + .desc = "Spectre-v3a", + .capability = ARM64_SPECTRE_V3A, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_spectre_v3a, + .cpu_enable = spectre_v3a_enable_mitigation, + }, +#endif + { + .desc = "Spectre-v4", + .capability = ARM64_SPECTRE_V4, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_spectre_v4, + .cpu_enable = spectre_v4_enable_mitigation, + }, + { + .desc = "Spectre-BHB", + .capability = ARM64_SPECTRE_BHB, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = is_spectre_bhb_affected, + .cpu_enable = spectre_bhb_enable_mitigation, + }, +#ifdef CONFIG_ARM64_ERRATUM_1418040 + { + .desc = "ARM erratum 1418040", + .capability = ARM64_WORKAROUND_1418040, + ERRATA_MIDR_RANGE_LIST(erratum_1418040_list), + /* + * We need to allow affected CPUs to come in late, but + * also need the non-affected CPUs to be able to come + * in at any point in time. Wonderful. 
+ */ + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT + { + .desc = "ARM errata 1165522, 1319367, or 1530923", + .capability = ARM64_WORKAROUND_SPECULATIVE_AT, + ERRATA_MIDR_RANGE_LIST(erratum_speculative_at_list), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1463225 + { + .desc = "ARM erratum 1463225", + .capability = ARM64_WORKAROUND_1463225, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_cortex_a76_erratum_1463225, + .midr_range_list = erratum_1463225, }, #endif -#ifdef CONFIG_HARDEN_EL2_VECTORS +#ifdef CONFIG_CAVIUM_TX2_ERRATUM_219 + { + .desc = "Cavium ThunderX2 erratum 219 (KVM guest sysreg trapping)", + .capability = ARM64_WORKAROUND_CAVIUM_TX2_219_TVM, + ERRATA_MIDR_RANGE_LIST(tx2_family_cpus), + .matches = needs_tx2_tvm_workaround, + }, { - .desc = "EL2 vector hardening", - .capability = ARM64_HARDEN_EL2_VECTORS, - ERRATA_MIDR_RANGE_LIST(arm64_harden_el2_vectors), + .desc = "Cavium ThunderX2 erratum 219 (PRFM removal)", + .capability = ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM, + ERRATA_MIDR_RANGE_LIST(tx2_family_cpus), }, #endif -#ifdef CONFIG_ARM64_SSBD +#ifdef CONFIG_ARM64_ERRATUM_1542419 { - .desc = "Speculative Store Bypass Disable", - .capability = ARM64_SSBD, + /* we depend on the firmware portion for correctness */ + .desc = "ARM erratum 1542419 (kernel portion)", + .capability = ARM64_WORKAROUND_1542419, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, - .matches = has_ssbd_mitigation, + .matches = has_neoverse_n1_erratum_1542419, + .cpu_enable = cpu_enable_trap_ctr_access, }, #endif -#ifdef CONFIG_ARM64_ERRATUM_1188873 +#ifdef CONFIG_ARM64_ERRATUM_1508412 { - /* Cortex-A76 r0p0 to r2p0 */ - .desc = "ARM erratum 1188873", - .capability = ARM64_WORKAROUND_1188873, - ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), + /* we depend on the firmware portion for correctness */ + .desc = "ARM erratum 1508412 (kernel portion)", + .capability = ARM64_WORKAROUND_1508412, + ERRATA_MIDR_RANGE(MIDR_CORTEX_A77, + 0, 0, + 1, 0), }, #endif -#ifdef CONFIG_ARM64_ERRATUM_1165522 +#ifdef CONFIG_NVIDIA_CARMEL_CNP_ERRATUM + { + /* NVIDIA Carmel */ + .desc = "NVIDIA Carmel CNP erratum", + .capability = ARM64_WORKAROUND_NVIDIA_CARMEL_CNP, + ERRATA_MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE { - /* Cortex-A76 r0p0 to r2p0 */ - .desc = "ARM erratum 1165522", - .capability = ARM64_WORKAROUND_1165522, - ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), + /* + * The erratum work around is handled within the TRBE + * driver and can be applied per-cpu. So, we can allow + * a late CPU to come online with this erratum. 
+ */ + .desc = "ARM erratum 2119858 or 2139208", + .capability = ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + CAP_MIDR_RANGE_LIST(trbe_overwrite_fill_mode_cpus), }, #endif +#ifdef CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE + { + .desc = "ARM erratum 2067961 or 2054223", + .capability = ARM64_WORKAROUND_TSB_FLUSH_FAILURE, + ERRATA_MIDR_RANGE_LIST(tsb_flush_fail_cpus), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE + { + .desc = "ARM erratum 2253138 or 2224489", + .capability = ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2645198 + { + .desc = "ARM erratum 2645198", + .capability = ARM64_WORKAROUND_2645198, + ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A715) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2077057 + { + .desc = "ARM erratum 2077057", + .capability = ARM64_WORKAROUND_2077057, + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2064142 + { + .desc = "ARM erratum 2064142", + .capability = ARM64_WORKAROUND_2064142, + + /* Cortex-A510 r0p0 - r0p2 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2457168 + { + .desc = "ARM erratum 2457168", + .capability = ARM64_WORKAROUND_2457168, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + + /* Cortex-A510 r0p0-r1p1 */ + CAP_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2038923 + { + .desc = "ARM erratum 2038923", + .capability = ARM64_WORKAROUND_2038923, + + /* Cortex-A510 r0p0 - r0p2 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1902691 + { + .desc = "ARM erratum 1902691", + .capability = ARM64_WORKAROUND_1902691, + + /* Cortex-A510 r0p0 - r0p1 */ + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 1) + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_1742098 + { + .desc = "ARM erratum 1742098", + .capability = ARM64_WORKAROUND_1742098, + CAP_MIDR_RANGE_LIST(broken_aarch32_aes), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_2658417 + { + .desc = "ARM erratum 2658417", + .capability = ARM64_WORKAROUND_2658417, + /* Cortex-A510 r0p0 - r1p1 */ + ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1), + MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)), + }, +#endif +#ifdef CONFIG_ARM64_ERRATUM_3194386 + { + .desc = "SSBS not fully self-synchronizing", + .capability = ARM64_WORKAROUND_SPECULATIVE_SSBS, + ERRATA_MIDR_RANGE_LIST(erratum_spec_ssbs_list), + }, +#endif +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD + { + .desc = "ARM errata 2966298, 3117295", + .capability = ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD, + /* Cortex-A520 r0p0 - r0p1 */ + ERRATA_MIDR_RANGE_LIST(erratum_spec_unpriv_load_list), + }, +#endif +#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 + { + .desc = "AmpereOne erratum AC03_CPU_38", + .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38, + ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), + }, +#endif +#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23 + { + .desc = "AmpereOne erratum AC04_CPU_23", + .capability = ARM64_WORKAROUND_AMPERE_AC04_CPU_23, + ERRATA_MIDR_RANGE_LIST(erratum_ac04_cpu_23_list), + }, +#endif + { + .desc = "Broken CNTVOFF_EL2", + .capability = ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF, + ERRATA_MIDR_RANGE_LIST(((const struct midr_range[]) { + MIDR_ALL_VERSIONS(MIDR_QCOM_ORYON_X1), + {} + })), + }, + { + .desc = "Apple IMPDEF PMUv3 Traps", + 
.capability = ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = has_impdef_pmuv3, + .cpu_enable = cpu_enable_impdef_pmuv3_traps, + }, { } }; diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index ea001241bdd4..e133011f64b5 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * CPU kernel entry/exit control * * Copyright (C) 2013 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/acpi.h> @@ -26,10 +15,12 @@ #include <asm/smp_plat.h> extern const struct cpu_operations smp_spin_table_ops; +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL extern const struct cpu_operations acpi_parking_protocol_ops; +#endif extern const struct cpu_operations cpu_psci_ops; -const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; +static const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; static const struct cpu_operations *const dt_supported_cpu_ops[] __initconst = { &smp_spin_table_ops, @@ -85,6 +76,7 @@ static const char *__init cpu_read_enable_method(int cpu) pr_err("%pOF: missing enable-method property\n", dn); } + of_node_put(dn); } else { enable_method = acpi_get_enable_method(cpu); if (!enable_method) { @@ -104,7 +96,7 @@ static const char *__init cpu_read_enable_method(int cpu) /* * Read a cpu's enable method and record it in cpu_ops. */ -int __init cpu_read_ops(int cpu) +int __init init_cpu_ops(int cpu) { const char *enable_method = cpu_read_enable_method(cpu); @@ -119,3 +111,8 @@ int __init cpu_read_ops(int cpu) return 0; } + +const struct cpu_operations *get_cpu_ops(int cpu) +{ + return cpu_ops[cpu]; +} diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index f6d84e2c92fe..c840a93b9ef9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1,19 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Contains CPU feature definitions * * Copyright (C) 2015 ARM Ltd. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * A note for the weary kernel hacker: the code here is confusing and hard to + * follow! That's partly because it's solving a nasty problem, but also because + * there's a little bit of over-abstraction that tends to obscure what's going + * on behind a maze of helper functions and macros. * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * The basic problem is that hardware folks have started gluing together CPUs + * with distinct architectural features; in some cases even creating SoCs where + * user-visible instructions are available only on a subset of the available + * cores. 
We try to address this by snapshotting the feature registers of the + * boot CPU and comparing these with the feature registers of each secondary + * CPU when bringing them up. If there is a mismatch, then we update the + * snapshot state to indicate the lowest-common denominator of the feature, + * known as the "safe" value. This snapshot state can be queried to view the + * "sanitised" value of a feature register. * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. + * The sanitised register values are used to decide which capabilities we + * have in the system. These may be in the form of traditional "hwcaps" + * advertised to userspace or internal "cpucaps" which are used to configure + * things like alternative patching and static keys. While a feature mismatch + * may result in a TAINT_CPU_OUT_OF_SPEC kernel taint, a capability mismatch + * may prevent a CPU from being onlined at all. + * + * Some implementation details worth remembering: + * + * - Mismatched features are *always* sanitised to a "safe" value, which + * usually indicates that the feature is not supported. + * + * - A mismatched feature marked with FTR_STRICT will cause a "SANITY CHECK" + * warning when onlining an offending CPU and the kernel will be tainted + * with TAINT_CPU_OUT_OF_SPEC. + * + * - Features marked as FTR_VISIBLE have their sanitised value visible to + * userspace. FTR_VISIBLE features in registers that are only visible + * to EL0 by trapping *must* have a corresponding HWCAP so that late + * onlining of CPUs cannot lead to features disappearing at runtime. + * + * - A "feature" is typically a 4-bit register field. A "capability" is the + * high-level description derived from the sanitised field value. + * + * - Read the Arm ARM (DDI 0487F.a) section D13.1.3 ("Principles of the ID + * scheme for fields in ID registers") to understand when feature fields + * may be signed or unsigned (FTR_SIGNED and FTR_UNSIGNED accordingly). + * + * - KVM exposes its own view of the feature registers to guest operating + * systems regardless of FTR_VISIBLE. This is typically driven from the + * sanitised register values to allow virtual CPUs to be migrated between + * arbitrary physical CPUs, but some features not present on the host are + * also advertised and emulated. Look at sys_reg_descs[] for the gory + * details. + * + * - If the arm64_ftr_bits[] for a register has a missing field, then this + * field is treated as STRICT RES0, including for read_sanitised_ftr_reg(). + * This is stronger than FTR_HIDDEN and can be used to hide features from + * KVM guests. 
*/ #define pr_fmt(fmt) "CPU features: " fmt @@ -21,75 +65,129 @@ #include <linux/bsearch.h> #include <linux/cpumask.h> #include <linux/crash_dump.h> +#include <linux/kstrtox.h> #include <linux/sort.h> #include <linux/stop_machine.h> +#include <linux/sysfs.h> #include <linux/types.h> +#include <linux/minmax.h> #include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/kasan.h> +#include <linux/percpu.h> +#include <linux/sched/isolation.h> + #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/cpu_ops.h> #include <asm/fpsimd.h> +#include <asm/hwcap.h> +#include <asm/insn.h> +#include <asm/kvm_host.h> +#include <asm/mmu.h> #include <asm/mmu_context.h> +#include <asm/mte.h> +#include <asm/hypervisor.h> #include <asm/processor.h> +#include <asm/smp.h> #include <asm/sysreg.h> #include <asm/traps.h> +#include <asm/vectors.h> #include <asm/virt.h> -unsigned long elf_hwcap __read_mostly; -EXPORT_SYMBOL_GPL(elf_hwcap); +#include <asm/spectre.h> +/* Kernel representation of AT_HWCAP and AT_HWCAP2 */ +static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP_DEFAULT \ (COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\ COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\ - COMPAT_HWCAP_TLS|COMPAT_HWCAP_VFP|\ - COMPAT_HWCAP_VFPv3|COMPAT_HWCAP_VFPv4|\ - COMPAT_HWCAP_NEON|COMPAT_HWCAP_IDIV|\ + COMPAT_HWCAP_TLS|COMPAT_HWCAP_IDIV|\ COMPAT_HWCAP_LPAE) unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; unsigned int compat_elf_hwcap2 __read_mostly; +unsigned int compat_elf_hwcap3 __read_mostly; #endif -DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); -EXPORT_SYMBOL(cpu_hwcaps); -static struct arm64_cpu_capabilities const __ro_after_init *cpu_hwcaps_ptrs[ARM64_NCAPS]; +DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); +EXPORT_SYMBOL(system_cpucaps); +static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NCAPS]; + +DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS); /* - * Flag to indicate if we have computed the system wide - * capabilities based on the boot time active CPUs. This - * will be used to determine if a new booting CPU should - * go through the verification process to make sure that it - * supports the system capabilities, without using a hotplug - * notifier. + * arm64_use_ng_mappings must be placed in the .data section, otherwise it + * ends up in the .bss section where it is initialized in early_map_kernel() + * after the MMU (with the idmap) was enabled. create_init_idmap() - which + * runs before early_map_kernel() and reads the variable via PTE_MAYBE_NG - + * may end up generating an incorrect idmap page table attributes. */ -static bool sys_caps_initialised; +bool arm64_use_ng_mappings __read_mostly = false; +EXPORT_SYMBOL(arm64_use_ng_mappings); -static inline void set_sys_caps_initialised(void) -{ - sys_caps_initialised = true; -} +DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; + +/* + * Permit PER_LINUX32 and execve() of 32-bit binaries even if not all CPUs + * support it? + */ +static bool __read_mostly allow_mismatched_32bit_el0; -static int dump_cpu_hwcaps(struct notifier_block *self, unsigned long v, void *p) +/* + * Static branch enabled only if allow_mismatched_32bit_el0 is set and we have + * seen at least one CPU capable of 32-bit EL0. + */ +DEFINE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); + +/* + * Mask of CPUs supporting 32-bit EL0. + * Only valid if arm64_mismatched_32bit_el0 is enabled. 
+ */ +static cpumask_var_t cpu_32bit_el0_mask __cpumask_var_read_mostly; + +void dump_cpu_features(void) { /* file-wide pr_fmt adds "CPU features: " prefix */ - pr_emerg("0x%*pb\n", ARM64_NCAPS, &cpu_hwcaps); - return 0; + pr_emerg("0x%*pb\n", ARM64_NCAPS, &system_cpucaps); } -static struct notifier_block cpu_hwcaps_notifier = { - .notifier_call = dump_cpu_hwcaps -}; +#define __ARM64_MAX_POSITIVE(reg, field) \ + ((reg##_##field##_SIGNED ? \ + BIT(reg##_##field##_WIDTH - 1) : \ + BIT(reg##_##field##_WIDTH)) - 1) -static int __init register_cpu_hwcaps_dumper(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, - &cpu_hwcaps_notifier); - return 0; -} -__initcall(register_cpu_hwcaps_dumper); +#define __ARM64_MIN_NEGATIVE(reg, field) BIT(reg##_##field##_WIDTH - 1) + +#define __ARM64_CPUID_FIELDS(reg, field, min_value, max_value) \ + .sys_reg = SYS_##reg, \ + .field_pos = reg##_##field##_SHIFT, \ + .field_width = reg##_##field##_WIDTH, \ + .sign = reg##_##field##_SIGNED, \ + .min_field_value = min_value, \ + .max_field_value = max_value, + +/* + * ARM64_CPUID_FIELDS() encodes a field with a range from min_value to + * an implicit maximum that depends on the sign-ess of the field. + * + * An unsigned field will be capped at all ones, while a signed field + * will be limited to the positive half only. + */ +#define ARM64_CPUID_FIELDS(reg, field, min_value) \ + __ARM64_CPUID_FIELDS(reg, field, \ + SYS_FIELD_VALUE(reg, field, min_value), \ + __ARM64_MAX_POSITIVE(reg, field)) -DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS); -EXPORT_SYMBOL(cpu_hwcap_keys); +/* + * ARM64_CPUID_FIELDS_NEG() encodes a field with a range from an + * implicit minimal value to max_value. This should be used when + * matching a non-implemented property. + */ +#define ARM64_CPUID_FIELDS_NEG(reg, field, max_value) \ + __ARM64_CPUID_FIELDS(reg, field, \ + __ARM64_MIN_NEGATIVE(reg, field), \ + SYS_FIELD_VALUE(reg, field, max_value)) #define __ARM64_FTR_BITS(SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ @@ -115,211 +213,521 @@ EXPORT_SYMBOL(cpu_hwcap_keys); .width = 0, \ } -/* meta feature for alternatives */ -static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused); - static void cpu_enable_cnp(struct arm64_cpu_capabilities const *cap); +static bool __system_matches_cap(unsigned int n); + /* * NOTE: Any changes to the visibility of features should be kept in * sync with the documentation of the CPU feature register ABI. 
*/ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM4_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RDM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_ATOMICS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_CRC32_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA1_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RNDR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TLB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_FHM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_DP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM4_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RDM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_ATOMIC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_CRC32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_SB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SPECRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FRINTTS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPI_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_GPI_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, 
ID_AA64ISAR1_EL1_GPA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FCMA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_JSCVT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_GPA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_LRCPC_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FCMA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_API_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_APA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_LUT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CLRBHB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_MOPS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_API_SHIFT, 4, 0), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_APA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_DPB_SHIFT, 4, 0), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_GPA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AMU_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_MPAM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, 
FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SEL2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_RAS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), - S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), - S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), - /* Linux doesn't care about the EL3 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SVE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_RAS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_GIC_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AdvSIMD_SHIFT, 4, ID_AA64PFR0_EL1_AdvSIMD_NI), + S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_FP_SHIFT, 4, ID_AA64PFR0_EL1_FP_NI), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL1_SHIFT, 4, ID_AA64PFR0_EL1_EL1_IMP), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL0_SHIFT, 4, ID_AA64PFR0_EL1_EL0_IMP), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_DF2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_RAS_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_SHIFT, 4, ID_AA64PFR1_EL1_MTE_NI), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SSBS_SHIFT, 4, ID_AA64PFR1_EL1_SSBS_NI), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_BT_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTEFAR_SHIFT, 4, ID_AA64PFR2_EL1_MTEFAR_NI), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTESTOREONLY_SHIFT, 4, ID_AA64PFR2_EL1_MTESTOREONLY_NI), + 
ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F16MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_B16B16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_EltPerm_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_LUTv2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMEver_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F64F64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16B16_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F16_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F16_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_BI32I32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, 
ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8FMA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SBitPerm_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_AES_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SFEXPA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_STMOP_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMOP4_SHIFT, 1, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64fpfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8CVT_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8FMA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM8_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E4M3_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E5M2_SHIFT, 1, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN4_SHIFT, 4, ID_AA64MMFR0_TGRAN4_NI), - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN64_SHIFT, 4, ID_AA64MMFR0_TGRAN64_NI), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN16_SHIFT, 4, ID_AA64MMFR0_TGRAN16_NI), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_BIGENDEL0_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ECV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_FGT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_EXS_SHIFT, 4, 0), + /* + * Page size not being supported at Stage-2 is not fatal. You + * just give up KVM if PAGE_SIZE isn't supported there. Go fix + * your favourite nesting hypervisor. + * + * There is a small corner case where the hypervisor explicitly + * advertises a given granule size at Stage-2 (value 2) on some + * vCPUs, and uses the fallback to Stage-1 (value 0) for other + * vCPUs. Although this is not forbidden by the architecture, it + * indicates that the hypervisor is being silly (or buggy). + * + * We make no effort to cope with this and pretend that if these + * fields are inconsistent across vCPUs, then it isn't worth + * trying to bring KVM up. 
+ */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN64_2_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT, 4, 1), + /* + * We already refuse to boot CPUs that don't support our configured + * page size, so we can only detect mismatches for a page size other + * than the one we're currently using. Unfortunately, SoCs like this + * exist in the wild so, even though we don't like it, we'll have to go + * along with it and treat them as non-strict. + */ + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN4_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN4_NI), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN64_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN64_NI), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN16_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN16_NI), + + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT, 4, 0), /* Linux shouldn't care about secure memory */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_SNSMEM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_BIGENDEL_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ASID_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_SNSMEM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGEND_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ASIDBITS_SHIFT, 4, 0), /* * Differing PARange is fine as long as all peripherals and memory are mapped * within the minimum PARange of all CPUs */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_PARANGE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_PARANGE_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_PAN_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_LOR_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_HPD_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_VHE_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_VMIDBITS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_HADBS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ECBHB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TIDCP1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HCX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ETS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TWED_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_XNX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64MMFR1_EL1_SpecSEI_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_PAN_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 
ID_AA64MMFR1_EL1_LO_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HPDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VH_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VMIDBits_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LSM_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_UAO_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_CNP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_E0PD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_EVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_BBM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_TTL_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_FWB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IDS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_AT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_ST_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_NV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CCIDX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_VARange_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IESB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_LSM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_UAO_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CnP_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE), + FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_SCTLRX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = { + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_NV_frac_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_ctr[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 1), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 
CTR_IDC_SHIFT, 1, 1), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, CTR_CWG_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, CTR_ERG_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DMINLINE_SHIFT, 4, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IDC_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_CWG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_ERG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DminLine_SHIFT, 4, 1), /* * Linux can handle differing I-cache policies. Userspace JITs will * make use of *minLine. * If we have differing I-cache policies, report it as the weakest - VIPT. */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, 14, 2, ICACHE_POLICY_VIPT), /* L1Ip */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IMINLINE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_EL0_L1Ip_SHIFT, 2, CTR_EL0_L1Ip_VIPT), /* L1Ip */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IminLine_SHIFT, 4, 0), ARM64_FTR_END, }; +static struct arm64_ftr_override __ro_after_init no_override = { }; + struct arm64_ftr_reg arm64_ftr_reg_ctrel0 = { .name = "SYS_CTR_EL0", - .ftr_bits = ftr_ctr + .ftr_bits = ftr_ctr, + .override = &no_override, }; static const struct arm64_ftr_bits ftr_id_mmfr0[] = { - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0xf), /* InnerShr */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0), /* FCSE */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 20, 4, 0), /* AuxReg */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), /* TCM */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* ShareLvl */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0xf), /* OuterShr */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* PMSA */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* VMSA */ + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_InnerShr_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_FCSE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_AuxReg_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_TCM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_ShareLvl_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_OuterShr_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_PMSA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_VMSA_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 36, 28, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_PMSVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_BRPS_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_DoubleLock_SHIFT, 4, 0), + 
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_PMSVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_CTX_CMPs_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_WRPs_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_BRPs_SHIFT, 4, 0), /* * We can instantiate multiple PMU instances with different levels * of support. */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_PMUVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_TRACEVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_DEBUGVER_SHIFT, 4, 0x6), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_EL1_DebugVer_SHIFT, 4, 0x6), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_mvfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPRound_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPShVec_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPSqrt_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPDivide_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPTrap_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPDP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPSP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_SIMDReg_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_mvfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDFMAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPHP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDHP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDSP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDInt_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDLS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPDNaN_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPFtZ_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_mvfr2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* FPMisc */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* SIMDMisc */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_EL1_FPMisc_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_EL1_SIMDMisc_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_dczid[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 4, 1, 1), /* DZP */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* BS */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_EL0_DZP_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_EL0_BS_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_gmid[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, GMID_EL1_BS_SHIFT, 4, 0), ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_isar0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, 
FTR_LOWER_SAFE, ID_ISAR0_EL1_Divide_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Debug_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Coproc_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_CmpBranch_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_BitField_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_BitCount_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Swap_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
static const struct arm64_ftr_bits ftr_id_isar5[] = {
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_RDM_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_CRC32_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA2_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SHA1_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_AES_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_SEVL_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_RDM_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_CRC32_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SHA2_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SHA1_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_AES_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SEVL_SHIFT, 4, 0),
ARM64_FTR_END,
};
static const struct arm64_ftr_bits ftr_id_mmfr4[] = {
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* ac2 */
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_EVT_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_CCIDX_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_LSM_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_HPDS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_CnP_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_XNX_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_AC2_SHIFT, 4, 0),
+
+ /*
+ * SpecSEI = 1 indicates that the PE might generate an SError on an
+ * external abort on speculative read. It is safer to assume that an
+ * SError might be generated than that it will not be. Hence it has
+ * been classified as FTR_HIGHER_SAFE.
+ */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_MMFR4_EL1_SpecSEI_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_isar4[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SWP_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_PSR_M_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SynchPrim_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Barrier_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SMC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Writeback_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_WithShifts_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Unpriv_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_mmfr5[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR5_EL1_ETS_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_isar6[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_I8MM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_BF16_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_SPECRES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_SB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_FHM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_DP_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_JSCVT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_pfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* State3 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), /* State2 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* State1 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* State0 */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_DIT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_CSV2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State0_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_pfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_GIC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Virt_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Sec_frac_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_GenTimer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Virtualization_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_MProgMod_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Security_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_ProgMod_SHIFT, 4, 0), + ARM64_FTR_END, +}; 
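/*
 * The tables above all lean on the "safe value" policy described in the file
 * header: when CPUs disagree on a field, FTR_LOWER_SAFE keeps the lower
 * value, FTR_HIGHER_SAFE keeps the higher one (as for SpecSEI above), and
 * FTR_EXACT falls back to a fixed safe value. The standalone sketch below is
 * illustrative only (the "sketch_" enum and function names are invented for
 * this example); it is a simplified model of arm64_ftr_safe_value(), which
 * appears further down in this patch and also handles
 * FTR_HIGHER_OR_ZERO_SAFE.
 */
#include <linux/types.h>

enum sketch_ftr_type {
	SKETCH_FTR_EXACT,	/* any mismatch degrades to the fixed safe value */
	SKETCH_FTR_LOWER_SAFE,	/* the lower value is safer (most ID fields) */
	SKETCH_FTR_HIGHER_SAFE,	/* the higher value is safer (e.g. SpecSEI) */
};

/* Combine the current system-wide value (cur) with a new CPU's value (new). */
static s64 sketch_safe_value(enum sketch_ftr_type type, s64 new, s64 cur,
			     s64 safe_val)
{
	switch (type) {
	case SKETCH_FTR_LOWER_SAFE:
		return new < cur ? new : cur;
	case SKETCH_FTR_HIGHER_SAFE:
		return new > cur ? new : cur;
	case SKETCH_FTR_EXACT:
	default:
		return safe_val;
	}
}

/*
 * For example, sanitising ID_AA64ISAR0_EL1.AES across a boot CPU reporting 2
 * (AES + PMULL) and a secondary reporting 1 (AES only) under FTR_LOWER_SAFE
 * yields 1, so only the plain AES hwcap would be advertised to userspace.
 */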
+ +static const struct arm64_ftr_bits ftr_id_pfr2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_EL1_SSBS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_EL1_CSV3_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0xf), /* PerfMon */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), + /* [31:28] TraceFilt */ + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MProfDbg_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapTrc_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopTrc_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapDbg_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopSDbg_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopDbg_SHIFT, 4, 0), + ARM64_FTR_END, +}; + +static const struct arm64_ftr_bits ftr_id_dfr1[] = { + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR1_EL1_MTPMU_SHIFT, 4, 0), ARM64_FTR_END, }; -static const struct arm64_ftr_bits ftr_zcr[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, - ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_SIZE, 0), /* LEN */ +static const struct arm64_ftr_bits ftr_mpamidr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PMG_MAX_SHIFT, MPAMIDR_EL1_PMG_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_VPMR_MAX_SHIFT, MPAMIDR_EL1_VPMR_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_HAS_HCR_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PARTID_MAX_SHIFT, MPAMIDR_EL1_PARTID_MAX_WIDTH, 0), ARM64_FTR_END, }; @@ -327,7 +735,7 @@ static const struct arm64_ftr_bits ftr_zcr[] = { * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of * 0. 
Covers the following 32bit registers: - * id_isar[0-4], id_mmfr[1-3], id_pfr1, mvfr[0-1] + * id_isar[1-3], id_mmfr[1-3] */ static const struct arm64_ftr_bits ftr_generic_32bits[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), @@ -351,13 +759,32 @@ static const struct arm64_ftr_bits ftr_raz[] = { ARM64_FTR_END, }; -#define ARM64_FTR_REG(id, table) { \ - .sys_id = id, \ - .reg = &(struct arm64_ftr_reg){ \ - .name = #id, \ - .ftr_bits = &((table)[0]), \ +#define __ARM64_FTR_REG_OVERRIDE(id_str, id, table, ovr) { \ + .sys_id = id, \ + .reg = &(struct arm64_ftr_reg){ \ + .name = id_str, \ + .override = (ovr), \ + .ftr_bits = &((table)[0]), \ }} +#define ARM64_FTR_REG_OVERRIDE(id, table, ovr) \ + __ARM64_FTR_REG_OVERRIDE(#id, id, table, ovr) + +#define ARM64_FTR_REG(id, table) \ + __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) + +struct arm64_ftr_override __read_mostly id_aa64mmfr0_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr1_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr2_override; +struct arm64_ftr_override __read_mostly id_aa64pfr0_override; +struct arm64_ftr_override __read_mostly id_aa64pfr1_override; +struct arm64_ftr_override __read_mostly id_aa64zfr0_override; +struct arm64_ftr_override __read_mostly id_aa64smfr0_override; +struct arm64_ftr_override __read_mostly id_aa64isar1_override; +struct arm64_ftr_override __read_mostly id_aa64isar2_override; + +struct arm64_ftr_override __read_mostly arm64_sw_feature_override; + static const struct __ftr_reg_entry { u32 sys_id; struct arm64_ftr_reg *reg; @@ -365,7 +792,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 1 */ ARM64_FTR_REG(SYS_ID_PFR0_EL1, ftr_id_pfr0), - ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_id_pfr1), ARM64_FTR_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0), ARM64_FTR_REG(SYS_ID_MMFR0_EL1, ftr_id_mmfr0), ARM64_FTR_REG(SYS_ID_MMFR1_EL1, ftr_generic_32bits), @@ -373,23 +800,34 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_MMFR3_EL1, ftr_generic_32bits), /* Op1 = 0, CRn = 0, CRm = 2 */ - ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_id_isar0), ARM64_FTR_REG(SYS_ID_ISAR1_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR2_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR3_EL1, ftr_generic_32bits), - ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_id_isar4), ARM64_FTR_REG(SYS_ID_ISAR5_EL1, ftr_id_isar5), ARM64_FTR_REG(SYS_ID_MMFR4_EL1, ftr_id_mmfr4), + ARM64_FTR_REG(SYS_ID_ISAR6_EL1, ftr_id_isar6), /* Op1 = 0, CRn = 0, CRm = 3 */ - ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_generic_32bits), - ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_generic_32bits), + ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_mvfr0), + ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_mvfr1), ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2), + ARM64_FTR_REG(SYS_ID_PFR2_EL1, ftr_id_pfr2), + ARM64_FTR_REG(SYS_ID_DFR1_EL1, ftr_id_dfr1), + ARM64_FTR_REG(SYS_ID_MMFR5_EL1, ftr_id_mmfr5), /* Op1 = 0, CRn = 0, CRm = 4 */ - ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0), - ARM64_FTR_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1), - ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_raz), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0, + &id_aa64pfr0_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, + &id_aa64pfr1_override), + ARM64_FTR_REG(SYS_ID_AA64PFR2_EL1, ftr_id_aa64pfr2), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0, + &id_aa64zfr0_override), + 
ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0, + &id_aa64smfr0_override), + ARM64_FTR_REG(SYS_ID_AA64FPFR0_EL1, ftr_id_aa64fpfr0), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), @@ -397,15 +835,27 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 6 */ ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0), - ARM64_FTR_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1, + &id_aa64isar1_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2, + &id_aa64isar2_override), + ARM64_FTR_REG(SYS_ID_AA64ISAR3_EL1, ftr_id_aa64isar3), /* Op1 = 0, CRn = 0, CRm = 7 */ - ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), - ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1), - ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0, + &id_aa64mmfr0_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1, + &id_aa64mmfr1_override), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2, + &id_aa64mmfr2_override), + ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), + ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4), - /* Op1 = 0, CRn = 1, CRm = 2 */ - ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr), + /* Op1 = 0, CRn = 10, CRm = 4 */ + ARM64_FTR_REG(SYS_MPAMIDR_EL1, ftr_mpamidr), + + /* Op1 = 1, CRn = 0, CRm = 0 */ + ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), /* Op1 = 3, CRn = 0, CRm = 0 */ { SYS_CTR_EL0, &arm64_ftr_reg_ctrel0 }, @@ -421,16 +871,16 @@ static int search_cmp_ftr_reg(const void *id, const void *regp) } /* - * get_arm64_ftr_reg - Lookup a feature register entry using its - * sys_reg() encoding. With the array arm64_ftr_regs sorted in the - * ascending order of sys_id , we use binary search to find a matching + * get_arm64_ftr_reg_nowarn - Looks up a feature register entry using + * its sys_reg() encoding. With the array arm64_ftr_regs sorted in the + * ascending order of sys_id, we use binary search to find a matching * entry. * * returns - Upon success, matching ftr_reg entry for id. * - NULL on failure. It is upto the caller to decide * the impact of a failure. */ -static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +static struct arm64_ftr_reg *get_arm64_ftr_reg_nowarn(u32 sys_id) { const struct __ftr_reg_entry *ret; @@ -444,6 +894,27 @@ static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) return NULL; } +/* + * get_arm64_ftr_reg - Looks up a feature register entry using + * its sys_reg() encoding. This calls get_arm64_ftr_reg_nowarn(). + * + * returns - Upon success, matching ftr_reg entry for id. + * - NULL on failure but with an WARN_ON(). + */ +struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) +{ + struct arm64_ftr_reg *reg; + + reg = get_arm64_ftr_reg_nowarn(sys_id); + + /* + * Requesting a non-existent register search is an error. Warn + * and let the caller handle it. 
+ */ + WARN_ON(!reg); + return reg; +} + static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, s64 ftr_val) { @@ -454,7 +925,7 @@ static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, return reg; } -static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, +s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { s64 ret = 0; @@ -464,10 +935,14 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, ret = ftrp->safe_val; break; case FTR_LOWER_SAFE: - ret = new < cur ? new : cur; + ret = min(new, cur); break; + case FTR_HIGHER_OR_ZERO_SAFE: + if (!cur || !new) + break; + fallthrough; case FTR_HIGHER_SAFE: - ret = new > cur ? new : cur; + ret = max(new, cur); break; default: BUG(); @@ -478,20 +953,61 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, static void __init sort_ftr_regs(void) { - int i; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(arm64_ftr_regs); i++) { + const struct arm64_ftr_reg *ftr_reg = arm64_ftr_regs[i].reg; + const struct arm64_ftr_bits *ftr_bits = ftr_reg->ftr_bits; + unsigned int j = 0; + + /* + * Features here must be sorted in descending order with respect + * to their shift values and should not overlap with each other. + */ + for (; ftr_bits->width != 0; ftr_bits++, j++) { + unsigned int width = ftr_reg->ftr_bits[j].width; + unsigned int shift = ftr_reg->ftr_bits[j].shift; + unsigned int prev_shift; + + WARN((shift + width) > 64, + "%s has invalid feature at shift %d\n", + ftr_reg->name, shift); - /* Check that the array is sorted so that we can do the binary search */ - for (i = 1; i < ARRAY_SIZE(arm64_ftr_regs); i++) - BUG_ON(arm64_ftr_regs[i].sys_id < arm64_ftr_regs[i - 1].sys_id); + /* + * Skip the first feature. There is nothing to + * compare against for now. + */ + if (j == 0) + continue; + + prev_shift = ftr_reg->ftr_bits[j - 1].shift; + WARN((shift + width) > prev_shift, + "%s has feature overlap at shift %d\n", + ftr_reg->name, shift); + } + + /* + * Skip the first register. There is nothing to + * compare against for now. + */ + if (i == 0) + continue; + /* + * Registers here must be sorted in ascending order with respect + * to sys_id for subsequent binary search in get_arm64_ftr_reg() + * to work correctly. + */ + BUG_ON(arm64_ftr_regs[i].sys_id <= arm64_ftr_regs[i - 1].sys_id); + } } /* * Initialise the CPU feature register from Boot CPU values. - * Also initiliases the strict_mask for the register. + * Also initialises the strict_mask for the register. * Any bits that are not covered by an arm64_ftr_bits entry are considered * RES0 for the system-wide value, and must strictly match. 
*/ -static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) +static void init_cpu_ftr_reg(u32 sys_reg, u64 new) { u64 val = 0; u64 strict_mask = ~0x0ULL; @@ -501,11 +1017,45 @@ static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new) const struct arm64_ftr_bits *ftrp; struct arm64_ftr_reg *reg = get_arm64_ftr_reg(sys_reg); - BUG_ON(!reg); + if (!reg) + return; - for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { + for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { u64 ftr_mask = arm64_ftr_mask(ftrp); s64 ftr_new = arm64_ftr_value(ftrp, new); + s64 ftr_ovr = arm64_ftr_value(ftrp, reg->override->val); + + if ((ftr_mask & reg->override->mask) == ftr_mask) { + s64 tmp = arm64_ftr_safe_value(ftrp, ftr_ovr, ftr_new); + char *str = NULL; + + if (ftr_ovr != tmp) { + /* Unsafe, remove the override */ + reg->override->mask &= ~ftr_mask; + reg->override->val &= ~ftr_mask; + tmp = ftr_ovr; + str = "ignoring override"; + } else if (ftr_new != tmp) { + /* Override was valid */ + ftr_new = tmp; + str = "forced"; + } else { + /* Override was the safe value */ + str = "already set"; + } + + pr_warn("%s[%d:%d]: %s to %llx\n", + reg->name, + ftrp->shift + ftrp->width - 1, + ftrp->shift, str, + tmp & (BIT(ftrp->width) - 1)); + } else if ((ftr_mask & reg->override->val) == ftr_mask) { + reg->override->val &= ~ftr_mask; + pr_warn("%s[%d:%d]: impossible override, ignored\n", + reg->name, + ftrp->shift + ftrp->width - 1, + ftrp->shift); + } val = arm64_ftr_set_value(ftrp, val, ftr_new); @@ -531,28 +1081,84 @@ extern const struct arm64_cpu_capabilities arm64_errata[]; static const struct arm64_cpu_capabilities arm64_features[]; static void __init -init_cpu_hwcaps_indirect_list_from_array(const struct arm64_cpu_capabilities *caps) +init_cpucap_indirect_list_from_array(const struct arm64_cpu_capabilities *caps) { for (; caps->matches; caps++) { if (WARN(caps->capability >= ARM64_NCAPS, "Invalid capability %d\n", caps->capability)) continue; - if (WARN(cpu_hwcaps_ptrs[caps->capability], + if (WARN(cpucap_ptrs[caps->capability], "Duplicate entry for capability %d\n", caps->capability)) continue; - cpu_hwcaps_ptrs[caps->capability] = caps; + cpucap_ptrs[caps->capability] = caps; } } -static void __init init_cpu_hwcaps_indirect_list(void) +static void __init init_cpucap_indirect_list(void) { - init_cpu_hwcaps_indirect_list_from_array(arm64_features); - init_cpu_hwcaps_indirect_list_from_array(arm64_errata); + init_cpucap_indirect_list_from_array(arm64_features); + init_cpucap_indirect_list_from_array(arm64_errata); } static void __init setup_boot_cpu_capabilities(void); +static void init_32bit_cpu_features(struct cpuinfo_32bit *info) +{ + init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); + init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1); + init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); + init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); + init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); + init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); + init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); + init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); + init_cpu_ftr_reg(SYS_ID_ISAR6_EL1, info->reg_id_isar6); + init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); + init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); + init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); + init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); + init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4); + init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5); + 
init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); + init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); + init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2); + init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); + init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); + init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); +} + +#ifdef CONFIG_ARM64_PSEUDO_NMI +static bool enable_pseudo_nmi; + +static int __init early_enable_pseudo_nmi(char *p) +{ + return kstrtobool(p, &enable_pseudo_nmi); +} +early_param("irqchip.gicv3_pseudo_nmi", early_enable_pseudo_nmi); + +static __init void detect_system_supports_pseudo_nmi(void) +{ + struct device_node *np; + + if (!enable_pseudo_nmi) + return; + + /* + * Detect broken MediaTek firmware that doesn't properly save and + * restore GIC priorities. + */ + np = of_find_compatible_node(NULL, NULL, "arm,gic-v3"); + if (np && of_property_read_bool(np, "mediatek,broken-save-restore-fw")) { + pr_info("Pseudo-NMI disabled due to MediaTek Chromebook GICR save problem\n"); + enable_pseudo_nmi = false; + } + of_node_put(np); +} +#else /* CONFIG_ARM64_PSEUDO_NMI */ +static inline void detect_system_supports_pseudo_nmi(void) { } +#endif + void __init init_cpu_features(struct cpuinfo_arm64 *info) { /* Before we start using the tables, make sure it is sorted */ @@ -565,48 +1171,48 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64DFR1_EL1, info->reg_id_aa64dfr1); init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0); init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); + init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2); + init_cpu_ftr_reg(SYS_ID_AA64ISAR3_EL1, info->reg_id_aa64isar3); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); + init_cpu_ftr_reg(SYS_ID_AA64MMFR3_EL1, info->reg_id_aa64mmfr3); + init_cpu_ftr_reg(SYS_ID_AA64MMFR4_EL1, info->reg_id_aa64mmfr4); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); + init_cpu_ftr_reg(SYS_ID_AA64PFR2_EL1, info->reg_id_aa64pfr2); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); + init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0); + init_cpu_ftr_reg(SYS_ID_AA64FPFR0_EL1, info->reg_id_aa64fpfr0); - if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); - init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); - init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); - init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); - init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); - init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); - init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); - init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); - init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); - init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); - init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); - init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); - init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); - init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); - init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); - init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) + init_32bit_cpu_features(&info->aarch32); + + if (IS_ENABLED(CONFIG_ARM64_SVE) && + 
id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + unsigned long cpacr = cpacr_save_enable_kernel_sve(); + + vec_init_vq_map(ARM64_VEC_SVE); + + cpacr_restore(cpacr); } - if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) { - init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr); - sve_init_vq_map(); + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { + unsigned long cpacr = cpacr_save_enable_kernel_sme(); + + vec_init_vq_map(ARM64_VEC_SME); + + cpacr_restore(cpacr); } - /* - * Initialize the indirect array of CPU hwcaps capabilities pointers - * before we handle the boot CPU below. - */ - init_cpu_hwcaps_indirect_list(); + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + } - /* - * Detect and enable early CPU capabilities based on the boot CPU, - * after we have initialised the CPU feature infrastructure. - */ - setup_boot_cpu_capabilities(); + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) + init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); } static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new) @@ -630,7 +1236,9 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); - BUG_ON(!regp); + if (!regp) + return 0; + update_cpu_ftr_reg(regp, val); if ((boot & regp->strict_mask) == (val & regp->strict_mask)) return 0; @@ -639,6 +1247,112 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) return 1; } +static void relax_cpu_ftr_reg(u32 sys_id, int field) +{ + const struct arm64_ftr_bits *ftrp; + struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); + + if (!regp) + return; + + for (ftrp = regp->ftr_bits; ftrp->width; ftrp++) { + if (ftrp->shift == field) { + regp->strict_mask &= ~arm64_ftr_mask(ftrp); + break; + } + } + + /* Bogus field? */ + WARN_ON(!ftrp->width); +} + +static void lazy_init_32bit_cpu_features(struct cpuinfo_arm64 *info, + struct cpuinfo_arm64 *boot) +{ + static bool boot_cpu_32bit_regs_overridden = false; + + if (!allow_mismatched_32bit_el0 || boot_cpu_32bit_regs_overridden) + return; + + if (id_aa64pfr0_32bit_el0(boot->reg_id_aa64pfr0)) + return; + + boot->aarch32 = info->aarch32; + init_32bit_cpu_features(&boot->aarch32); + boot_cpu_32bit_regs_overridden = true; +} + +static int update_32bit_cpu_features(int cpu, struct cpuinfo_32bit *info, + struct cpuinfo_32bit *boot) +{ + int taint = 0; + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + /* + * If we don't have AArch32 at EL1, then relax the strictness of + * EL1-dependent register fields to avoid spurious sanity check fails. 
+ */ + if (!id_aa64pfr0_32bit_el1(pfr0)) { + relax_cpu_ftr_reg(SYS_ID_ISAR4_EL1, ID_ISAR4_EL1_SMC_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Virt_frac_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Sec_frac_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Virtualization_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Security_SHIFT); + relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_ProgMod_SHIFT); + } + + taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, + info->reg_id_dfr0, boot->reg_id_dfr0); + taint |= check_update_ftr_reg(SYS_ID_DFR1_EL1, cpu, + info->reg_id_dfr1, boot->reg_id_dfr1); + taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, + info->reg_id_isar0, boot->reg_id_isar0); + taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, + info->reg_id_isar1, boot->reg_id_isar1); + taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, + info->reg_id_isar2, boot->reg_id_isar2); + taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, + info->reg_id_isar3, boot->reg_id_isar3); + taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, + info->reg_id_isar4, boot->reg_id_isar4); + taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, + info->reg_id_isar5, boot->reg_id_isar5); + taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu, + info->reg_id_isar6, boot->reg_id_isar6); + + /* + * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and + * ACTLR formats could differ across CPUs and therefore would have to + * be trapped for virtualization anyway. + */ + taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, + info->reg_id_mmfr0, boot->reg_id_mmfr0); + taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, + info->reg_id_mmfr1, boot->reg_id_mmfr1); + taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, + info->reg_id_mmfr2, boot->reg_id_mmfr2); + taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, + info->reg_id_mmfr3, boot->reg_id_mmfr3); + taint |= check_update_ftr_reg(SYS_ID_MMFR4_EL1, cpu, + info->reg_id_mmfr4, boot->reg_id_mmfr4); + taint |= check_update_ftr_reg(SYS_ID_MMFR5_EL1, cpu, + info->reg_id_mmfr5, boot->reg_id_mmfr5); + taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, + info->reg_id_pfr0, boot->reg_id_pfr0); + taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, + info->reg_id_pfr1, boot->reg_id_pfr1); + taint |= check_update_ftr_reg(SYS_ID_PFR2_EL1, cpu, + info->reg_id_pfr2, boot->reg_id_pfr2); + taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, + info->reg_mvfr0, boot->reg_mvfr0); + taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, + info->reg_mvfr1, boot->reg_mvfr1); + taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, + info->reg_mvfr2, boot->reg_mvfr2); + + return taint; +} + /* * Update system wide CPU feature registers with the values from a * non-boot CPU. 
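check_update_ftr_reg() and relax_cpu_ftr_reg() above cooperate: only bits still present in strict_mask can taint the kernel, and relaxing a field removes its bits from that mask. A small self-contained sketch of that interaction (struct ftr_reg, check_update() and relax_field() are simplified stand-ins, not the kernel's definitions):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ftr_reg {
	uint64_t strict_mask;	/* bits that must match across all CPUs */
};

/* Returns true ("taint") when the boot and late CPU disagree on strict bits. */
static bool check_update(const struct ftr_reg *reg, uint64_t boot, uint64_t cpu)
{
	return (boot & reg->strict_mask) != (cpu & reg->strict_mask);
}

/* Drop one field's bits from the strict mask so a known-benign mismatch is ignored. */
static void relax_field(struct ftr_reg *reg, unsigned int shift, unsigned int width)
{
	reg->strict_mask &= ~(((1ULL << width) - 1) << shift);
}

int main(void)
{
	struct ftr_reg r = { .strict_mask = ~0ULL };
	uint64_t boot = 0x1120, cpu = 0x1110;	/* differ only in bits [7:4] */

	printf("taint before relax: %d\n", check_update(&r, boot, cpu));	/* 1 */
	relax_field(&r, 4, 4);
	printf("taint after relax:  %d\n", check_update(&r, boot, cpu));	/* 0 */
	return 0;
}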
Also performs SANITY checks to make sure that there @@ -688,6 +1402,10 @@ void update_cpu_features(int cpu, info->reg_id_aa64isar0, boot->reg_id_aa64isar0); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR1_EL1, cpu, info->reg_id_aa64isar1, boot->reg_id_aa64isar1); + taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu, + info->reg_id_aa64isar2, boot->reg_id_aa64isar2); + taint |= check_update_ftr_reg(SYS_ID_AA64ISAR3_EL1, cpu, + info->reg_id_aa64isar3, boot->reg_id_aa64isar3); /* * Differing PARange support is fine as long as all peripherals and @@ -700,73 +1418,79 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR3_EL1, cpu, + info->reg_id_aa64mmfr3, boot->reg_id_aa64mmfr3); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR4_EL1, cpu, + info->reg_id_aa64mmfr4, boot->reg_id_aa64mmfr4); - /* - * EL3 is not our concern. - */ taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu, info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1); + taint |= check_update_ftr_reg(SYS_ID_AA64PFR2_EL1, cpu, + info->reg_id_aa64pfr2, boot->reg_id_aa64pfr2); taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); - /* - * If we have AArch32, we care about 32-bit features for compat. - * If the system doesn't support AArch32, don't update them. - */ - if (id_aa64pfr0_32bit_el0(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1)) && - id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - - taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, - info->reg_id_dfr0, boot->reg_id_dfr0); - taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, - info->reg_id_isar0, boot->reg_id_isar0); - taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, - info->reg_id_isar1, boot->reg_id_isar1); - taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, - info->reg_id_isar2, boot->reg_id_isar2); - taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, - info->reg_id_isar3, boot->reg_id_isar3); - taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, - info->reg_id_isar4, boot->reg_id_isar4); - taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, - info->reg_id_isar5, boot->reg_id_isar5); + taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu, + info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0); - /* - * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and - * ACTLR formats could differ across CPUs and therefore would have to - * be trapped for virtualization anyway. 
- */ - taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, - info->reg_id_mmfr0, boot->reg_id_mmfr0); - taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, - info->reg_id_mmfr1, boot->reg_id_mmfr1); - taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, - info->reg_id_mmfr2, boot->reg_id_mmfr2); - taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, - info->reg_id_mmfr3, boot->reg_id_mmfr3); - taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, - info->reg_id_pfr0, boot->reg_id_pfr0); - taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, - info->reg_id_pfr1, boot->reg_id_pfr1); - taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, - info->reg_mvfr0, boot->reg_mvfr0); - taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, - info->reg_mvfr1, boot->reg_mvfr1); - taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, - info->reg_mvfr2, boot->reg_mvfr2); + taint |= check_update_ftr_reg(SYS_ID_AA64FPFR0_EL1, cpu, + info->reg_id_aa64fpfr0, boot->reg_id_aa64fpfr0); + + /* Probe vector lengths */ + if (IS_ENABLED(CONFIG_ARM64_SVE) && + id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + if (!system_capabilities_finalized()) { + unsigned long cpacr = cpacr_save_enable_kernel_sve(); + + vec_update_vq_map(ARM64_VEC_SVE); + + cpacr_restore(cpacr); + } + } + + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { + unsigned long cpacr = cpacr_save_enable_kernel_sme(); + + /* Probe vector lengths */ + if (!system_capabilities_finalized()) + vec_update_vq_map(ARM64_VEC_SME); + + cpacr_restore(cpacr); + } + + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, + info->reg_mpamidr, boot->reg_mpamidr); } - if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) { - taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu, - info->reg_zcr, boot->reg_zcr); + /* + * The kernel uses the LDGM/STGM instructions and the number of tags + * they read/write depends on the GMID_EL1.BS field. Check that the + * value is the same on all CPUs. + */ + if (IS_ENABLED(CONFIG_ARM64_MTE) && + id_aa64pfr1_mte(info->reg_id_aa64pfr1)) { + taint |= check_update_ftr_reg(SYS_GMID_EL1, cpu, + info->reg_gmid, boot->reg_gmid); + } - /* Probe vector lengths, unless we already gave up on SVE */ - if (id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1)) && - !sys_caps_initialised) - sve_update_vq_map(); + /* + * If we don't have AArch32 at all then skip the checks entirely + * as the register values may be UNKNOWN and we're not going to be + * using them for anything. + * + * This relies on a sanitised view of the AArch64 ID registers + * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last. + */ + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + lazy_init_32bit_cpu_features(info, boot); + taint |= update_32bit_cpu_features(cpu, &info->aarch32, + &boot->aarch32); } /* @@ -783,47 +1507,64 @@ u64 read_sanitised_ftr_reg(u32 id) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(id); - /* We shouldn't get a request for an unsupported register */ - BUG_ON(!regp); + if (!regp) + return 0; return regp->sys_val; } +EXPORT_SYMBOL_GPL(read_sanitised_ftr_reg); #define read_sysreg_case(r) \ - case r: return read_sysreg_s(r) + case r: val = read_sysreg_s(r); break; /* * __read_sysreg_by_encoding() - Used by a STARTING cpu before cpuinfo is populated. 
* Read the system register on the current CPU */ -static u64 __read_sysreg_by_encoding(u32 sys_id) +u64 __read_sysreg_by_encoding(u32 sys_id) { + struct arm64_ftr_reg *regp; + u64 val; + switch (sys_id) { read_sysreg_case(SYS_ID_PFR0_EL1); read_sysreg_case(SYS_ID_PFR1_EL1); + read_sysreg_case(SYS_ID_PFR2_EL1); read_sysreg_case(SYS_ID_DFR0_EL1); + read_sysreg_case(SYS_ID_DFR1_EL1); read_sysreg_case(SYS_ID_MMFR0_EL1); read_sysreg_case(SYS_ID_MMFR1_EL1); read_sysreg_case(SYS_ID_MMFR2_EL1); read_sysreg_case(SYS_ID_MMFR3_EL1); + read_sysreg_case(SYS_ID_MMFR4_EL1); + read_sysreg_case(SYS_ID_MMFR5_EL1); read_sysreg_case(SYS_ID_ISAR0_EL1); read_sysreg_case(SYS_ID_ISAR1_EL1); read_sysreg_case(SYS_ID_ISAR2_EL1); read_sysreg_case(SYS_ID_ISAR3_EL1); read_sysreg_case(SYS_ID_ISAR4_EL1); read_sysreg_case(SYS_ID_ISAR5_EL1); + read_sysreg_case(SYS_ID_ISAR6_EL1); read_sysreg_case(SYS_MVFR0_EL1); read_sysreg_case(SYS_MVFR1_EL1); read_sysreg_case(SYS_MVFR2_EL1); read_sysreg_case(SYS_ID_AA64PFR0_EL1); read_sysreg_case(SYS_ID_AA64PFR1_EL1); + read_sysreg_case(SYS_ID_AA64PFR2_EL1); + read_sysreg_case(SYS_ID_AA64ZFR0_EL1); + read_sysreg_case(SYS_ID_AA64SMFR0_EL1); + read_sysreg_case(SYS_ID_AA64FPFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR0_EL1); read_sysreg_case(SYS_ID_AA64MMFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR2_EL1); + read_sysreg_case(SYS_ID_AA64MMFR3_EL1); + read_sysreg_case(SYS_ID_AA64MMFR4_EL1); read_sysreg_case(SYS_ID_AA64ISAR0_EL1); read_sysreg_case(SYS_ID_AA64ISAR1_EL1); + read_sysreg_case(SYS_ID_AA64ISAR2_EL1); + read_sysreg_case(SYS_ID_AA64ISAR3_EL1); read_sysreg_case(SYS_CNTFRQ_EL0); read_sysreg_case(SYS_CTR_EL0); @@ -833,32 +1574,148 @@ static u64 __read_sysreg_by_encoding(u32 sys_id) BUG(); return 0; } + + regp = get_arm64_ftr_reg(sys_id); + if (regp) { + val &= ~regp->override->mask; + val |= (regp->override->val & regp->override->mask); + } + + return val; } #include <linux/irqchip/arm-gic-v3.h> static bool -feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) +has_always(const struct arm64_cpu_capabilities *entry, int scope) { - int val = cpuid_feature_extract_field(reg, entry->field_pos, entry->sign); - - return val >= entry->min_field_value; + return true; } static bool -has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) +feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) { - u64 val; + int val, min, max; + u64 tmp; + + val = cpuid_feature_extract_field_width(reg, entry->field_pos, + entry->field_width, + entry->sign); + tmp = entry->min_field_value; + tmp <<= entry->field_pos; + + min = cpuid_feature_extract_field_width(tmp, entry->field_pos, + entry->field_width, + entry->sign); + + tmp = entry->max_field_value; + tmp <<= entry->field_pos; + + max = cpuid_feature_extract_field_width(tmp, entry->field_pos, + entry->field_width, + entry->sign); + + return val >= min && val <= max; +} + +static u64 +read_scoped_sysreg(const struct arm64_cpu_capabilities *entry, int scope) +{ WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); if (scope == SCOPE_SYSTEM) - val = read_sanitised_ftr_reg(entry->sys_reg); + return read_sanitised_ftr_reg(entry->sys_reg); else - val = __read_sysreg_by_encoding(entry->sys_reg); + return __read_sysreg_by_encoding(entry->sys_reg); +} + +static bool +has_user_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) +{ + int mask; + struct arm64_ftr_reg *regp; + u64 val = read_scoped_sysreg(entry, scope); + + regp = 
get_arm64_ftr_reg(entry->sys_reg); + if (!regp) + return false; + + mask = cpuid_feature_extract_unsigned_field_width(regp->user_mask, + entry->field_pos, + entry->field_width); + if (!mask) + return false; return feature_matches(val, entry); } +static bool +has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 val = read_scoped_sysreg(entry, scope); + return feature_matches(val, entry); +} + +const struct cpumask *system_32bit_el0_cpumask(void) +{ + if (!system_supports_32bit_el0()) + return cpu_none_mask; + + if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) + return cpu_32bit_el0_mask; + + return cpu_possible_mask; +} + +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p) +{ + return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_TICK)); +} + +static int __init parse_32bit_el0_param(char *str) +{ + allow_mismatched_32bit_el0 = true; + return 0; +} +early_param("allow_mismatched_32bit_el0", parse_32bit_el0_param); + +static ssize_t aarch32_el0_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct cpumask *mask = system_32bit_el0_cpumask(); + + return sysfs_emit(buf, "%*pbl\n", cpumask_pr_args(mask)); +} +static const DEVICE_ATTR_RO(aarch32_el0); + +static int __init aarch32_el0_sysfs_init(void) +{ + struct device *dev_root; + int ret = 0; + + if (!allow_mismatched_32bit_el0) + return 0; + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + ret = device_create_file(dev_root, &dev_attr_aarch32_el0); + put_device(dev_root); + } + return ret; +} +device_initcall(aarch32_el0_sysfs_init); + +static bool has_32bit_el0(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return allow_mismatched_32bit_el0; + + if (scope == SCOPE_SYSTEM) + pr_info("detected: 32-bit EL0 Support\n"); + + return true; +} + static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, int scope) { bool has_sre; @@ -874,24 +1731,6 @@ static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, return has_sre; } -static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry, int __unused) -{ - u32 midr = read_cpuid_id(); - - /* Cavium ThunderX pass 1.x and 2.x */ - return MIDR_IS_CPU_MODEL_RANGE(midr, MIDR_THUNDERX, - MIDR_CPU_VAR_REV(0, 0), - MIDR_CPU_VAR_REV(1, MIDR_REVISION_MASK)); -} - -static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused) -{ - u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - - return cpuid_feature_extract_signed_field(pfr0, - ID_AA64PFR0_FP_SHIFT) < 0; -} - static bool has_cache_idc(const struct arm64_cpu_capabilities *entry, int scope) { @@ -902,7 +1741,7 @@ static bool has_cache_idc(const struct arm64_cpu_capabilities *entry, else ctr = read_cpuid_effective_cachetype(); - return ctr & BIT(CTR_IDC_SHIFT); + return ctr & BIT(CTR_EL0_IDC_SHIFT); } static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unused) @@ -913,7 +1752,7 @@ static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unu * to the CTR_EL0 on this CPU and emulate it with the real/safe * value. 
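has_cpuid_feature()/feature_matches() above boil down to extracting a (possibly signed) ID register field and testing it against a [min, max] window; has_user_cpuid_feature() additionally requires the field to still be exposed in the register's user_mask. A self-contained sketch of the extraction and window test only (extract_field() and matches() are invented names approximating the cpuid_feature_extract_*() helpers):

#include <stdint.h>
#include <stdio.h>

/* Pull 'width' bits out of 'reg' at 'shift', sign-extending when asked to. */
static int64_t extract_field(uint64_t reg, unsigned int shift,
			     unsigned int width, int sign)
{
	if (sign)
		return (int64_t)(reg << (64 - shift - width)) >> (64 - width);
	return (reg >> shift) & ((1ULL << width) - 1);
}

/* feature_matches()-style window test against [min, max]. */
static int matches(uint64_t reg, unsigned int shift, unsigned int width,
		   int sign, int64_t min, int64_t max)
{
	int64_t val = extract_field(reg, shift, width, sign);

	return val >= min && val <= max;
}

int main(void)
{
	/* A 4-bit signed field holding 0xf decodes as -1, i.e. "not implemented". */
	printf("%lld\n", (long long)extract_field(0xf0, 4, 4, 1));
	/* An unsigned field value of 2 lies inside a [1, 15] window. */
	printf("%d\n", matches(0x20, 4, 4, 0, 1, 15));
	return 0;
}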
*/ - if (!(read_cpuid_cachetype() & BIT(CTR_IDC_SHIFT))) + if (!(read_cpuid_cachetype() & BIT(CTR_EL0_IDC_SHIFT))) sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } @@ -927,7 +1766,7 @@ static bool has_cache_dic(const struct arm64_cpu_capabilities *entry, else ctr = read_cpuid_cachetype(); - return ctr & BIT(CTR_DIC_SHIFT); + return ctr & BIT(CTR_EL0_DIC_SHIFT); } static bool __maybe_unused @@ -938,13 +1777,16 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) * may share TLB entries with a CPU stuck in the crashed * kernel. */ - if (is_kdump_kernel()) + if (is_kdump_kernel()) + return false; + + if (cpus_have_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP)) return false; return has_cpuid_feature(entry, scope); } -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static bool __meltdown_safe = true; static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, @@ -954,26 +1796,65 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, static const struct midr_range kpti_safe_list[] = { MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), + MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_GOLD), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), { /* sentinel */ } }; - char const *str = "command line option"; + char const *str = "kpti command line option"; + bool meltdown_safe; + + meltdown_safe = is_midr_in_range_list(kpti_safe_list); + + /* Defer to CPU feature registers */ + if (has_cpuid_feature(entry, scope)) + meltdown_safe = true; + + if (!meltdown_safe) + __meltdown_safe = false; /* * For reasons that aren't entirely clear, enabling KPTI on Cavium * ThunderX leads to apparent I-cache corruption of kernel text, which - * ends as well as you might imagine. Don't even try. + * ends as well as you might imagine. Don't even try. We cannot rely + * on the cpus_have_*cap() helpers here to detect the CPU erratum + * because cpucap detection order may change. However, since we know + * affected CPUs are always in a homogeneous configuration, it is + * safe to rely on this_cpu_has_cap() here. */ - if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_27456)) { + if (this_cpu_has_cap(ARM64_WORKAROUND_CAVIUM_27456)) { str = "ARM64_WORKAROUND_CAVIUM_27456"; __kpti_forced = -1; } + /* Useful for KASLR robustness */ + if (kaslr_enabled() && kaslr_requires_kpti()) { + if (!__kpti_forced) { + str = "KASLR"; + __kpti_forced = 1; + } + } + + if (cpu_mitigations_off() && !__kpti_forced) { + str = "mitigations=off"; + __kpti_forced = -1; + } + + if (!IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) { + pr_info_once("kernel page table isolation disabled by kernel configuration\n"); + return false; + } + /* Forced? 
*/ if (__kpti_forced) { pr_info_once("kernel page table isolation forced %s by %s\n", @@ -981,52 +1862,100 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, return __kpti_forced > 0; } - /* Useful for KASLR robustness */ - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - return kaslr_offset() > 0; + return !meltdown_safe; +} - /* Don't force KPTI for CPUs that are not vulnerable */ - if (is_midr_in_range_list(read_cpuid_id(), kpti_safe_list)) - return false; +static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope) +{ + /* + * Although the Apple M2 family appears to support NV1, the + * PTW barfs on the nVHE EL2 S1 page table format. Pretend + * that it doesn't support NV1 at all. + */ + static const struct midr_range nv1_ni_list[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {} + }; - /* Defer to CPU feature registers */ - return !has_cpuid_feature(entry, scope); + return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) && + !(has_cpuid_feature(entry, scope) || + is_midr_in_range_list(nv1_ni_list))); } -static void -kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) +#if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) +static bool has_lpa2_at_stage1(u64 mmfr0) +{ + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_LPA2; +} + +static bool has_lpa2_at_stage2(u64 mmfr0) +{ + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_2_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2; +} + +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 mmfr0; + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + return has_lpa2_at_stage1(mmfr0) && has_lpa2_at_stage2(mmfr0); +} +#else +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) { - typedef void (kpti_remap_fn)(int, int, phys_addr_t); - extern kpti_remap_fn idmap_kpti_install_ng_mappings; - kpti_remap_fn *remap_fn; + return false; +} +#endif - static bool kpti_applied = false; - int cpu = smp_processor_id(); +#ifdef CONFIG_HW_PERF_EVENTS +static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + unsigned int pmuver; /* - * We don't need to rewrite the page-tables if either we've done - * it already or we have KASLR enabled and therefore have not - * created any global mappings at all. + * PMUVer follows the standard ID scheme for an unsigned field with the + * exception of 0xF (IMP_DEF) which is treated specially and implies + * FEAT_PMUv3 is not implemented. + * + * See DDI0487L.a D24.1.3.2 for more details. 
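unmap_kernel_at_el0() above folds several inputs into one decision. Leaving out the CONFIG_UNMAP_KERNEL_AT_EL0 gate and the printed reasons, the ordering can be summarised in a small stand-alone sketch (need_kpti() and its boolean parameters are invented; the tri-state 'forced' mirrors __kpti_forced):

#include <stdbool.h>
#include <stdio.h>

static bool need_kpti(bool meltdown_safe, bool cavium_27456,
		      bool kaslr_needs_kpti, bool mitigations_off,
		      int forced /* 0: not forced, >0: forced on, <0: forced off */)
{
	if (cavium_27456)
		forced = -1;		/* KPTI breaks these parts: never enable it */
	if (kaslr_needs_kpti && !forced)
		forced = 1;		/* keep KASLR effective */
	if (mitigations_off && !forced)
		forced = -1;

	if (forced)
		return forced > 0;

	return !meltdown_safe;		/* otherwise trust the CPU's own claim */
}

int main(void)
{
	printf("%d\n", need_kpti(true,  false, true,  false, 0)); /* 1: KASLR forces it on */
	printf("%d\n", need_kpti(false, false, false, true,  0)); /* 0: mitigations=off    */
	printf("%d\n", need_kpti(false, false, false, false, 0)); /* 1: not meltdown-safe  */
	return 0;
}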
*/ - if (kpti_applied || kaslr_offset() > 0) - return; + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return false; - remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); + return pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP; +} +#endif - cpu_install_idmap(); - remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir)); - cpu_uninstall_idmap(); +static void cpu_enable_kpti(struct arm64_cpu_capabilities const *cap) +{ + if (__this_cpu_read(this_cpu_vector) == vectors) { + const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI); - if (!cpu) - kpti_applied = true; + __this_cpu_write(this_cpu_vector, v); + } - return; } static int __init parse_kpti(char *str) { bool enabled; - int ret = strtobool(str, &enabled); + int ret = kstrtobool(str, &enabled); if (ret) return ret; @@ -1035,15 +1964,17 @@ static int __init parse_kpti(char *str) return 0; } early_param("kpti", parse_kpti); -#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ #ifdef CONFIG_ARM64_HW_AFDBM +static struct cpumask dbm_cpus __read_mostly; + static inline void __cpu_enable_hw_dbm(void) { - u64 tcr = read_sysreg(tcr_el1) | TCR_HD; + u64 tcr = read_sysreg(tcr_el1) | TCR_EL1_HD; write_sysreg(tcr, tcr_el1); isb(); + local_flush_tlb_all(); } static bool cpu_has_broken_dbm(void) @@ -1051,12 +1982,17 @@ static bool cpu_has_broken_dbm(void) /* List of CPUs which have broken DBM support. */ static const struct midr_range cpus[] = { #ifdef CONFIG_ARM64_ERRATUM_1024718 - MIDR_RANGE(MIDR_CORTEX_A55, 0, 0, 1, 0), // A55 r0p0 -r1p0 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + /* Kryo4xx Silver (rdpe => r1p0) */ + MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2051678 + MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), #endif {}, }; - return is_midr_in_range_list(read_cpuid_id(), cpus); + return is_midr_in_range_list(cpus); } static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap) @@ -1067,42 +2003,87 @@ static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap) static void cpu_enable_hw_dbm(struct arm64_cpu_capabilities const *cap) { - if (cpu_can_use_dbm(cap)) + if (cpu_can_use_dbm(cap)) { __cpu_enable_hw_dbm(); + cpumask_set_cpu(smp_processor_id(), &dbm_cpus); + } } static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap, int __unused) { - static bool detected = false; /* * DBM is a non-conflicting feature. i.e, the kernel can safely * run a mix of CPUs with and without the feature. So, we * unconditionally enable the capability to allow any late CPU * to use the feature. We only enable the control bits on the - * CPU, if it actually supports. - * - * We have to make sure we print the "feature" detection only - * when at least one CPU actually uses it. So check if this CPU - * can actually use it and print the message exactly once. - * - * This is safe as all CPUs (including secondary CPUs - due to the - * LOCAL_CPU scope - and the hotplugged CPUs - via verification) - * goes through the "matches" check exactly once. Also if a CPU - * matches the criteria, it is guaranteed that the CPU will turn - * the DBM on, as the capability is unconditionally enabled. + * CPU, if it is supported. 
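has_pmuv3() above works around the one irregularity in the PMUVer field: 0xF (IMP_DEF) is not "a very new PMU" but "no architected PMU at all", so it has to be rejected before the usual unsigned comparison. A compact stand-alone restatement, with the field encodings as used in the check above:

#include <stdbool.h>
#include <stdio.h>

#define PMUVER_NI	0x0	/* no PMU */
#define PMUVER_IMP	0x1	/* baseline PMUv3 */
#define PMUVER_IMP_DEF	0xf	/* IMPLEMENTATION DEFINED PMU, not PMUv3 */

static bool has_pmuv3(unsigned int pmuver)
{
	if (pmuver == PMUVER_IMP_DEF)
		return false;

	return pmuver >= PMUVER_IMP;
}

int main(void)
{
	printf("%d %d %d\n", has_pmuv3(PMUVER_NI), has_pmuv3(0x4),
	       has_pmuv3(PMUVER_IMP_DEF));	/* 0 1 0 */
	return 0;
}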
*/ - if (!detected && cpu_can_use_dbm(cap)) { - detected = true; - pr_info("detected: Hardware dirty bit management\n"); - } return true; } #endif -#ifdef CONFIG_ARM64_VHE +#ifdef CONFIG_ARM64_AMU_EXTN + +/* + * The "amu_cpus" cpumask only signals that the CPU implementation for the + * flagged CPUs supports the Activity Monitors Unit (AMU) but does not provide + * information regarding all the events that it supports. When a CPU bit is + * set in the cpumask, the user of this feature can only rely on the presence + * of the 4 fixed counters for that CPU. But this does not guarantee that the + * counters are enabled or access to these counters is enabled by code + * executed at higher exception levels (firmware). + */ +static struct cpumask amu_cpus __read_mostly; + +bool cpu_has_amu_feat(int cpu) +{ + return cpumask_test_cpu(cpu, &amu_cpus); +} + +int get_cpu_with_amu_feat(void) +{ + return cpumask_any(&amu_cpus); +} + +static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap) +{ + if (has_cpuid_feature(cap, SCOPE_LOCAL_CPU)) { + cpumask_set_cpu(smp_processor_id(), &amu_cpus); + + /* 0 reference values signal broken/disabled counters */ + if (!this_cpu_has_cap(ARM64_WORKAROUND_2457168)) + update_freq_counters_refs(); + } +} + +static bool has_amu(const struct arm64_cpu_capabilities *cap, + int __unused) +{ + /* + * The AMU extension is a non-conflicting feature: the kernel can + * safely run a mix of CPUs with and without support for the + * activity monitors extension. Therefore, unconditionally enable + * the capability to allow any late CPU to use the feature. + * + * With this feature unconditionally enabled, the cpu_enable + * function will be called for all CPUs that match the criteria, + * including secondary and hotplugged, marking this feature as + * present on that respective CPU. The enable function will also + * print a detection message. + */ + + return true; +} +#else +int get_cpu_with_amu_feat(void) +{ + return nr_cpu_ids; +} +#endif + static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused) { return is_kernel_in_hyp_mode(); @@ -1118,60 +2099,70 @@ static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused) * that, freshly-onlined CPUs will set tpidr_el2, so we don't need to * do anything here. 
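HW_DBM and the AMU capability above follow the same pattern: the capability itself is reported unconditionally (it is "non-conflicting", so a mix of CPUs is fine), while a cpumask records which CPUs actually switched the hardware control on. A toy model of that split, with an 8-bit mask standing in for the cpumask and an invented per-CPU erratum check:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS	8

static uint8_t dbm_cpus;		/* stand-in for the dbm_cpus cpumask */

static bool cpu_can_use_dbm(int cpu)
{
	return cpu != 3;		/* pretend CPU 3 has a broken-DBM erratum */
}

static void cpu_enable_hw_dbm(int cpu)
{
	if (cpu_can_use_dbm(cpu))
		dbm_cpus |= 1u << cpu;	/* the real code also sets TCR_EL1.HD here */
}

static bool has_hw_dbm(void)
{
	return true;			/* safe to run a mix of CPUs with and without DBM */
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		cpu_enable_hw_dbm(cpu);

	printf("capability: %d, enabled CPUs: 0x%02x\n", has_hw_dbm(), dbm_cpus);
	return 0;
}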
*/ - if (!alternatives_applied) + if (!alternative_is_applied(ARM64_HAS_VIRT_HOST_EXTN)) write_sysreg(read_sysreg(tpidr_el1), tpidr_el2); } -#endif -static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused) +static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, + int scope) { - u64 val = read_sysreg_s(SYS_CLIDR_EL1); + if (kvm_get_mode() != KVM_MODE_NV) + return false; - /* Check that CLIDR_EL1.LOU{U,IS} are both 0 */ - WARN_ON(val & (7 << 27 | 7 << 21)); + if (!cpucap_multi_entry_cap_matches(cap, scope)) { + pr_warn("unavailable: %s\n", cap->desc); + return false; + } + + return true; } -#ifdef CONFIG_ARM64_SSBD -static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr) +static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, + int __unused) { - if (user_mode(regs)) - return 1; - - if (instr & BIT(PSTATE_Imm_shift)) - regs->pstate |= PSR_SSBS_BIT; - else - regs->pstate &= ~PSR_SSBS_BIT; - - arm64_skip_faulting_instruction(regs, 4); - return 0; + return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE); } -static struct undef_hook ssbs_emulation_hook = { - .instr_mask = ~(1U << PSTATE_Imm_shift), - .instr_val = 0xd500401f | PSTATE_SSBS, - .fn = ssbs_emulation_handler, -}; - -static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused) +bool cpu_supports_bbml2_noabort(void) { - static bool undef_hook_registered = false; - static DEFINE_SPINLOCK(hook_lock); + /* + * We want to allow usage of BBML2 in as wide a range of kernel contexts + * as possible. This list is therefore an allow-list of known-good + * implementations that both support BBML2 and additionally, fulfill the + * extra constraint of never generating TLB conflict aborts when using + * the relaxed BBML2 semantics (such aborts make use of BBML2 in certain + * kernel contexts difficult to prove safe against recursive aborts). + * + * Note that implementations can only be considered "known-good" if their + * implementors attest to the fact that the implementation never raises + * TLB conflict aborts for BBML2 mapping granularity changes. + */ + static const struct midr_range supports_bbml2_noabort_list[] = { + MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf), + MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf), + MIDR_REV_RANGE(MIDR_NEOVERSE_V3AE, 0, 2, 0xf), + MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {} + }; - spin_lock(&hook_lock); - if (!undef_hook_registered) { - register_undef_hook(&ssbs_emulation_hook); - undef_hook_registered = true; - } - spin_unlock(&hook_lock); + /* Does our cpu guarantee to never raise TLB conflict aborts? */ + if (!is_midr_in_range_list(supports_bbml2_noabort_list)) + return false; - if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { - sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); - arm64_set_ssbd_mitigation(false); - } else { - arm64_set_ssbd_mitigation(true); - } + /* + * We currently ignore the ID_AA64MMFR2_EL1 register, and only care + * about whether the MIDR check passes. 
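Several of the checks in this hunk (kpti_safe_list, nv1_ni_list, supports_bbml2_noabort_list) reduce to matching MIDR_EL1 against a table of implementer/part-number ranges. A self-contained approximation of is_midr_in_range_list() follows; struct midr_range, the compact (variant << 4) | revision encoding and the part number are invented for the example, only the MIDR_EL1 field layout is architectural:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct midr_range {
	uint32_t model;			/* implementer/architecture/partnum, var/rev zeroed */
	uint16_t rv_min, rv_max;	/* (variant << 4) | revision */
};

#define MIDR_MODEL_MASK	0xff0ffff0u	/* implementer, architecture, partnum fields */

static bool midr_in_range(uint32_t midr, const struct midr_range *r)
{
	uint16_t rv = (((midr >> 20) & 0xf) << 4) | (midr & 0xf);

	return (midr & MIDR_MODEL_MASK) == (r->model & MIDR_MODEL_MASK) &&
	       rv >= r->rv_min && rv <= r->rv_max;
}

static bool midr_in_range_list(uint32_t midr, const struct midr_range *list)
{
	for (; list->model; list++)
		if (midr_in_range(midr, list))
			return true;
	return false;
}

int main(void)
{
	/* Made-up part 0xabc from implementer 0x41, allowed from r0p3 onwards. */
	static const struct midr_range allow[] = {
		{ .model = 0x410fabc0, .rv_min = 0x03, .rv_max = 0xff },
		{ 0 }
	};
	uint32_t midr = (0x41u << 24) | (0x1u << 20) | (0xfu << 16) |
			(0xabcu << 4) | 0x2;	/* r1p2 */

	printf("%d\n", midr_in_range_list(midr, allow));	/* 1 */
	return 0;
}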
+ */ + + return true; +} + +static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) +{ + return cpu_supports_bbml2_noabort(); } -#endif /* CONFIG_ARM64_SSBD */ #ifdef CONFIG_ARM64_PAN static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) @@ -1183,7 +2174,7 @@ static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) WARN_ON_ONCE(in_interrupt()); sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0); - asm(SET_PSTATE_PAN(1)); + set_pstate_pan(1); } #endif /* CONFIG_ARM64_PAN */ @@ -1193,26 +2184,362 @@ static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused) /* Firmware may have left a deferred SError in this register. */ write_sysreg_s(0, SYS_DISR_EL1); } +static bool has_rasv1p1(const struct arm64_cpu_capabilities *__unused, int scope) +{ + const struct arm64_cpu_capabilities rasv1p1_caps[] = { + { + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, V1P1) + }, + { + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP) + }, + { + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, RAS_frac, RASv1p1) + }, + }; + + return (has_cpuid_feature(&rasv1p1_caps[0], scope) || + (has_cpuid_feature(&rasv1p1_caps[1], scope) && + has_cpuid_feature(&rasv1p1_caps[2], scope))); +} #endif /* CONFIG_ARM64_RAS_EXTN */ #ifdef CONFIG_ARM64_PTR_AUTH -static void cpu_enable_address_auth(struct arm64_cpu_capabilities const *cap) +static bool has_address_auth_cpucap(const struct arm64_cpu_capabilities *entry, int scope) +{ + int boot_val, sec_val; + + /* We don't expect to be called with SCOPE_SYSTEM */ + WARN_ON(scope == SCOPE_SYSTEM); + /* + * The ptr-auth feature levels are not intercompatible with lower + * levels. Hence we must match ptr-auth feature level of the secondary + * CPUs with that of the boot CPU. The level of boot cpu is fetched + * from the sanitised register whereas direct register read is done for + * the secondary CPUs. + * The sanitised feature state is guaranteed to match that of the + * boot CPU as a mismatched secondary CPU is parked before it gets + * a chance to update the state, with the capability. 
+ */ + boot_val = cpuid_feature_extract_field(read_sanitised_ftr_reg(entry->sys_reg), + entry->field_pos, entry->sign); + if (scope & SCOPE_BOOT_CPU) + return boot_val >= entry->min_field_value; + /* Now check for the secondary CPUs with SCOPE_LOCAL_CPU scope */ + sec_val = cpuid_feature_extract_field(__read_sysreg_by_encoding(entry->sys_reg), + entry->field_pos, entry->sign); + return (sec_val >= entry->min_field_value) && (sec_val == boot_val); +} + +static bool has_address_auth_metacap(const struct arm64_cpu_capabilities *entry, + int scope) +{ + bool api = has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_IMP_DEF], scope); + bool apa = has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5], scope); + bool apa3 = has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3], scope); + + return apa || apa3 || api; +} + +static bool has_generic_auth(const struct arm64_cpu_capabilities *entry, + int __unused) { - sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | - SCTLR_ELx_ENDA | SCTLR_ELx_ENDB); + bool gpi = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_IMP_DEF); + bool gpa = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5); + bool gpa3 = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3); + + return gpa || gpa3 || gpi; } #endif /* CONFIG_ARM64_PTR_AUTH */ +#ifdef CONFIG_ARM64_E0PD +static void cpu_enable_e0pd(struct arm64_cpu_capabilities const *cap) +{ + if (this_cpu_has_cap(ARM64_HAS_E0PD)) + sysreg_clear_set(tcr_el1, 0, TCR_EL1_E0PD1); +} +#endif /* CONFIG_ARM64_E0PD */ + +#ifdef CONFIG_ARM64_PSEUDO_NMI +static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, + int scope) +{ + /* + * ARM64_HAS_GICV3_CPUIF has a lower index, and is a boot CPU + * feature, so will be detected earlier. + */ + BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GICV3_CPUIF); + if (!cpus_have_cap(ARM64_HAS_GICV3_CPUIF)) + return false; + + return enable_pseudo_nmi; +} + +static bool has_gic_prio_relaxed_sync(const struct arm64_cpu_capabilities *entry, + int scope) +{ + /* + * If we're not using priority masking then we won't be poking PMR_EL1, + * and there's no need to relax synchronization of writes to it, and + * ICC_CTLR_EL1 might not be accessible and we must avoid reads from + * that. + * + * ARM64_HAS_GIC_PRIO_MASKING has a lower index, and is a boot CPU + * feature, so will be detected earlier. + */ + BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_RELAXED_SYNC <= ARM64_HAS_GIC_PRIO_MASKING); + if (!cpus_have_cap(ARM64_HAS_GIC_PRIO_MASKING)) + return false; + + /* + * When Priority Mask Hint Enable (PMHE) == 0b0, PMR is not used as a + * hint for interrupt distribution, a DSB is not necessary when + * unmasking IRQs via PMR, and we can relax the barrier to a NOP. + * + * Linux itself doesn't use 1:N distribution, so has no need to + * set PMHE. The only reason to have it set is if EL3 requires it + * (and we can't change it). 
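has_address_auth_cpucap() above encodes an asymmetry: the boot CPU only has to clear the minimum level, but a late CPU must report exactly the boot CPU's level, because the pointer-authentication levels do not interoperate. Restated as a tiny stand-alone check (auth_ok() and its parameters are invented names):

#include <stdbool.h>
#include <stdio.h>

static bool auth_ok(int val, int boot_val, int min, bool is_boot_cpu)
{
	if (is_boot_cpu)
		return val >= min;

	/* late CPUs must match the boot CPU's level exactly */
	return val >= min && val == boot_val;
}

int main(void)
{
	printf("%d\n", auth_ok(2, 2, 1, false));	/* 1: same level as the boot CPU */
	printf("%d\n", auth_ok(3, 2, 1, false));	/* 0: higher level, CPU gets parked */
	return 0;
}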
+ */ + return (gic_read_ctlr() & ICC_CTLR_EL1_PMHE_MASK) == 0; +} +#endif + +static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry, + int scope) +{ + static const struct midr_range has_vgic_v3[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, + }; + struct arm_smccc_res res = {}; + + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV3_CPUIF); + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV5_LEGACY); + if (!this_cpu_has_cap(ARM64_HAS_GICV3_CPUIF) && + !is_midr_in_range_list(has_vgic_v3)) + return false; + + if (!is_hyp_mode_available()) + return false; + + if (this_cpu_has_cap(ARM64_HAS_GICV5_LEGACY)) + return true; + + if (is_kernel_in_hyp_mode()) + res.a1 = read_sysreg_s(SYS_ICH_VTR_EL2); + else + arm_smccc_1_1_hvc(HVC_GET_ICH_VTR_EL2, &res); + + if (res.a0 == HVC_STUB_ERR) + return false; + + return res.a1 & ICH_VTR_EL2_TDS; +} + +#ifdef CONFIG_ARM64_BTI +static void bti_enable(const struct arm64_cpu_capabilities *__unused) +{ + /* + * Use of X16/X17 for tail-calls and trampolines that jump to + * function entry points using BR is a requirement for + * marking binaries with GNU_PROPERTY_AARCH64_FEATURE_1_BTI. + * So, be strict and forbid other BRs using other registers to + * jump onto a PACIxSP instruction: + */ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_BT0 | SCTLR_EL1_BT1); + isb(); +} +#endif /* CONFIG_ARM64_BTI */ + +#ifdef CONFIG_ARM64_MTE +static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) +{ + static bool cleared_zero_page = false; + + sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0); + + mte_cpu_setup(); + + /* + * Clear the tags in the zero page. This needs to be done via the + * linear map which has the Tagged attribute. Since this page is + * always mapped as pte_special(), set_pte_at() will not attempt to + * clear the tags or set PG_mte_tagged. 
+ */ + if (!cleared_zero_page) { + cleared_zero_page = true; + mte_clear_page_tags(lm_alias(empty_zero_page)); + } + + kasan_init_hw_tags_cpu(); +} +#endif /* CONFIG_ARM64_MTE */ + +static void user_feature_fixup(void) +{ + if (cpus_have_cap(ARM64_WORKAROUND_2658417)) { + struct arm64_ftr_reg *regp; + + regp = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1); + if (regp) + regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK; + } + + if (cpus_have_cap(ARM64_WORKAROUND_SPECULATIVE_SSBS)) { + struct arm64_ftr_reg *regp; + + regp = get_arm64_ftr_reg(SYS_ID_AA64PFR1_EL1); + if (regp) + regp->user_mask &= ~ID_AA64PFR1_EL1_SSBS_MASK; + } +} + +static void elf_hwcap_fixup(void) +{ +#ifdef CONFIG_COMPAT + if (cpus_have_cap(ARM64_WORKAROUND_1742098)) + compat_elf_hwcap2 &= ~COMPAT_HWCAP2_AES; +#endif /* CONFIG_COMPAT */ +} + +#ifdef CONFIG_KVM +static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused) +{ + return kvm_get_mode() == KVM_MODE_PROTECTED; +} +#endif /* CONFIG_KVM */ + +static void cpu_trap_el0_impdef(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_TIDCP); +} + +static void cpu_enable_dit(const struct arm64_cpu_capabilities *__unused) +{ + set_pstate_dit(1); +} + +static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn); +} + +#ifdef CONFIG_ARM64_POE +static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1_E0POE); + sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_E0POE); +} +#endif + +#ifdef CONFIG_ARM64_GCS +static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused) +{ + /* GCSPR_EL0 is always readable */ + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); +} +#endif + +/* Internal helper functions to match cpu capability type */ +static bool +cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) +{ + return !!(cap->type & ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU); +} + +static bool +cpucap_late_cpu_permitted(const struct arm64_cpu_capabilities *cap) +{ + return !!(cap->type & ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU); +} + +static bool +cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap) +{ + return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT); +} + +static bool +test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return false; + + /* Check firmware actually enabled MPAM on this cpu. */ + return (read_sysreg_s(SYS_MPAM1_EL1) & MPAM1_EL1_MPAMEN); +} + +static void +cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) +{ + /* + * Access by the kernel (at EL1) should use the reserved PARTID + * which is configured unrestricted. This avoids priority-inversion + * where latency sensitive tasks have to wait for a task that has + * been throttled to release the lock. 
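user_feature_fixup() above hides a field from userspace by clearing its bits in the register's user_mask; has_user_cpuid_feature() (earlier in this hunk) then reports the feature as absent even though the hardware advertises it. A toy model of the two halves together (struct ftr_reg, user_visible() and the field position are invented; the simplified "non-zero means present" test stands in for the full min/max window):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ftr_reg {
	uint64_t sys_val;	/* sanitised register value */
	uint64_t user_mask;	/* fields still exposed to userspace */
};

static bool user_visible(const struct ftr_reg *reg, unsigned int shift,
			 unsigned int width)
{
	uint64_t mask = ((1ULL << width) - 1) << shift;

	if (!(reg->user_mask & mask))
		return false;			/* field hidden from userspace */
	return (reg->sys_val & mask) != 0;	/* simplified "field >= 1" test */
}

int main(void)
{
	struct ftr_reg isar1 = { .sys_val = 1ULL << 44, .user_mask = ~0ULL };

	printf("before fixup: %d\n", user_visible(&isar1, 44, 4));	/* 1 */
	isar1.user_mask &= ~(0xfULL << 44);	/* erratum workaround hides the field */
	printf("after fixup:  %d\n", user_visible(&isar1, 44, 4));	/* 0 */
	return 0;
}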
+ */ + write_sysreg_s(0, SYS_MPAM1_EL1); +} + +static bool +test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + + return idr & MPAMIDR_EL1_HAS_HCR; +} + +static bool +test_has_gicv5_legacy(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!this_cpu_has_cap(ARM64_HAS_GICV5_CPUIF)) + return false; + + return !!(read_sysreg_s(SYS_ICC_IDR0_EL1) & ICC_IDR0_EL1_GCIE_LEGACY); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { - .desc = "GIC system register CPU interface", - .capability = ARM64_HAS_SYSREG_GIC_CPUIF, + .capability = ARM64_ALWAYS_BOOT, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_always, + }, + { + .capability = ARM64_ALWAYS_SYSTEM, .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_always, + }, + { + .desc = "GICv3 CPU interface", + .capability = ARM64_HAS_GICV3_CPUIF, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_useable_gicv3_cpuif, - .sys_reg = SYS_ID_AA64PFR0_EL1, - .field_pos = ID_AA64PFR0_GIC_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = 1, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, GIC, IMP) + }, + { + .desc = "Enhanced Counter Virtualization", + .capability = ARM64_HAS_ECV, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, ECV, IMP) + }, + { + .desc = "Enhanced Counter Virtualization (CNTPOFF)", + .capability = ARM64_HAS_ECV_CNTPOFF, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, ECV, CNTPOFF) }, #ifdef CONFIG_ARM64_PAN { @@ -1220,95 +2547,99 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_PAN, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64MMFR1_EL1, - .field_pos = ID_AA64MMFR1_PAN_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = 1, .cpu_enable = cpu_enable_pan, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, PAN, IMP) }, #endif /* CONFIG_ARM64_PAN */ -#if defined(CONFIG_AS_LSE) && defined(CONFIG_ARM64_LSE_ATOMICS) +#ifdef CONFIG_ARM64_EPAN + { + .desc = "Enhanced Privileged Access Never", + .capability = ARM64_HAS_EPAN, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, PAN, PAN3) + }, +#endif /* CONFIG_ARM64_EPAN */ +#ifdef CONFIG_ARM64_LSE_ATOMICS { .desc = "LSE atomic instructions", .capability = ARM64_HAS_LSE_ATOMICS, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_ATOMICS_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = 2, + ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, ATOMIC, IMP) }, -#endif /* CONFIG_AS_LSE && CONFIG_ARM64_LSE_ATOMICS */ +#endif /* CONFIG_ARM64_LSE_ATOMICS */ { - .desc = "Software prefetching using PRFM", - .capability = ARM64_HAS_NO_HW_PREFETCH, - .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, - .matches = has_no_hw_prefetch, + .desc = "Virtualization Host Extensions", + .capability = ARM64_HAS_VIRT_HOST_EXTN, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .matches = runs_at_el2, + .cpu_enable = cpu_copy_el2regs, }, -#ifdef CONFIG_ARM64_UAO { - .desc = "User Access Override", - .capability = ARM64_HAS_UAO, + .desc = "Nested Virtualization Support", + .capability = ARM64_HAS_NESTED_VIRT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .field_pos = ID_AA64MMFR2_UAO_SHIFT, - .min_field_value = 1, - /* - * We rely 
on stop_machine() calling uao_thread_switch() to set - * UAO immediately after patching. - */ + .matches = has_nested_virt_support, + .match_list = (const struct arm64_cpu_capabilities []){ + { + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) + }, + { + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) + }, + { /* Sentinel */ } + }, }, -#endif /* CONFIG_ARM64_UAO */ -#ifdef CONFIG_ARM64_PAN { - .capability = ARM64_ALT_PAN_NOT_UAO, + .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .matches = cpufeature_pan_not_uao, + .matches = has_32bit_el0, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, EL0, AARCH32) }, -#endif /* CONFIG_ARM64_PAN */ -#ifdef CONFIG_ARM64_VHE +#ifdef CONFIG_KVM { - .desc = "Virtualization Host Extensions", - .capability = ARM64_HAS_VIRT_HOST_EXTN, - .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, - .matches = runs_at_el2, - .cpu_enable = cpu_copy_el2regs, + .desc = "32-bit EL1 Support", + .capability = ARM64_HAS_32BIT_EL1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, EL1, AARCH32) }, -#endif /* CONFIG_ARM64_VHE */ { - .desc = "32-bit EL0 Support", - .capability = ARM64_HAS_32BIT_EL0, + .desc = "Protected KVM", + .capability = ARM64_KVM_PROTECTED_MODE, .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = is_kvm_protected_mode, + }, + { + .desc = "HCRX_EL2 register", + .capability = ARM64_HAS_HCX, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64PFR0_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_EL0_SHIFT, - .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HCX, IMP) }, -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#endif { .desc = "Kernel page table isolation (KPTI)", .capability = ARM64_UNMAP_KERNEL_AT_EL0, .type = ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE, + .cpu_enable = cpu_enable_kpti, + .matches = unmap_kernel_at_el0, /* * The ID feature fields below are used to indicate that * the CPU doesn't need KPTI. See unmap_kernel_at_el0 for * more details. 
*/ - .sys_reg = SYS_ID_AA64PFR0_EL1, - .field_pos = ID_AA64PFR0_CSV3_SHIFT, - .min_field_value = 1, - .matches = unmap_kernel_at_el0, - .cpu_enable = kpti_install_ng_mappings, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, CSV3, IMP) }, -#endif { - /* FP/SIMD is not implemented */ - .capability = ARM64_HAS_NO_FPSIMD, + .capability = ARM64_HAS_FPSIMD, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .min_field_value = 0, - .matches = has_no_fpsimd, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_fpsimd, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, FP, IMP) }, #ifdef CONFIG_ARM64_PMEM { @@ -1316,9 +2647,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_DCPOP, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .field_pos = ID_AA64ISAR1_DPB_SHIFT, - .min_field_value = 1, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, DPB, IMP) + }, + { + .desc = "Data cache clean to Point of Deep Persistence", + .capability = ARM64_HAS_DCPODP, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, DPB, DPB2) }, #endif #ifdef CONFIG_ARM64_SVE @@ -1326,12 +2662,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .desc = "Scalable Vector Extension", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_SVE, - .sys_reg = SYS_ID_AA64PFR0_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_SVE_SHIFT, - .min_field_value = ID_AA64PFR0_SVE, + .cpu_enable = cpu_enable_sve, .matches = has_cpuid_feature, - .cpu_enable = sve_kernel_enable, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, SVE, IMP) }, #endif /* CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_RAS_EXTN @@ -1340,13 +2673,27 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_RAS_EXTN, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64PFR0_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64PFR0_RAS_SHIFT, - .min_field_value = ID_AA64PFR0_RAS_V1, .cpu_enable = cpu_clear_disr, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP) + }, + { + .desc = "RASv1p1 Extension Support", + .capability = ARM64_HAS_RASV1P1_EXTN, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_rasv1p1, }, #endif /* CONFIG_ARM64_RAS_EXTN */ +#ifdef CONFIG_ARM64_AMU_EXTN + { + .desc = "Activity Monitors Unit (AMU)", + .capability = ARM64_HAS_AMU_EXTN, + .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + .matches = has_amu, + .cpu_enable = cpu_amu_enable, + .cpus = &amu_cpus, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, AMU, IMP) + }, +#endif /* CONFIG_ARM64_AMU_EXTN */ { .desc = "Data cache clean to the PoU not required for I/D coherence", .capability = ARM64_HAS_CACHE_IDC, @@ -1364,31 +2711,47 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .desc = "Stage-2 Force Write-Back", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_HAS_STAGE2_FWB, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR2_FWB_SHIFT, - .min_field_value = 1, .matches = has_cpuid_feature, - .cpu_enable = cpu_has_fwb, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, FWB, IMP) + }, + { + .desc = "ARMv8.4 Translation Table Level", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_ARMv8_4_TTL, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, TTL, IMP) + }, + { + .desc = "TLB range maintenance instructions", + .capability = ARM64_HAS_TLB_RANGE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + 
ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, TLB, RANGE) }, #ifdef CONFIG_ARM64_HW_AFDBM { - /* - * Since we turn this on always, we don't want the user to - * think that the feature is available when it may not be. - * So hide the description. - * - * .desc = "Hardware pagetable Dirty Bit Management", - * - */ + .desc = "Hardware dirty bit management", .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, .capability = ARM64_HW_DBM, - .sys_reg = SYS_ID_AA64MMFR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR1_HADBS_SHIFT, - .min_field_value = 2, .matches = has_hw_dbm, .cpu_enable = cpu_enable_hw_dbm, + .cpus = &dbm_cpus, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM) + }, +#endif +#ifdef CONFIG_ARM64_HAFT + { + .desc = "Hardware managed Access Flag for Table Descriptors", + /* + * Contrary to the page/block access flag, the table access flag + * cannot be emulated in software (no access fault will occur). + * Therefore this should be used only if it's supported system + * wide. + */ + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAFT, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT) }, #endif { @@ -1396,34 +2759,23 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_CRC32, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64ISAR0_EL1, - .field_pos = ID_AA64ISAR0_CRC32_SHIFT, - .min_field_value = 1, + ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, CRC32, IMP) }, -#ifdef CONFIG_ARM64_SSBD { .desc = "Speculative Store Bypassing Safe (SSBS)", .capability = ARM64_SSBS, - .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64PFR1_EL1, - .field_pos = ID_AA64PFR1_SSBS_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY, - .cpu_enable = cpu_enable_ssbs, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SSBS, IMP) }, -#endif #ifdef CONFIG_ARM64_CNP { .desc = "Common not Private translations", .capability = ARM64_HAS_CNP, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_useable_cnp, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR2_CNP_SHIFT, - .min_field_value = 1, .cpu_enable = cpu_enable_cnp, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, CnP, IMP) }, #endif { @@ -1431,64 +2783,377 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_SB, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .field_pos = ID_AA64ISAR1_SB_SHIFT, - .sign = FTR_UNSIGNED, - .min_field_value = 1, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, SB, IMP) }, #ifdef CONFIG_ARM64_PTR_AUTH { - .desc = "Address authentication (architected algorithm)", - .capability = ARM64_HAS_ADDRESS_AUTH_ARCH, - .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_APA_SHIFT, - .min_field_value = ID_AA64ISAR1_APA_ARCHITECTED, - .matches = has_cpuid_feature, - .cpu_enable = cpu_enable_address_auth, + .desc = "Address authentication (architected QARMA5 algorithm)", + .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_address_auth_cpucap, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, APA, PAuth) + }, + { + .desc = "Address authentication (architected QARMA3 algorithm)", + .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_address_auth_cpucap, + 
ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, APA3, PAuth) }, { .desc = "Address authentication (IMP DEF algorithm)", .capability = ARM64_HAS_ADDRESS_AUTH_IMP_DEF, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_address_auth_cpucap, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, API, PAuth) + }, + { + .capability = ARM64_HAS_ADDRESS_AUTH, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_address_auth_metacap, + }, + { + .desc = "Generic authentication (architected QARMA5 algorithm)", + .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_API_SHIFT, - .min_field_value = ID_AA64ISAR1_API_IMP_DEF, .matches = has_cpuid_feature, - .cpu_enable = cpu_enable_address_auth, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, GPA, IMP) }, { - .desc = "Generic authentication (architected algorithm)", - .capability = ARM64_HAS_GENERIC_AUTH_ARCH, + .desc = "Generic authentication (architected QARMA3 algorithm)", + .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_GPA_SHIFT, - .min_field_value = ID_AA64ISAR1_GPA_ARCHITECTED, .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, GPA3, IMP) }, { .desc = "Generic authentication (IMP DEF algorithm)", .capability = ARM64_HAS_GENERIC_AUTH_IMP_DEF, .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .sys_reg = SYS_ID_AA64ISAR1_EL1, - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64ISAR1_GPI_SHIFT, - .min_field_value = ID_AA64ISAR1_GPI_IMP_DEF, .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, GPI, IMP) + }, + { + .capability = ARM64_HAS_GENERIC_AUTH, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_generic_auth, }, #endif /* CONFIG_ARM64_PTR_AUTH */ +#ifdef CONFIG_ARM64_PSEUDO_NMI + { + /* + * Depends on having GICv3 + */ + .desc = "IRQ priority masking", + .capability = ARM64_HAS_GIC_PRIO_MASKING, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .matches = can_use_gic_priorities, + }, + { + /* + * Depends on ARM64_HAS_GIC_PRIO_MASKING + */ + .capability = ARM64_HAS_GIC_PRIO_RELAXED_SYNC, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .matches = has_gic_prio_relaxed_sync, + }, +#endif + { + /* + * Depends on having GICv3 + */ + .desc = "ICV_DIR_EL1 trapping", + .capability = ARM64_HAS_ICH_HCR_EL2_TDIR, + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, + .matches = can_trap_icv_dir_el1, + }, +#ifdef CONFIG_ARM64_E0PD + { + .desc = "E0PD", + .capability = ARM64_HAS_E0PD, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_e0pd, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, E0PD, IMP) + }, +#endif + { + .desc = "Random Number Generator", + .capability = ARM64_HAS_RNG, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, RNDR, IMP) + }, +#ifdef CONFIG_ARM64_BTI + { + .desc = "Branch Target Identification", + .capability = ARM64_BTI, +#ifdef CONFIG_ARM64_BTI_KERNEL + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, +#else + .type = ARM64_CPUCAP_SYSTEM_FEATURE, +#endif + .matches = has_cpuid_feature, + .cpu_enable = bti_enable, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, BT, IMP) + }, +#endif +#ifdef CONFIG_ARM64_MTE + { + .desc = "Memory Tagging Extension", + .capability = ARM64_MTE, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_mte, + 
ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE2) + }, + { + .desc = "Asymmetric MTE Tag Check Fault", + .capability = ARM64_MTE_ASYMM, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE3) + }, + { + .desc = "FAR on MTE Tag Check Fault", + .capability = ARM64_MTE_FAR, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, MTEFAR, IMP) + }, + { + .desc = "Store Only MTE Tag Check", + .capability = ARM64_MTE_STORE_ONLY, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, MTESTOREONLY, IMP) + }, +#endif /* CONFIG_ARM64_MTE */ + { + .desc = "RCpc load-acquire (LDAPR)", + .capability = ARM64_HAS_LDAPR, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LRCPC, IMP) + }, + { + .desc = "Fine Grained Traps", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FGT, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, IMP) + }, + { + .desc = "Fine Grained Traps 2", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FGT2, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, FGT2) + }, +#ifdef CONFIG_ARM64_SME + { + .desc = "Scalable Matrix Extension", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_sme, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, IMP) + }, + /* FA64 should be sorted after the base SME capability */ + { + .desc = "FA64", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME_FA64, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_fa64, + ARM64_CPUID_FIELDS(ID_AA64SMFR0_EL1, FA64, IMP) + }, + { + .desc = "SME2", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_SME2, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_sme2, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, SME2) + }, +#endif /* CONFIG_ARM64_SME */ + { + .desc = "WFx with timeout", + .capability = ARM64_HAS_WFXT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, WFxT, IMP) + }, + { + .desc = "Trap EL0 IMPLEMENTATION DEFINED functionality", + .capability = ARM64_HAS_TIDCP1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_trap_el0_impdef, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, TIDCP1, IMP) + }, + { + .desc = "Data independent timing control (DIT)", + .capability = ARM64_HAS_DIT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_dit, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, DIT, IMP) + }, + { + .desc = "Memory Copy and Memory Set instructions", + .capability = ARM64_HAS_MOPS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_mops, + ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, MOPS, IMP) + }, + { + .capability = ARM64_HAS_TCR2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, TCRX, IMP) + }, + { + .desc = "Stage-1 Permission Indirection Extension (S1PIE)", + .capability = ARM64_HAS_S1PIE, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1PIE, IMP) + }, + { + .desc = "VHE for hypervisor only", + .capability = ARM64_KVM_HVHE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = hvhe_possible, + }, + { + .desc = 
"Enhanced Virtualization Traps", + .capability = ARM64_HAS_EVT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP) + }, + { + .desc = "BBM Level 2 without TLB conflict abort", + .capability = ARM64_HAS_BBML2_NOABORT, + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, + .matches = has_bbml2_noabort, + }, + { + .desc = "52-bit Virtual Addressing for KVM (LPA2)", + .capability = ARM64_HAS_LPA2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_lpa2, + }, + { + .desc = "FPMR", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FPMR, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_fpmr, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, FPMR, IMP) + }, +#ifdef CONFIG_ARM64_VA_BITS_52 + { + .capability = ARM64_HAS_VA52, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, +#ifdef CONFIG_ARM64_64K_PAGES + .desc = "52-bit Virtual Addressing (LVA)", + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, VARange, 52) +#else + .desc = "52-bit Virtual Addressing (LPA2)", +#ifdef CONFIG_ARM64_4K_PAGES + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN4, 52_BIT) +#else + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN16, 52_BIT) +#endif +#endif + }, +#endif + { + .desc = "Memory Partitioning And Monitoring", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM, + .matches = test_has_mpam, + .cpu_enable = cpu_enable_mpam, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1) + }, + { + .desc = "Memory Partitioning And Monitoring Virtualisation", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM_HCR, + .matches = test_has_mpam_hcr, + }, + { + .desc = "NV1", + .capability = ARM64_HAS_HCR_NV1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_nv1, + ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1) + }, +#ifdef CONFIG_ARM64_POE + { + .desc = "Stage-1 Permission Overlay Extension (S1POE)", + .capability = ARM64_HAS_S1POE, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_poe, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) + }, +#endif +#ifdef CONFIG_ARM64_GCS + { + .desc = "Guarded Control Stack (GCS)", + .capability = ARM64_HAS_GCS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_gcs, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) + }, +#endif +#ifdef CONFIG_HW_PERF_EVENTS + { + .desc = "PMUv3", + .capability = ARM64_HAS_PMUV3, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_pmuv3, + }, +#endif + { + .desc = "SCTLR2", + .capability = ARM64_HAS_SCTLR2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, SCTLRX, IMP) + }, + { + .desc = "GICv5 CPU interface", + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .capability = ARM64_HAS_GICV5_CPUIF, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, GCIE, IMP) + }, + { + .desc = "GICv5 Legacy vCPU interface", + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, + .capability = ARM64_HAS_GICV5_LEGACY, + .matches = test_has_gicv5_legacy, + }, + { + .desc = "XNX", + .capability = ARM64_HAS_XNX, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, XNX, IMP) + }, {}, }; -#define HWCAP_CPUID_MATCH(reg, field, s, min_value) \ - .matches = has_cpuid_feature, \ - .sys_reg = reg, \ - .field_pos = field, \ - .sign = s, \ - .min_field_value = min_value, +#define HWCAP_CPUID_MATCH(reg, field, min_value) \ + .matches = 
has_user_cpuid_feature, \ + ARM64_CPUID_FIELDS(reg, field, min_value) #define __HWCAP_CAP(name, cap_type, cap) \ .desc = name, \ @@ -1496,10 +3161,10 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .hwcap_type = cap_type, \ .hwcap = cap, \ -#define HWCAP_CAP(reg, field, s, min_value, cap_type, cap) \ +#define HWCAP_CAP(reg, field, min_value, cap_type, cap) \ { \ __HWCAP_CAP(#cap, cap_type, cap) \ - HWCAP_CPUID_MATCH(reg, field, s, min_value) \ + HWCAP_CPUID_MATCH(reg, field, min_value) \ } #define HWCAP_MULTI_CAP(list, cap_type, cap) \ @@ -1509,86 +3174,243 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .match_list = list, \ } +#define HWCAP_CAP_MATCH(match, cap_type, cap) \ + { \ + __HWCAP_CAP(#cap, cap_type, cap) \ + .matches = match, \ + } + +#define HWCAP_CAP_MATCH_ID(match, reg, field, min_value, cap_type, cap) \ + { \ + __HWCAP_CAP(#cap, cap_type, cap) \ + HWCAP_CPUID_MATCH(reg, field, min_value) \ + .matches = match, \ + } + #ifdef CONFIG_ARM64_PTR_AUTH static const struct arm64_cpu_capabilities ptr_auth_hwcap_addr_matches[] = { { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_APA_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_APA_ARCHITECTED) + HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, APA, PAuth) }, { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_API_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_API_IMP_DEF) + HWCAP_CPUID_MATCH(ID_AA64ISAR2_EL1, APA3, PAuth) + }, + { + HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, API, PAuth) }, {}, }; static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = { { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_GPA_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_GPA_ARCHITECTED) + HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, GPA, IMP) + }, + { + HWCAP_CPUID_MATCH(ID_AA64ISAR2_EL1, GPA3, IMP) }, { - HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_GPI_SHIFT, - FTR_UNSIGNED, ID_AA64ISAR1_GPI_IMP_DEF) + HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, GPI, IMP) }, {}, }; #endif +#ifdef CONFIG_ARM64_SVE +static bool has_sve_feature(const struct arm64_cpu_capabilities *cap, int scope) +{ + return system_supports_sve() && has_user_cpuid_feature(cap, scope); +} +#endif + +#ifdef CONFIG_ARM64_SME +static bool has_sme_feature(const struct arm64_cpu_capabilities *cap, int scope) +{ + return system_supports_sme() && has_user_cpuid_feature(cap, scope); +} +#endif + static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_PMULL), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_AES), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA1), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA2), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_SHA512), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_CRC32), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ATOMICS), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDRDM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM4), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, 
ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDDP), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDFHM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FLAGM), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_FP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_DIT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_DCPOP), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_JSCVT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FCMA), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_LRCPC), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ILRCPC), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SB), - HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_USCAT), + HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL), + HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES), + HWCAP_CAP(ID_AA64ISAR0_EL1, SHA1, IMP, CAP_HWCAP, KERNEL_HWCAP_SHA1), + HWCAP_CAP(ID_AA64ISAR0_EL1, SHA2, SHA256, CAP_HWCAP, KERNEL_HWCAP_SHA2), + HWCAP_CAP(ID_AA64ISAR0_EL1, SHA2, SHA512, CAP_HWCAP, KERNEL_HWCAP_SHA512), + HWCAP_CAP(ID_AA64ISAR0_EL1, CRC32, IMP, CAP_HWCAP, KERNEL_HWCAP_CRC32), + HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, IMP, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), + HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, FEAT_LSE128, CAP_HWCAP, KERNEL_HWCAP_LSE128), + HWCAP_CAP(ID_AA64ISAR0_EL1, RDM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), + HWCAP_CAP(ID_AA64ISAR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SHA3), + HWCAP_CAP(ID_AA64ISAR0_EL1, SM3, IMP, CAP_HWCAP, KERNEL_HWCAP_SM3), + HWCAP_CAP(ID_AA64ISAR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SM4), + HWCAP_CAP(ID_AA64ISAR0_EL1, DP, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), + HWCAP_CAP(ID_AA64ISAR0_EL1, FHM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), + HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM), + HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), + HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(ID_AA64ISAR3_EL1, FPRCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_FPRCVT), + HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP), + HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP), + HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), + HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, FP16, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), + HWCAP_CAP(ID_AA64PFR0_EL1, DIT, IMP, CAP_HWCAP, KERNEL_HWCAP_DIT), + HWCAP_CAP(ID_AA64PFR2_EL1, FPMR, IMP, CAP_HWCAP, KERNEL_HWCAP_FPMR), + HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, IMP, CAP_HWCAP, KERNEL_HWCAP_DCPOP), + HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, DPB2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), + HWCAP_CAP(ID_AA64ISAR1_EL1, JSCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_JSCVT), + HWCAP_CAP(ID_AA64ISAR1_EL1, FCMA, IMP, CAP_HWCAP, KERNEL_HWCAP_FCMA), + HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, IMP, 
CAP_HWCAP, KERNEL_HWCAP_LRCPC), + HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), + HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC3, CAP_HWCAP, KERNEL_HWCAP_LRCPC3), + HWCAP_CAP(ID_AA64ISAR1_EL1, FRINTTS, IMP, CAP_HWCAP, KERNEL_HWCAP_FRINT), + HWCAP_CAP(ID_AA64ISAR1_EL1, SB, IMP, CAP_HWCAP, KERNEL_HWCAP_SB), + HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_BF16), + HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_EBF16), + HWCAP_CAP(ID_AA64ISAR1_EL1, DGH, IMP, CAP_HWCAP, KERNEL_HWCAP_DGH), + HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM), + HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT), + HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX), + HWCAP_CAP(ID_AA64ISAR3_EL1, LSFE, IMP, CAP_HWCAP, KERNEL_HWCAP_LSFE), + HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, HWCAP_SVE), + HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p2, CAP_HWCAP, KERNEL_HWCAP_SVE2P2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, AES2, CAP_HWCAP, KERNEL_HWCAP_SVE_AES2), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_B16B16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, BFSCALE, CAP_HWCAP, KERNEL_HWCAP_SVE_BFSCALE), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F16MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_F16MM), + HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, EltPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_ELTPERM), +#endif +#ifdef CONFIG_ARM64_GCS + HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS), +#endif + HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), +#ifdef CONFIG_ARM64_BTI + HWCAP_CAP(ID_AA64PFR1_EL1, BT, IMP, CAP_HWCAP, KERNEL_HWCAP_BTI), #endif - HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, HWCAP_SSBS), #ifdef CONFIG_ARM64_PTR_AUTH - HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, HWCAP_PACA), - HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, HWCAP_PACG), + 
HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), + HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), +#endif +#ifdef CONFIG_ARM64_MTE + HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE2, CAP_HWCAP, KERNEL_HWCAP_MTE), + HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE3, CAP_HWCAP, KERNEL_HWCAP_MTE3), + HWCAP_CAP(ID_AA64PFR2_EL1, MTEFAR, IMP, CAP_HWCAP, KERNEL_HWCAP_MTE_FAR), + HWCAP_CAP(ID_AA64PFR2_EL1, MTESTOREONLY, IMP, CAP_HWCAP , KERNEL_HWCAP_MTE_STORE_ONLY), +#endif /* CONFIG_ARM64_MTE */ + HWCAP_CAP(ID_AA64MMFR0_EL1, ECV, IMP, CAP_HWCAP, KERNEL_HWCAP_ECV), + HWCAP_CAP(ID_AA64MMFR1_EL1, AFP, IMP, CAP_HWCAP, KERNEL_HWCAP_AFP), + HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, IMP, CAP_HWCAP, KERNEL_HWCAP_CSSC), + HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, CMPBR, CAP_HWCAP, KERNEL_HWCAP_CMPBR), + HWCAP_CAP(ID_AA64ISAR2_EL1, RPRFM, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRFM), + HWCAP_CAP(ID_AA64ISAR2_EL1, RPRES, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRES), + HWCAP_CAP(ID_AA64ISAR2_EL1, WFxT, IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT), + HWCAP_CAP(ID_AA64ISAR2_EL1, MOPS, IMP, CAP_HWCAP, KERNEL_HWCAP_MOPS), + HWCAP_CAP(ID_AA64ISAR2_EL1, BC, IMP, CAP_HWCAP, KERNEL_HWCAP_HBC), +#ifdef CONFIG_ARM64_SME + HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, AES, IMP, 
CAP_HWCAP, KERNEL_HWCAP_SME_AES), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4), +#endif /* CONFIG_ARM64_SME */ + HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM8), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), +#ifdef CONFIG_ARM64_POE + HWCAP_CAP(ID_AA64MMFR3_EL1, S1POE, IMP, CAP_HWCAP, KERNEL_HWCAP_POE), #endif {}, }; +#ifdef CONFIG_COMPAT +static bool compat_has_neon(const struct arm64_cpu_capabilities *cap, int scope) +{ + /* + * Check that all of MVFR1_EL1.{SIMDSP, SIMDInt, SIMDLS} are available, + * in line with that of arm32 as in vfp_init(). We make sure that the + * check is future proof, by making sure value is non-zero. + */ + u32 mvfr1; + + WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); + if (scope == SCOPE_SYSTEM) + mvfr1 = read_sanitised_ftr_reg(SYS_MVFR1_EL1); + else + mvfr1 = read_sysreg_s(SYS_MVFR1_EL1); + + return cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDSP_SHIFT) && + cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDInt_SHIFT) && + cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDLS_SHIFT); +} +#endif + static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = { #ifdef CONFIG_COMPAT - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2), - HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32), + HWCAP_CAP_MATCH(compat_has_neon, CAP_COMPAT_HWCAP, COMPAT_HWCAP_NEON), + HWCAP_CAP(MVFR1_EL1, SIMDFMAC, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv4), + /* Arm v8 mandates MVFR0.FPDP == {0, 2}. 
So, piggy back on this for the presence of VFP support */ + HWCAP_CAP(MVFR0_EL1, FPDP, VFPv3, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFP), + HWCAP_CAP(MVFR0_EL1, FPDP, VFPv3, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv3), + HWCAP_CAP(MVFR1_EL1, FPHP, FP16, CAP_COMPAT_HWCAP, COMPAT_HWCAP_FPHP), + HWCAP_CAP(MVFR1_EL1, SIMDHP, SIMDHP_FLOAT, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDHP), + HWCAP_CAP(ID_ISAR5_EL1, AES, VMULL, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), + HWCAP_CAP(ID_ISAR5_EL1, AES, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), + HWCAP_CAP(ID_ISAR5_EL1, SHA1, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1), + HWCAP_CAP(ID_ISAR5_EL1, SHA2, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2), + HWCAP_CAP(ID_ISAR5_EL1, CRC32, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32), + HWCAP_CAP(ID_ISAR6_EL1, DP, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDDP), + HWCAP_CAP(ID_ISAR6_EL1, FHM, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDFHM), + HWCAP_CAP(ID_ISAR6_EL1, SB, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SB), + HWCAP_CAP(ID_ISAR6_EL1, BF16, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDBF16), + HWCAP_CAP(ID_ISAR6_EL1, I8MM, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_I8MM), + HWCAP_CAP(ID_PFR2_EL1, SSBS, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SSBS), #endif {}, }; -static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) +static void cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case CAP_HWCAP: - elf_hwcap |= cap->hwcap; + cpu_set_feature(cap->hwcap); break; #ifdef CONFIG_COMPAT case CAP_COMPAT_HWCAP: @@ -1611,7 +3433,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) switch (cap->hwcap_type) { case CAP_HWCAP: - rc = (elf_hwcap & cap->hwcap) != 0; + rc = cpu_have_feature(cap->hwcap); break; #ifdef CONFIG_COMPAT case CAP_COMPAT_HWCAP: @@ -1629,10 +3451,10 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) return rc; } -static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) +static void setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) { /* We support emulation of accesses to CPU ID feature registers */ - elf_hwcap |= HWCAP_CPUID; + cpu_set_named_feature(CPUID); for (; hwcaps->matches; hwcaps++) if (hwcaps->matches(hwcaps, cpucap_default_scope(hwcaps))) cap_set_elf_hwcap(hwcaps); @@ -1645,15 +3467,50 @@ static void update_cpu_capabilities(u16 scope_mask) scope_mask &= ARM64_CPUCAP_SCOPE_MASK; for (i = 0; i < ARM64_NCAPS; i++) { - caps = cpu_hwcaps_ptrs[i]; - if (!caps || !(caps->type & scope_mask) || - cpus_have_cap(caps->capability) || - !caps->matches(caps, cpucap_default_scope(caps))) + bool match_all = false; + bool caps_set = false; + bool boot_cpu = false; + + caps = cpucap_ptrs[i]; + if (!caps || !(caps->type & scope_mask)) continue; - if (caps->desc) + match_all = cpucap_match_all_early_cpus(caps); + caps_set = cpus_have_cap(caps->capability); + boot_cpu = scope_mask & SCOPE_BOOT_CPU; + + /* + * Unless it's a match-all CPUs feature, avoid probing if + * already detected. + */ + if (!match_all && caps_set) + continue; + + /* + * A match-all CPUs capability is only set when probing the + * boot CPU. It may be cleared subsequently if not detected on + * secondary ones. + */ + if (match_all && !caps_set && !boot_cpu) + continue; + + if (!caps->matches(caps, cpucap_default_scope(caps))) { + if (match_all) + __clear_bit(caps->capability, system_cpucaps); + continue; + } + + /* + * Match-all CPUs capabilities are logged later when the + * system capabilities are finalised. 
+ */ + if (!match_all && caps->desc && !caps->cpus) pr_info("detected: %s\n", caps->desc); - cpus_set_cap(caps->capability); + + __set_bit(caps->capability, system_cpucaps); + + if (boot_cpu && (caps->type & SCOPE_BOOT_CPU)) + set_bit(caps->capability, boot_cpucaps); } } @@ -1667,7 +3524,7 @@ static int cpu_enable_non_boot_scope_capabilities(void *__unused) u16 non_boot_scope = SCOPE_ALL & ~SCOPE_BOOT_CPU; for_each_available_cap(i) { - const struct arm64_cpu_capabilities *cap = cpu_hwcaps_ptrs[i]; + const struct arm64_cpu_capabilities *cap = cpucap_ptrs[i]; if (WARN_ON(!cap)) continue; @@ -1695,18 +3552,11 @@ static void __init enable_cpu_capabilities(u16 scope_mask) boot_scope = !!(scope_mask & SCOPE_BOOT_CPU); for (i = 0; i < ARM64_NCAPS; i++) { - unsigned int num; - - caps = cpu_hwcaps_ptrs[i]; - if (!caps || !(caps->type & scope_mask)) - continue; - num = caps->capability; - if (!cpus_have_cap(num)) + caps = cpucap_ptrs[i]; + if (!caps || !(caps->type & scope_mask) || + !cpus_have_cap(caps->capability)) continue; - /* Ensure cpus_have_const_cap(num) works */ - static_branch_enable(&cpu_hwcap_keys[num]); - if (boot_scope && caps->cpu_enable) /* * Capabilities with SCOPE_BOOT_CPU scope are finalised @@ -1735,10 +3585,8 @@ static void __init enable_cpu_capabilities(u16 scope_mask) * Run through the list of capabilities to check for conflicts. * If the system has already detected a capability, take necessary * action on this CPU. - * - * Returns "false" on conflicts. */ -static bool verify_local_cpu_caps(u16 scope_mask) +static void verify_local_cpu_caps(u16 scope_mask) { int i; bool cpu_has_cap, system_has_cap; @@ -1747,7 +3595,7 @@ static bool verify_local_cpu_caps(u16 scope_mask) scope_mask &= ARM64_CPUCAP_SCOPE_MASK; for (i = 0; i < ARM64_NCAPS; i++) { - caps = cpu_hwcaps_ptrs[i]; + caps = cpucap_ptrs[i]; if (!caps || !(caps->type & scope_mask)) continue; @@ -1783,10 +3631,12 @@ static bool verify_local_cpu_caps(u16 scope_mask) pr_crit("CPU%d: Detected conflict for capability %d (%s), System: %d, CPU: %d\n", smp_processor_id(), caps->capability, caps->desc, system_has_cap, cpu_has_cap); - return false; - } - return true; + if (cpucap_panic_on_conflict(caps)) + cpu_panic_kernel(); + else + cpu_die_early(); + } } /* @@ -1796,16 +3646,12 @@ static bool verify_local_cpu_caps(u16 scope_mask) static void check_early_cpu_features(void) { verify_cpu_asid_bits(); - /* - * Early features are used by the kernel already. If there - * is a conflict, we cannot proceed further. 
- */ - if (!verify_local_cpu_caps(SCOPE_BOOT_CPU)) - cpu_panic_kernel(); + + verify_local_cpu_caps(SCOPE_BOOT_CPU); } static void -verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) +__verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) { for (; caps->matches; caps++) @@ -1816,23 +3662,100 @@ verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) } } +static void verify_local_elf_hwcaps(void) +{ + __verify_local_elf_hwcaps(arm64_elf_hwcaps); + + if (id_aa64pfr0_32bit_el0(read_cpuid(ID_AA64PFR0_EL1))) + __verify_local_elf_hwcaps(compat_elf_hwcaps); +} + static void verify_sve_features(void) { - u64 safe_zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); - u64 zcr = read_zcr_features(); + unsigned long cpacr = cpacr_save_enable_kernel_sve(); + + if (vec_verify_vq_map(ARM64_VEC_SVE)) { + pr_crit("CPU%d: SVE: vector length support mismatch\n", + smp_processor_id()); + cpu_die_early(); + } + + cpacr_restore(cpacr); +} - unsigned int safe_len = safe_zcr & ZCR_ELx_LEN_MASK; - unsigned int len = zcr & ZCR_ELx_LEN_MASK; +static void verify_sme_features(void) +{ + unsigned long cpacr = cpacr_save_enable_kernel_sme(); - if (len < safe_len || sve_verify_vq_map()) { - pr_crit("CPU%d: SVE: required vector length(s) missing\n", + if (vec_verify_vq_map(ARM64_VEC_SME)) { + pr_crit("CPU%d: SME: vector length support mismatch\n", smp_processor_id()); cpu_die_early(); } - /* Add checks on other ZCR bits here if necessary */ + cpacr_restore(cpacr); +} + +static void verify_hyp_capabilities(void) +{ + u64 safe_mmfr1, mmfr0, mmfr1; + int parange, ipa_max; + unsigned int safe_vmid_bits, vmid_bits; + + if (!IS_ENABLED(CONFIG_KVM)) + return; + + safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); + + /* Verify VMID bits */ + safe_vmid_bits = get_vmid_bits(safe_mmfr1); + vmid_bits = get_vmid_bits(mmfr1); + if (vmid_bits < safe_vmid_bits) { + pr_crit("CPU%d: VMID width mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + /* Verify IPA range */ + parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + ipa_max = id_aa64mmfr0_parange_to_phys_shift(parange); + if (ipa_max < get_kvm_ipa_limit()) { + pr_crit("CPU%d: IPA range mismatch\n", smp_processor_id()); + cpu_die_early(); + } } +static void verify_mpam_capabilities(void) +{ + u64 cpu_idr = read_cpuid(ID_AA64PFR0_EL1); + u64 sys_idr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u16 cpu_partid_max, cpu_pmg_max, sys_partid_max, sys_pmg_max; + + if (FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, cpu_idr) != + FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, sys_idr)) { + pr_crit("CPU%d: MPAM version mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_idr = read_cpuid(MPAMIDR_EL1); + sys_idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + if (FIELD_GET(MPAMIDR_EL1_HAS_HCR, cpu_idr) != + FIELD_GET(MPAMIDR_EL1_HAS_HCR, sys_idr)) { + pr_crit("CPU%d: Missing MPAM HCR\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, cpu_idr); + cpu_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, cpu_idr); + sys_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, sys_idr); + sys_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, sys_idr); + if (cpu_partid_max < sys_partid_max || cpu_pmg_max < sys_pmg_max) { + pr_crit("CPU%d: MPAM PARTID/PMG max values are mismatched\n", smp_processor_id()); + cpu_die_early(); + } +} /* * Run through the enabled system capabilities and enable() it on this CPU. 
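As an illustrative aside (not taken from the patch itself): the ARM64_CPUID_FIELDS() conversions in the capability table above all reduce to the same check that the removed .sys_reg/.field_pos/.min_field_value triplets expressed explicitly — read a sanitised ID register, extract an unsigned 4-bit field, and require it to be at least a minimum value. A minimal standalone sketch of that idea follows; the register value and field shift are invented for illustration and do not correspond to any real ID register layout.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Extract an unsigned 4-bit ID-register field starting at 'shift'. */
    static unsigned int extract_unsigned_field(uint64_t reg, unsigned int shift)
    {
            return (reg >> shift) & 0xf;
    }

    /* A capability is considered present when the field meets the minimum. */
    static bool field_at_least(uint64_t reg, unsigned int shift, unsigned int min)
    {
            return extract_unsigned_field(reg, shift) >= min;
    }

    int main(void)
    {
            uint64_t fake_id_reg = 0x0000000000001100ULL;   /* made-up value */

            /* e.g. a field at bits [11:8] with a minimum value of 1 ("IMP") */
            printf("feature present: %d\n", field_at_least(fake_id_reg, 8, 1));
            return 0;
    }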
@@ -1849,16 +3772,20 @@ static void verify_local_cpu_capabilities(void) * check_early_cpu_features(), as they need to be verified * on all secondary CPUs. */ - if (!verify_local_cpu_caps(SCOPE_ALL & ~SCOPE_BOOT_CPU)) - cpu_die_early(); - - verify_local_elf_hwcaps(arm64_elf_hwcaps); - - if (system_supports_32bit_el0()) - verify_local_elf_hwcaps(compat_elf_hwcaps); + verify_local_cpu_caps(SCOPE_ALL & ~SCOPE_BOOT_CPU); + verify_local_elf_hwcaps(); if (system_supports_sve()) verify_sve_features(); + + if (system_supports_sme()) + verify_sme_features(); + + if (is_hyp_mode_available()) + verify_hyp_capabilities(); + + if (system_supports_mpam()) + verify_mpam_capabilities(); } void check_local_cpu_capabilities(void) @@ -1875,95 +3802,245 @@ void check_local_cpu_capabilities(void) * Otherwise, this CPU should verify that it has all the system * advertised capabilities. */ - if (!sys_caps_initialised) + if (!system_capabilities_finalized()) update_cpu_capabilities(SCOPE_LOCAL_CPU); else verify_local_cpu_capabilities(); } -static void __init setup_boot_cpu_capabilities(void) +bool this_cpu_has_cap(unsigned int n) { - /* Detect capabilities with either SCOPE_BOOT_CPU or SCOPE_LOCAL_CPU */ - update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); - /* Enable the SCOPE_BOOT_CPU capabilities alone right away */ - enable_cpu_capabilities(SCOPE_BOOT_CPU); -} + if (!WARN_ON(preemptible()) && n < ARM64_NCAPS) { + const struct arm64_cpu_capabilities *cap = cpucap_ptrs[n]; -DEFINE_STATIC_KEY_FALSE(arm64_const_caps_ready); -EXPORT_SYMBOL(arm64_const_caps_ready); + if (cap) + return cap->matches(cap, SCOPE_LOCAL_CPU); + } -static void __init mark_const_caps_ready(void) -{ - static_branch_enable(&arm64_const_caps_ready); + return false; } +EXPORT_SYMBOL_GPL(this_cpu_has_cap); -bool this_cpu_has_cap(unsigned int n) +/* + * This helper function is used in a narrow window when, + * - The system wide safe registers are set with all the SMP CPUs and, + * - The SYSTEM_FEATURE system_cpucaps may not have been set. + */ +static bool __maybe_unused __system_matches_cap(unsigned int n) { - if (!WARN_ON(preemptible()) && n < ARM64_NCAPS) { - const struct arm64_cpu_capabilities *cap = cpu_hwcaps_ptrs[n]; + if (n < ARM64_NCAPS) { + const struct arm64_cpu_capabilities *cap = cpucap_ptrs[n]; if (cap) - return cap->matches(cap, SCOPE_LOCAL_CPU); + return cap->matches(cap, SCOPE_SYSTEM); } - return false; } +void cpu_set_feature(unsigned int num) +{ + set_bit(num, elf_hwcap); +} + +bool cpu_have_feature(unsigned int num) +{ + return test_bit(num, elf_hwcap); +} +EXPORT_SYMBOL_GPL(cpu_have_feature); + +unsigned long cpu_get_elf_hwcap(void) +{ + /* + * We currently only populate the first 32 bits of AT_HWCAP. Please + * note that for userspace compatibility we guarantee that bits 62 + * and 63 will always be returned as 0. + */ + return elf_hwcap[0]; +} + +unsigned long cpu_get_elf_hwcap2(void) +{ + return elf_hwcap[1]; +} + +unsigned long cpu_get_elf_hwcap3(void) +{ + return elf_hwcap[2]; +} + +static void __init setup_boot_cpu_capabilities(void) +{ + kvm_arm_target_impl_cpu_init(); + /* + * The boot CPU's feature register values have been recorded. Detect + * boot cpucaps and local cpucaps for the boot CPU, then enable and + * patch alternatives for the available boot cpucaps. 
+ */ + update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); + enable_cpu_capabilities(SCOPE_BOOT_CPU); + apply_boot_alternatives(); +} + +void __init setup_boot_cpu_features(void) +{ + /* + * Initialize the indirect array of CPU capabilities pointers before we + * handle the boot CPU. + */ + init_cpucap_indirect_list(); + + /* + * Detect broken pseudo-NMI. Must be called _before_ the call to + * setup_boot_cpu_capabilities() since it interacts with + * can_use_gic_priorities(). + */ + detect_system_supports_pseudo_nmi(); + + setup_boot_cpu_capabilities(); +} + static void __init setup_system_capabilities(void) { /* - * We have finalised the system-wide safe feature - * registers, finalise the capabilities that depend - * on it. Also enable all the available capabilities, - * that are not enabled already. + * The system-wide safe feature register values have been finalized. + * Detect, enable, and patch alternatives for the available system + * cpucaps. */ update_cpu_capabilities(SCOPE_SYSTEM); enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU); -} + apply_alternatives_all(); -void __init setup_cpu_features(void) -{ - u32 cwg; + for (int i = 0; i < ARM64_NCAPS; i++) { + const struct arm64_cpu_capabilities *caps = cpucap_ptrs[i]; - setup_system_capabilities(); - mark_const_caps_ready(); - setup_elf_hwcaps(arm64_elf_hwcaps); + if (!caps || !caps->desc) + continue; - if (system_supports_32bit_el0()) - setup_elf_hwcaps(compat_elf_hwcaps); + /* + * Log any cpucaps with a cpumask as these aren't logged by + * update_cpu_capabilities(). + */ + if (caps->cpus && cpumask_any(caps->cpus) < nr_cpu_ids) + pr_info("detected: %s on CPU%*pbl\n", + caps->desc, cpumask_pr_args(caps->cpus)); + + /* Log match-all CPUs capabilities */ + if (cpucap_match_all_early_cpus(caps) && + cpus_have_cap(caps->capability)) + pr_info("detected: %s\n", caps->desc); + } + /* + * TTBR0 PAN doesn't have its own cpucap, so log it manually. + */ if (system_uses_ttbr0_pan()) pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); - sve_setup(); - minsigstksz_setup(); + /* + * Report Spectre mitigations status. + */ + spectre_print_disabled_mitigations(); +} + +void __init setup_system_features(void) +{ + setup_system_capabilities(); + + linear_map_maybe_split_to_ptes(); + kpti_install_ng_mappings(); - /* Advertise that we have computed the system capabilities */ - set_sys_caps_initialised(); + sve_setup(); + sme_setup(); /* * Check for sane CTR_EL0.CWG value. */ - cwg = cache_type_cwg(); - if (!cwg) + if (!cache_type_cwg()) pr_warn("No Cache Writeback Granule information, assuming %d\n", ARCH_DMA_MINALIGN); } -static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) +void __init setup_user_features(void) +{ + user_feature_fixup(); + + setup_elf_hwcaps(arm64_elf_hwcaps); + + if (system_supports_32bit_el0()) { + setup_elf_hwcaps(compat_elf_hwcaps); + elf_hwcap_fixup(); + } + + minsigstksz_setup(); +} + +static int enable_mismatched_32bit_el0(unsigned int cpu) +{ + /* + * The first 32-bit-capable CPU we detected and so can no longer + * be offlined by userspace. -1 indicates we haven't yet onlined + * a 32-bit-capable CPU. 
+ */ + static int lucky_winner = -1; + + struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); + bool cpu_32bit = false; + + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + if (!housekeeping_cpu(cpu, HK_TYPE_TICK)) + pr_info("Treating adaptive-ticks CPU %u as 64-bit only\n", cpu); + else + cpu_32bit = true; + } + + if (cpu_32bit) { + cpumask_set_cpu(cpu, cpu_32bit_el0_mask); + static_branch_enable_cpuslocked(&arm64_mismatched_32bit_el0); + } + + if (cpumask_test_cpu(0, cpu_32bit_el0_mask) == cpu_32bit) + return 0; + + if (lucky_winner >= 0) + return 0; + + /* + * We've detected a mismatch. We need to keep one of our CPUs with + * 32-bit EL0 online so that is_cpu_allowed() doesn't end up rejecting + * every CPU in the system for a 32-bit task. + */ + lucky_winner = cpu_32bit ? cpu : cpumask_any_and(cpu_32bit_el0_mask, + cpu_active_mask); + get_cpu_device(lucky_winner)->offline_disabled = true; + setup_elf_hwcaps(compat_elf_hwcaps); + elf_hwcap_fixup(); + pr_info("Asymmetric 32-bit EL0 support detected on CPU %u; CPU hot-unplug disabled on CPU %u\n", + cpu, lucky_winner); + return 0; +} + +static int __init init_32bit_el0_mask(void) { - return (cpus_have_const_cap(ARM64_HAS_PAN) && !cpus_have_const_cap(ARM64_HAS_UAO)); + if (!allow_mismatched_32bit_el0) + return 0; + + if (!zalloc_cpumask_var(&cpu_32bit_el0_mask, GFP_KERNEL)) + return -ENOMEM; + + return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "arm64/mismatched_32bit_el0:online", + enable_mismatched_32bit_el0, NULL); } +subsys_initcall_sync(init_32bit_el0_mask); static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { - cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); + cpu_enable_swapper_cnp(); } /* * We emulate only the following system register space. - * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 4 - 7] + * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 2 - 7] * See Table C5-6 System instruction encodings for System register accesses, * ARMv8 ARM(ARM DDI 0487A.f) for more details. */ @@ -1973,7 +4050,7 @@ static inline bool __attribute_const__ is_emulated(u32 id) sys_reg_CRn(id) == 0x0 && sys_reg_Op1(id) == 0x0 && (sys_reg_CRm(id) == 0 || - ((sys_reg_CRm(id) >= 4) && (sys_reg_CRm(id) <= 7)))); + ((sys_reg_CRm(id) >= 2) && (sys_reg_CRm(id) <= 7)))); } /* @@ -2010,7 +4087,7 @@ static int emulate_sys_reg(u32 id, u64 *valp) if (sys_reg_CRm(id) == 0) return emulate_id_reg(id, valp); - regp = get_arm64_ftr_reg(id); + regp = get_arm64_ftr_reg_nowarn(id); if (regp) *valp = arm64_ftr_reg_user_value(regp); else @@ -2035,31 +4112,44 @@ int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt) return rc; } -static int emulate_mrs(struct pt_regs *regs, u32 insn) +bool try_emulate_mrs(struct pt_regs *regs, u32 insn) { u32 sys_reg, rt; + if (compat_user_mode(regs) || !aarch64_insn_is_mrs(insn)) + return false; + /* * sys_reg values are defined as used in mrs/msr instruction. * shift the imm value to get the encoding. 
*/ sys_reg = (u32)aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn) << 5; rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); - return do_emulate_mrs(regs, sys_reg, rt); + return do_emulate_mrs(regs, sys_reg, rt) == 0; } -static struct undef_hook mrs_hook = { - .instr_mask = 0xfff00000, - .instr_val = 0xd5300000, - .pstate_mask = PSR_AA32_MODE_MASK, - .pstate_val = PSR_MODE_EL0t, - .fn = emulate_mrs, -}; - -static int __init enable_mrs_emulation(void) +enum mitigation_state arm64_get_meltdown_state(void) { - register_undef_hook(&mrs_hook); - return 0; + if (__meltdown_safe) + return SPECTRE_UNAFFECTED; + + if (arm64_kernel_unmapped_at_el0()) + return SPECTRE_MITIGATED; + + return SPECTRE_VULNERABLE; } -core_initcall(enable_mrs_emulation); +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, + char *buf) +{ + switch (arm64_get_meltdown_state()) { + case SPECTRE_UNAFFECTED: + return sprintf(buf, "Not affected\n"); + + case SPECTRE_MITIGATED: + return sprintf(buf, "Mitigation: PTI\n"); + + default: + return sprintf(buf, "Vulnerable\n"); + } +} diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c deleted file mode 100644 index f2d13810daa8..000000000000 --- a/arch/arm64/kernel/cpuidle.c +++ /dev/null @@ -1,65 +0,0 @@ -/* - * ARM64 CPU idle arch support - * - * Copyright (C) 2014 ARM Ltd. - * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/acpi.h> -#include <linux/cpuidle.h> -#include <linux/cpu_pm.h> -#include <linux/of.h> -#include <linux/of_device.h> - -#include <asm/cpuidle.h> -#include <asm/cpu_ops.h> - -int arm_cpuidle_init(unsigned int cpu) -{ - int ret = -EOPNOTSUPP; - - if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_suspend && - cpu_ops[cpu]->cpu_init_idle) - ret = cpu_ops[cpu]->cpu_init_idle(cpu); - - return ret; -} - -/** - * arm_cpuidle_suspend() - function to enter a low-power idle state - * @arg: argument to pass to CPU suspend operations - * - * Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU - * operations back-end error code otherwise. - */ -int arm_cpuidle_suspend(int index) -{ - int cpu = smp_processor_id(); - - return cpu_ops[cpu]->cpu_suspend(index); -} - -#ifdef CONFIG_ACPI - -#include <acpi/processor.h> - -#define ARM64_LPI_IS_RETENTION_STATE(arch_flags) (!(arch_flags)) - -int acpi_processor_ffh_lpi_probe(unsigned int cpu) -{ - return arm_cpuidle_init(cpu); -} - -int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) -{ - if (ARM64_LPI_IS_RETENTION_STATE(lpi->arch_flags)) - return CPU_PM_CPU_IDLE_ENTER_RETENTION(arm_cpuidle_suspend, - lpi->index); - else - return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, lpi->index); -} -#endif diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index ca0685f33900..c44e6d94f5de 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -1,18 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Record and handle CPU attributes. * * Copyright (C) 2014 ARM Ltd. - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <asm/arch_timer.h> #include <asm/cache.h> @@ -43,155 +33,258 @@ DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data); static struct cpuinfo_arm64 boot_cpu_data; -static char *icache_policy_str[] = { - [0 ... ICACHE_POLICY_PIPT] = "RESERVED/UNKNOWN", - [ICACHE_POLICY_VIPT] = "VIPT", - [ICACHE_POLICY_PIPT] = "PIPT", - [ICACHE_POLICY_VPIPT] = "VPIPT", -}; +static inline const char *icache_policy_str(int l1ip) +{ + switch (l1ip) { + case CTR_EL0_L1Ip_VIPT: + return "VIPT"; + case CTR_EL0_L1Ip_PIPT: + return "PIPT"; + default: + return "RESERVED/UNKNOWN"; + } +} unsigned long __icache_flags; static const char *const hwcap_str[] = { - "fp", - "asimd", - "evtstrm", - "aes", - "pmull", - "sha1", - "sha2", - "crc32", - "atomics", - "fphp", - "asimdhp", - "cpuid", - "asimdrdm", - "jscvt", - "fcma", - "lrcpc", - "dcpop", - "sha3", - "sm3", - "sm4", - "asimddp", - "sha512", - "sve", - "asimdfhm", - "dit", - "uscat", - "ilrcpc", - "flagm", - "ssbs", - "sb", - "paca", - "pacg", - NULL + [KERNEL_HWCAP_FP] = "fp", + [KERNEL_HWCAP_ASIMD] = "asimd", + [KERNEL_HWCAP_EVTSTRM] = "evtstrm", + [KERNEL_HWCAP_AES] = "aes", + [KERNEL_HWCAP_PMULL] = "pmull", + [KERNEL_HWCAP_SHA1] = "sha1", + [KERNEL_HWCAP_SHA2] = "sha2", + [KERNEL_HWCAP_CRC32] = "crc32", + [KERNEL_HWCAP_ATOMICS] = "atomics", + [KERNEL_HWCAP_FPHP] = "fphp", + [KERNEL_HWCAP_ASIMDHP] = "asimdhp", + [KERNEL_HWCAP_CPUID] = "cpuid", + [KERNEL_HWCAP_ASIMDRDM] = "asimdrdm", + [KERNEL_HWCAP_JSCVT] = "jscvt", + [KERNEL_HWCAP_FCMA] = "fcma", + [KERNEL_HWCAP_LRCPC] = "lrcpc", + [KERNEL_HWCAP_DCPOP] = "dcpop", + [KERNEL_HWCAP_SHA3] = "sha3", + [KERNEL_HWCAP_SM3] = "sm3", + [KERNEL_HWCAP_SM4] = "sm4", + [KERNEL_HWCAP_ASIMDDP] = "asimddp", + [KERNEL_HWCAP_SHA512] = "sha512", + [KERNEL_HWCAP_SVE] = "sve", + [KERNEL_HWCAP_ASIMDFHM] = "asimdfhm", + [KERNEL_HWCAP_DIT] = "dit", + [KERNEL_HWCAP_USCAT] = "uscat", + [KERNEL_HWCAP_ILRCPC] = "ilrcpc", + [KERNEL_HWCAP_FLAGM] = "flagm", + [KERNEL_HWCAP_SSBS] = "ssbs", + [KERNEL_HWCAP_SB] = "sb", + [KERNEL_HWCAP_PACA] = "paca", + [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_GCS] = "gcs", + [KERNEL_HWCAP_DCPODP] = "dcpodp", + [KERNEL_HWCAP_SVE2] = "sve2", + [KERNEL_HWCAP_SVEAES] = "sveaes", + [KERNEL_HWCAP_SVEPMULL] = "svepmull", + [KERNEL_HWCAP_SVEBITPERM] = "svebitperm", + [KERNEL_HWCAP_SVESHA3] = "svesha3", + [KERNEL_HWCAP_SVESM4] = "svesm4", + [KERNEL_HWCAP_FLAGM2] = "flagm2", + [KERNEL_HWCAP_FRINT] = "frint", + [KERNEL_HWCAP_SVEI8MM] = "svei8mm", + [KERNEL_HWCAP_SVEF32MM] = "svef32mm", + [KERNEL_HWCAP_SVEF64MM] = "svef64mm", + [KERNEL_HWCAP_SVEBF16] = "svebf16", + [KERNEL_HWCAP_I8MM] = "i8mm", + [KERNEL_HWCAP_BF16] = "bf16", + [KERNEL_HWCAP_DGH] = "dgh", + [KERNEL_HWCAP_RNG] = "rng", + [KERNEL_HWCAP_BTI] = "bti", + [KERNEL_HWCAP_MTE] = "mte", + [KERNEL_HWCAP_ECV] = "ecv", + [KERNEL_HWCAP_AFP] = "afp", + [KERNEL_HWCAP_RPRES] = "rpres", + [KERNEL_HWCAP_MTE3] = "mte3", + [KERNEL_HWCAP_SME] = "sme", + [KERNEL_HWCAP_SME_I16I64] = "smei16i64", + [KERNEL_HWCAP_SME_F64F64] = "smef64f64", + [KERNEL_HWCAP_SME_I8I32] = "smei8i32", + [KERNEL_HWCAP_SME_F16F32] = "smef16f32", + [KERNEL_HWCAP_SME_B16F32] = "smeb16f32", + 
[KERNEL_HWCAP_SME_F32F32] = "smef32f32", + [KERNEL_HWCAP_SME_FA64] = "smefa64", + [KERNEL_HWCAP_WFXT] = "wfxt", + [KERNEL_HWCAP_EBF16] = "ebf16", + [KERNEL_HWCAP_SVE_EBF16] = "sveebf16", + [KERNEL_HWCAP_CSSC] = "cssc", + [KERNEL_HWCAP_RPRFM] = "rprfm", + [KERNEL_HWCAP_SVE2P1] = "sve2p1", + [KERNEL_HWCAP_SME2] = "sme2", + [KERNEL_HWCAP_SME2P1] = "sme2p1", + [KERNEL_HWCAP_SME_I16I32] = "smei16i32", + [KERNEL_HWCAP_SME_BI32I32] = "smebi32i32", + [KERNEL_HWCAP_SME_B16B16] = "smeb16b16", + [KERNEL_HWCAP_SME_F16F16] = "smef16f16", + [KERNEL_HWCAP_MOPS] = "mops", + [KERNEL_HWCAP_HBC] = "hbc", + [KERNEL_HWCAP_SVE_B16B16] = "sveb16b16", + [KERNEL_HWCAP_LRCPC3] = "lrcpc3", + [KERNEL_HWCAP_LSE128] = "lse128", + [KERNEL_HWCAP_FPMR] = "fpmr", + [KERNEL_HWCAP_LUT] = "lut", + [KERNEL_HWCAP_FAMINMAX] = "faminmax", + [KERNEL_HWCAP_F8CVT] = "f8cvt", + [KERNEL_HWCAP_F8FMA] = "f8fma", + [KERNEL_HWCAP_F8DP4] = "f8dp4", + [KERNEL_HWCAP_F8DP2] = "f8dp2", + [KERNEL_HWCAP_F8E4M3] = "f8e4m3", + [KERNEL_HWCAP_F8E5M2] = "f8e5m2", + [KERNEL_HWCAP_SME_LUTV2] = "smelutv2", + [KERNEL_HWCAP_SME_F8F16] = "smef8f16", + [KERNEL_HWCAP_SME_F8F32] = "smef8f32", + [KERNEL_HWCAP_SME_SF8FMA] = "smesf8fma", + [KERNEL_HWCAP_SME_SF8DP4] = "smesf8dp4", + [KERNEL_HWCAP_SME_SF8DP2] = "smesf8dp2", + [KERNEL_HWCAP_POE] = "poe", + [KERNEL_HWCAP_CMPBR] = "cmpbr", + [KERNEL_HWCAP_FPRCVT] = "fprcvt", + [KERNEL_HWCAP_F8MM8] = "f8mm8", + [KERNEL_HWCAP_F8MM4] = "f8mm4", + [KERNEL_HWCAP_SVE_F16MM] = "svef16mm", + [KERNEL_HWCAP_SVE_ELTPERM] = "sveeltperm", + [KERNEL_HWCAP_SVE_AES2] = "sveaes2", + [KERNEL_HWCAP_SVE_BFSCALE] = "svebfscale", + [KERNEL_HWCAP_SVE2P2] = "sve2p2", + [KERNEL_HWCAP_SME2P2] = "sme2p2", + [KERNEL_HWCAP_SME_SBITPERM] = "smesbitperm", + [KERNEL_HWCAP_SME_AES] = "smeaes", + [KERNEL_HWCAP_SME_SFEXPA] = "smesfexpa", + [KERNEL_HWCAP_SME_STMOP] = "smestmop", + [KERNEL_HWCAP_SME_SMOP4] = "smesmop4", + [KERNEL_HWCAP_MTE_FAR] = "mtefar", + [KERNEL_HWCAP_MTE_STORE_ONLY] = "mtestoreonly", + [KERNEL_HWCAP_LSFE] = "lsfe", }; #ifdef CONFIG_COMPAT +#define COMPAT_KERNEL_HWCAP(x) const_ilog2(COMPAT_HWCAP_ ## x) static const char *const compat_hwcap_str[] = { - "swp", - "half", - "thumb", - "26bit", - "fastmult", - "fpa", - "vfp", - "edsp", - "java", - "iwmmxt", - "crunch", - "thumbee", - "neon", - "vfpv3", - "vfpv3d16", - "tls", - "vfpv4", - "idiva", - "idivt", - "vfpd32", - "lpae", - "evtstrm", - NULL + [COMPAT_KERNEL_HWCAP(SWP)] = "swp", + [COMPAT_KERNEL_HWCAP(HALF)] = "half", + [COMPAT_KERNEL_HWCAP(THUMB)] = "thumb", + [COMPAT_KERNEL_HWCAP(26BIT)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(FAST_MULT)] = "fastmult", + [COMPAT_KERNEL_HWCAP(FPA)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(VFP)] = "vfp", + [COMPAT_KERNEL_HWCAP(EDSP)] = "edsp", + [COMPAT_KERNEL_HWCAP(JAVA)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(IWMMXT)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(CRUNCH)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(THUMBEE)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(NEON)] = "neon", + [COMPAT_KERNEL_HWCAP(VFPv3)] = "vfpv3", + [COMPAT_KERNEL_HWCAP(VFPV3D16)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(TLS)] = "tls", + [COMPAT_KERNEL_HWCAP(VFPv4)] = "vfpv4", + [COMPAT_KERNEL_HWCAP(IDIVA)] = "idiva", + [COMPAT_KERNEL_HWCAP(IDIVT)] = "idivt", + [COMPAT_KERNEL_HWCAP(VFPD32)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(LPAE)] = "lpae", + [COMPAT_KERNEL_HWCAP(EVTSTRM)] = "evtstrm", + [COMPAT_KERNEL_HWCAP(FPHP)] = "fphp", + 
[COMPAT_KERNEL_HWCAP(ASIMDHP)] = "asimdhp", + [COMPAT_KERNEL_HWCAP(ASIMDDP)] = "asimddp", + [COMPAT_KERNEL_HWCAP(ASIMDFHM)] = "asimdfhm", + [COMPAT_KERNEL_HWCAP(ASIMDBF16)] = "asimdbf16", + [COMPAT_KERNEL_HWCAP(I8MM)] = "i8mm", }; +#define COMPAT_KERNEL_HWCAP2(x) const_ilog2(COMPAT_HWCAP2_ ## x) static const char *const compat_hwcap2_str[] = { - "aes", - "pmull", - "sha1", - "sha2", - "crc32", - NULL + [COMPAT_KERNEL_HWCAP2(AES)] = "aes", + [COMPAT_KERNEL_HWCAP2(PMULL)] = "pmull", + [COMPAT_KERNEL_HWCAP2(SHA1)] = "sha1", + [COMPAT_KERNEL_HWCAP2(SHA2)] = "sha2", + [COMPAT_KERNEL_HWCAP2(CRC32)] = "crc32", + [COMPAT_KERNEL_HWCAP2(SB)] = "sb", + [COMPAT_KERNEL_HWCAP2(SSBS)] = "ssbs", }; #endif /* CONFIG_COMPAT */ static int c_show(struct seq_file *m, void *v) { - int i, j; + int j; + int cpu = m->index; bool compat = personality(current->personality) == PER_LINUX32; + struct cpuinfo_arm64 *cpuinfo = v; + u32 midr = cpuinfo->reg_midr; - for_each_online_cpu(i) { - struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i); - u32 midr = cpuinfo->reg_midr; - - /* - * glibc reads /proc/cpuinfo to determine the number of - * online processors, looking for lines beginning with - * "processor". Give glibc what it expects. - */ - seq_printf(m, "processor\t: %d\n", i); - if (compat) - seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", - MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); + /* + * glibc reads /proc/cpuinfo to determine the number of + * online processors, looking for lines beginning with + * "processor". Give glibc what it expects. + */ + seq_printf(m, "processor\t: %d\n", cpu); + if (compat) + seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", + MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); - seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", - loops_per_jiffy / (500000UL/HZ), - loops_per_jiffy / (5000UL/HZ) % 100); + seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", + loops_per_jiffy / (500000UL/HZ), + loops_per_jiffy / (5000UL/HZ) % 100); - /* - * Dump out the common processor features in a single line. - * Userspace should read the hwcaps with getauxval(AT_HWCAP) - * rather than attempting to parse this, but there's a body of - * software which does already (at least for 32-bit). - */ - seq_puts(m, "Features\t:"); - if (compat) { + /* + * Dump out the common processor features in a single line. + * Userspace should read the hwcaps with getauxval(AT_HWCAP) + * rather than attempting to parse this, but there's a body of + * software which does already (at least for 32-bit). + */ + seq_puts(m, "Features\t:"); + if (compat) { #ifdef CONFIG_COMPAT - for (j = 0; compat_hwcap_str[j]; j++) - if (compat_elf_hwcap & (1 << j)) - seq_printf(m, " %s", compat_hwcap_str[j]); + for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { + if (compat_elf_hwcap & (1 << j)) { + /* + * Warn once if any feature should not + * have been present on arm64 platform. 
+ */ + if (WARN_ON_ONCE(!compat_hwcap_str[j])) + continue; + + seq_printf(m, " %s", compat_hwcap_str[j]); + } + } - for (j = 0; compat_hwcap2_str[j]; j++) - if (compat_elf_hwcap2 & (1 << j)) - seq_printf(m, " %s", compat_hwcap2_str[j]); + for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) + if (compat_elf_hwcap2 & (1 << j)) + seq_printf(m, " %s", compat_hwcap2_str[j]); #endif /* CONFIG_COMPAT */ - } else { - for (j = 0; hwcap_str[j]; j++) - if (elf_hwcap & (1 << j)) - seq_printf(m, " %s", hwcap_str[j]); - } - seq_puts(m, "\n"); - - seq_printf(m, "CPU implementer\t: 0x%02x\n", - MIDR_IMPLEMENTOR(midr)); - seq_printf(m, "CPU architecture: 8\n"); - seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); - seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); - seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); + } else { + for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) + if (cpu_have_feature(j)) + seq_printf(m, " %s", hwcap_str[j]); } + seq_puts(m, "\n"); + + seq_printf(m, "CPU implementer\t: 0x%02x\n", + MIDR_IMPLEMENTOR(midr)); + seq_puts(m, "CPU architecture: 8\n"); + seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); + seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); + seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < 1 ? (void *)1 : NULL; + *pos = cpumask_next(*pos - 1, cpu_online_mask); + return *pos < nr_cpu_ids ? &per_cpu(cpu_data, *pos) : NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { ++*pos; - return NULL; + return c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) @@ -206,7 +299,7 @@ const struct seq_operations cpuinfo_op = { }; -static struct kobj_type cpuregs_kobj_type = { +static const struct kobj_type cpuregs_kobj_type = { .sysfs_ops = &kobj_sysfs_ops, }; @@ -229,7 +322,7 @@ static struct kobj_type cpuregs_kobj_type = { struct cpuinfo_arm64 *info = kobj_to_cpuinfo(kobj); \ \ if (info->reg_midr) \ - return sprintf(buf, "0x%016x\n", info->reg_##_field); \ + return sprintf(buf, "0x%016llx\n", info->reg_##_field); \ else \ return 0; \ } \ @@ -237,10 +330,13 @@ static struct kobj_type cpuregs_kobj_type = { CPUREGS_ATTR_RO(midr_el1, midr); CPUREGS_ATTR_RO(revidr_el1, revidr); +CPUREGS_ATTR_RO(aidr_el1, aidr); +CPUREGS_ATTR_RO(smidr_el1, smidr); static struct attribute *cpuregs_id_attrs[] = { &cpuregs_attr_midr_el1.attr, &cpuregs_attr_revidr_el1.attr, + &cpuregs_attr_aidr_el1.attr, NULL }; @@ -249,6 +345,16 @@ static const struct attribute_group cpuregs_attr_group = { .name = "identification" }; +static struct attribute *sme_cpuregs_id_attrs[] = { + &cpuregs_attr_smidr_el1.attr, + NULL +}; + +static const struct attribute_group sme_cpuregs_attr_group = { + .attrs = sme_cpuregs_id_attrs, + .name = "identification" +}; + static int cpuid_cpu_online(unsigned int cpu) { int rc; @@ -266,6 +372,8 @@ static int cpuid_cpu_online(unsigned int cpu) rc = sysfs_create_group(&info->kobj, &cpuregs_attr_group); if (rc) kobject_del(&info->kobj); + if (system_supports_sme()) + rc = sysfs_merge_group(&info->kobj, &sme_cpuregs_attr_group); out: return rc; } @@ -304,25 +412,50 @@ static int __init cpuinfo_regs_init(void) } return 0; } +device_initcall(cpuinfo_regs_init); + static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) { unsigned int cpu = smp_processor_id(); u32 l1ip = CTR_L1IP(info->reg_ctr); switch (l1ip) { - case ICACHE_POLICY_PIPT: - break; - case ICACHE_POLICY_VPIPT: - set_bit(ICACHEF_VPIPT, 
&__icache_flags); + case CTR_EL0_L1Ip_PIPT: break; + case CTR_EL0_L1Ip_VIPT: default: - /* Fallthrough */ - case ICACHE_POLICY_VIPT: /* Assume aliasing */ set_bit(ICACHEF_ALIASING, &__icache_flags); + break; } - pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu); + pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str(l1ip), cpu); +} + +static void __cpuinfo_store_cpu_32bit(struct cpuinfo_32bit *info) +{ + info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); + info->reg_id_dfr1 = read_cpuid(ID_DFR1_EL1); + info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); + info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); + info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); + info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); + info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); + info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); + info->reg_id_isar6 = read_cpuid(ID_ISAR6_EL1); + info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); + info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); + info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); + info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); + info->reg_id_mmfr4 = read_cpuid(ID_MMFR4_EL1); + info->reg_id_mmfr5 = read_cpuid(ID_MMFR5_EL1); + info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); + info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + info->reg_id_pfr2 = read_cpuid(ID_PFR2_EL1); + + info->reg_mvfr0 = read_cpuid(MVFR0_EL1); + info->reg_mvfr1 = read_cpuid(MVFR1_EL1); + info->reg_mvfr2 = read_cpuid(MVFR2_EL1); } static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) @@ -330,52 +463,57 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_cntfrq = arch_timer_get_cntfrq(); /* * Use the effective value of the CTR_EL0 than the raw value - * exposed by the CPU. CTR_E0.IDC field value must be interpreted + * exposed by the CPU. CTR_EL0.IDC field value must be interpreted * with the CLIDR_EL1 fields to avoid triggering false warnings * when there is a mismatch across the CPUs. Keep track of the * effective value of the CTR_EL0 in our internal records for - * acurate sanity check and feature enablement. + * accurate sanity check and feature enablement. 
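The comment in c_show() above steers userspace toward getauxval(AT_HWCAP) instead of parsing the Features line. A minimal userspace sketch of that approach, assuming the standard arm64 AT_HWCAP bit assignments for the few names listed; the bit-index-equals-table-index scheme is the same one the kernel tables express with const_ilog2():

#include <stdio.h>
#include <sys/auxv.h>

/* Abbreviated name table: index == assumed AT_HWCAP bit position. */
static const char *const demo_hwcap_names[] = {
	[0] = "fp",    [1] = "asimd", [2] = "evtstrm", [3] = "aes",
	[4] = "pmull", [5] = "sha1",  [6] = "sha2",    [7] = "crc32",
};

int main(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);
	unsigned int i;

	for (i = 0; i < sizeof(demo_hwcap_names) / sizeof(demo_hwcap_names[0]); i++)
		if ((hwcap & (1UL << i)) && demo_hwcap_names[i])
			printf("%s\n", demo_hwcap_names[i]);

	return 0;
}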
*/ info->reg_ctr = read_cpuid_effective_cachetype(); info->reg_dczid = read_cpuid(DCZID_EL0); info->reg_midr = read_cpuid_id(); info->reg_revidr = read_cpuid(REVIDR_EL1); + info->reg_aidr = read_cpuid(AIDR_EL1); info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1); info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); + info->reg_id_aa64isar2 = read_cpuid(ID_AA64ISAR2_EL1); + info->reg_id_aa64isar3 = read_cpuid(ID_AA64ISAR3_EL1); info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); + info->reg_id_aa64mmfr3 = read_cpuid(ID_AA64MMFR3_EL1); + info->reg_id_aa64mmfr4 = read_cpuid(ID_AA64MMFR4_EL1); info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); + info->reg_id_aa64pfr2 = read_cpuid(ID_AA64PFR2_EL1); info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); + info->reg_id_aa64smfr0 = read_cpuid(ID_AA64SMFR0_EL1); + info->reg_id_aa64fpfr0 = read_cpuid(ID_AA64FPFR0_EL1); - /* Update the 32bit ID registers only if AArch32 is implemented */ - if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { - info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); - info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); - info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); - info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); - info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); - info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); - info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); - info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); - info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); - info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); - info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); - info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); - info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); - - info->reg_mvfr0 = read_cpuid(MVFR0_EL1); - info->reg_mvfr1 = read_cpuid(MVFR1_EL1); - info->reg_mvfr2 = read_cpuid(MVFR2_EL1); - } + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) + info->reg_gmid = read_cpuid(GMID_EL1); + + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) + __cpuinfo_store_cpu_32bit(&info->aarch32); - if (IS_ENABLED(CONFIG_ARM64_SVE) && - id_aa64pfr0_sve(info->reg_id_aa64pfr0)) - info->reg_zcr = read_zcr_features(); + /* + * info->reg_mpamidr deferred to {init,update}_cpu_features because we + * don't want to read it (and trigger a trap on buggy firmware) if + * using an aa64pfr0_el1 override to unconditionally disable MPAM. + */ + + if (IS_ENABLED(CONFIG_ARM64_SME) && + id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { + /* + * We mask out SMPS since even if the hardware + * supports priorities the kernel does not at present + * and we block access to them. + */ + info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS; + } cpuinfo_detect_icache_policy(info); } @@ -395,5 +533,3 @@ void __init cpuinfo_store_boot_cpu(void) boot_cpu_data = *info; init_cpu_features(&boot_cpu_data); } - -device_initcall(cpuinfo_regs_init); diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c deleted file mode 100644 index ca4c3e12d8c5..000000000000 --- a/arch/arm64/kernel/crash_core.c +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) Linaro. - * Copyright (C) Huawei Futurewei Technologies. 
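The cpuregs sysfs group above exposes the raw MIDR_EL1 value that c_show() breaks down into implementer, variant, part and revision. A short userspace sketch doing the same decoding; the exact sysfs path and the MIDR field positions (implementer [31:24], variant [23:20], part [15:4], revision [3:0]) are assumptions taken from the code above, not an interface guarantee:

#include <stdio.h>

int main(void)
{
	/* Path assumed from the cpuregs "identification" kobject above; pick any online CPU. */
	const char *path = "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1";
	unsigned long long midr;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%llx", &midr) != 1) {
		fclose(f);
		fprintf(stderr, "could not parse %s\n", path);
		return 1;
	}
	fclose(f);

	/* Same field breakdown that c_show() prints. */
	printf("CPU implementer\t: 0x%02llx\n", (midr >> 24) & 0xff);
	printf("CPU variant\t: 0x%llx\n", (midr >> 20) & 0xf);
	printf("CPU part\t: 0x%03llx\n", (midr >> 4) & 0xfff);
	printf("CPU revision\t: %llu\n", midr & 0xf);
	return 0;
}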
- */ - -#include <linux/crash_core.h> -#include <asm/memory.h> - -void arch_crash_save_vmcoreinfo(void) -{ - VMCOREINFO_NUMBER(VA_BITS); - /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ - vmcoreinfo_append_str("NUMBER(kimage_voffset)=0x%llx\n", - kimage_voffset); - vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n", - PHYS_OFFSET); - vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); -} diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c index 6b5037ed15b2..670e4ce81822 100644 --- a/arch/arm64/kernel/crash_dump.c +++ b/arch/arm64/kernel/crash_dump.c @@ -1,36 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Routines for doing kexec-based kdump * * Copyright (C) 2017 Linaro Limited * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/crash_dump.h> #include <linux/errno.h> #include <linux/io.h> -#include <linux/memblock.h> -#include <linux/uaccess.h> +#include <linux/uio.h> #include <asm/memory.h> -/** - * copy_oldmem_page() - copy one page from old kernel memory - * @pfn: page frame number to be copied - * @buf: buffer where the copied page is placed - * @csize: number of bytes to copy - * @offset: offset in bytes into the page - * @userbuf: if set, @buf is in a user address space - * - * This function copies one page from old kernel memory into buffer pointed by - * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes - * copied or negative error in case of failure. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -41,14 +24,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { - memunmap(vaddr); - return -EFAULT; - } - } else { - memcpy(buf, vaddr + offset, csize); - } + csize = copy_to_iter(vaddr + offset, csize, iter); memunmap(vaddr); @@ -67,5 +43,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count); + *ppos += count; + return count; } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index d7bb6aefae0a..29307642f4c9 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * ARMv8 single-step debug support and mdscr context switching. * * Copyright (C) 2012 ARM Limited * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * * Author: Will Deacon <will.deacon@arm.com> */ @@ -32,20 +21,24 @@ #include <asm/cputype.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> +#include <asm/exception.h> +#include <asm/kgdb.h> +#include <asm/kprobes.h> #include <asm/system_misc.h> #include <asm/traps.h> +#include <asm/uprobes.h> /* Determine debug architecture. */ u8 debug_monitors_arch(void) { return cpuid_feature_extract_unsigned_field(read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1), - ID_AA64DFR0_DEBUGVER_SHIFT); + ID_AA64DFR0_EL1_DebugVer_SHIFT); } /* * MDSCR access routines. */ -static void mdscr_write(u32 mdscr) +static void mdscr_write(u64 mdscr) { unsigned long flags; flags = local_daif_save(); @@ -54,7 +47,7 @@ static void mdscr_write(u32 mdscr) } NOKPROBE_SYMBOL(mdscr_write); -static u32 mdscr_read(void) +static u64 mdscr_read(void) { return read_sysreg(mdscr_el1); } @@ -90,16 +83,16 @@ static DEFINE_PER_CPU(int, kde_ref_count); void enable_debug_monitors(enum dbg_active_el el) { - u32 mdscr, enable = 0; + u64 mdscr, enable = 0; WARN_ON(preemptible()); if (this_cpu_inc_return(mde_ref_count) == 1) - enable = DBG_MDSCR_MDE; + enable = MDSCR_EL1_MDE; if (el == DBG_ACTIVE_EL1 && this_cpu_inc_return(kde_ref_count) == 1) - enable |= DBG_MDSCR_KDE; + enable |= MDSCR_EL1_KDE; if (enable && debug_enabled) { mdscr = mdscr_read(); @@ -111,16 +104,16 @@ NOKPROBE_SYMBOL(enable_debug_monitors); void disable_debug_monitors(enum dbg_active_el el) { - u32 mdscr, disable = 0; + u64 mdscr, disable = 0; WARN_ON(preemptible()); if (this_cpu_dec_return(mde_ref_count) == 0) - disable = ~DBG_MDSCR_MDE; + disable = ~MDSCR_EL1_MDE; if (el == DBG_ACTIVE_EL1 && this_cpu_dec_return(kde_ref_count) == 0) - disable &= ~DBG_MDSCR_KDE; + disable &= ~MDSCR_EL1_KDE; if (disable) { mdscr = mdscr_read(); @@ -135,12 +128,13 @@ NOKPROBE_SYMBOL(disable_debug_monitors); */ static int clear_os_lock(unsigned int cpu) { + write_sysreg(0, osdlr_el1); write_sysreg(0, oslar_el1); isb(); return 0; } -static int debug_monitors_init(void) +static int __init debug_monitors_init(void) { return cpuhp_setup_state(CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, "arm64/debug_monitors:starting", @@ -151,183 +145,139 @@ postcore_initcall(debug_monitors_init); /* * Single step API and exception handling. 
*/ -static void set_regs_spsr_ss(struct pt_regs *regs) +static void set_user_regs_spsr_ss(struct user_pt_regs *regs) { regs->pstate |= DBG_SPSR_SS; } -NOKPROBE_SYMBOL(set_regs_spsr_ss); +NOKPROBE_SYMBOL(set_user_regs_spsr_ss); -static void clear_regs_spsr_ss(struct pt_regs *regs) +static void clear_user_regs_spsr_ss(struct user_pt_regs *regs) { regs->pstate &= ~DBG_SPSR_SS; } -NOKPROBE_SYMBOL(clear_regs_spsr_ss); +NOKPROBE_SYMBOL(clear_user_regs_spsr_ss); -/* EL1 Single Step Handler hooks */ -static LIST_HEAD(step_hook); -static DEFINE_SPINLOCK(step_hook_lock); +#define set_regs_spsr_ss(r) set_user_regs_spsr_ss(&(r)->user_regs) +#define clear_regs_spsr_ss(r) clear_user_regs_spsr_ss(&(r)->user_regs) -void register_step_hook(struct step_hook *hook) +static void send_user_sigtrap(int si_code) { - spin_lock(&step_hook_lock); - list_add_rcu(&hook->node, &step_hook); - spin_unlock(&step_hook_lock); -} + struct pt_regs *regs = current_pt_regs(); -void unregister_step_hook(struct step_hook *hook) -{ - spin_lock(&step_hook_lock); - list_del_rcu(&hook->node); - spin_unlock(&step_hook_lock); - synchronize_rcu(); + if (WARN_ON(!user_mode(regs))) + return; + + if (!regs_irqs_disabled(regs)) + local_irq_enable(); + + arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), + "User debug trap"); } /* - * Call registered single step handlers - * There is no Syndrome info to check for determining the handler. - * So we call all the registered handlers, until the right handler is - * found which returns zero. + * We have already unmasked interrupts and enabled preemption + * when calling do_el0_softstep() from entry-common.c. */ -static int call_step_hook(struct pt_regs *regs, unsigned int esr) +void do_el0_softstep(unsigned long esr, struct pt_regs *regs) { - struct step_hook *hook; - int retval = DBG_HOOK_ERROR; - - rcu_read_lock(); + if (uprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) + return; - list_for_each_entry_rcu(hook, &step_hook, node) { - retval = hook->fn(regs, esr); - if (retval == DBG_HOOK_HANDLED) - break; - } + send_user_sigtrap(TRAP_TRACE); + /* + * ptrace will disable single step unless explicitly + * asked to re-enable it. For other clients, it makes + * sense to leave it enabled (i.e. rewind the controls + * to the active-not-pending state). + */ + user_rewind_single_step(current); +} - rcu_read_unlock(); +void do_el1_softstep(unsigned long esr, struct pt_regs *regs) +{ + if (kgdb_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) + return; - return retval; + pr_warn("Unexpected kernel single-step exception at EL1\n"); + /* + * Re-enable stepping since we know that we will be + * returning to regs. 
+ */ + set_regs_spsr_ss(regs); } -NOKPROBE_SYMBOL(call_step_hook); +NOKPROBE_SYMBOL(do_el1_softstep); -static void send_user_sigtrap(int si_code) +static int call_el1_break_hook(struct pt_regs *regs, unsigned long esr) { - struct pt_regs *regs = current_pt_regs(); + if (esr_brk_comment(esr) == BUG_BRK_IMM) + return bug_brk_handler(regs, esr); - if (WARN_ON(!user_mode(regs))) - return; + if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) + return cfi_brk_handler(regs, esr); - if (interrupts_enabled(regs)) - local_irq_enable(); + if (esr_brk_comment(esr) == FAULT_BRK_IMM) + return reserved_fault_brk_handler(regs, esr); - arm64_force_sig_fault(SIGTRAP, si_code, - (void __user *)instruction_pointer(regs), - "User debug trap"); -} + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) && + (esr_brk_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) + return kasan_brk_handler(regs, esr); -static int single_step_handler(unsigned long addr, unsigned int esr, - struct pt_regs *regs) -{ - bool handler_found = false; + if (IS_ENABLED(CONFIG_UBSAN_TRAP) && esr_is_ubsan_brk(esr)) + return ubsan_brk_handler(regs, esr); - /* - * If we are stepping a pending breakpoint, call the hw_breakpoint - * handler first. - */ - if (!reinstall_suspended_bps(regs)) - return 0; - -#ifdef CONFIG_KPROBES - if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; -#endif - if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; - - if (!handler_found && user_mode(regs)) { - send_user_sigtrap(TRAP_TRACE); - - /* - * ptrace will disable single step unless explicitly - * asked to re-enable it. For other clients, it makes - * sense to leave it enabled (i.e. rewind the controls - * to the active-not-pending state). - */ - user_rewind_single_step(current); - } else if (!handler_found) { - pr_warn("Unexpected kernel single-step exception at EL1\n"); - /* - * Re-enable stepping since we know that we will be - * returning to regs. - */ - set_regs_spsr_ss(regs); + if (IS_ENABLED(CONFIG_KGDB)) { + if (esr_brk_comment(esr) == KGDB_DYN_DBG_BRK_IMM) + return kgdb_brk_handler(regs, esr); + if (esr_brk_comment(esr) == KGDB_COMPILED_DBG_BRK_IMM) + return kgdb_compiled_brk_handler(regs, esr); } - return 0; + if (IS_ENABLED(CONFIG_KPROBES)) { + if (esr_brk_comment(esr) == KPROBES_BRK_IMM) + return kprobe_brk_handler(regs, esr); + if (esr_brk_comment(esr) == KPROBES_BRK_SS_IMM) + return kprobe_ss_brk_handler(regs, esr); + } + + if (IS_ENABLED(CONFIG_KRETPROBES) && + esr_brk_comment(esr) == KRETPROBES_BRK_IMM) + return kretprobe_brk_handler(regs, esr); + + return DBG_HOOK_ERROR; } -NOKPROBE_SYMBOL(single_step_handler); +NOKPROBE_SYMBOL(call_el1_break_hook); /* - * Breakpoint handler is re-entrant as another breakpoint can - * hit within breakpoint handler, especically in kprobes. - * Use reader/writer locks instead of plain spinlock. + * We have already unmasked interrupts and enabled preemption + * when calling do_el0_brk64() from entry-common.c. 
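call_el1_break_hook() above routes kernel BRK exceptions by the 16-bit immediate that the CPU records in the low bits of ESR_ELx (the esr_brk_comment() helper). A standalone sketch of that field extraction; the EC value 0x3c for BRK64 and the ISS[15:0] comment field follow the architectural ESR layout, and the 0x800 immediate is only an example value:

#include <stdint.h>
#include <stdio.h>

/* ESR_ELx layout: EC in bits [31:26], BRK64 immediate ("comment") in ISS[15:0]. */
#define ESR_EC_SHIFT		26
#define ESR_EC_BRK64		0x3c
#define BRK_COMMENT_MASK	0xffffULL

static unsigned int brk_comment(uint64_t esr)
{
	return esr & BRK_COMMENT_MASK;
}

int main(void)
{
	/* Synthesised ESR for a BRK64 exception carrying an example immediate of 0x800. */
	uint64_t esr = ((uint64_t)ESR_EC_BRK64 << ESR_EC_SHIFT) | 0x800;

	if (((esr >> ESR_EC_SHIFT) & 0x3f) == ESR_EC_BRK64)
		printf("BRK immediate: %#x\n", brk_comment(esr));
	return 0;
}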
*/ -static LIST_HEAD(break_hook); -static DEFINE_SPINLOCK(break_hook_lock); - -void register_break_hook(struct break_hook *hook) +void do_el0_brk64(unsigned long esr, struct pt_regs *regs) { - spin_lock(&break_hook_lock); - list_add_rcu(&hook->node, &break_hook); - spin_unlock(&break_hook_lock); -} + if (IS_ENABLED(CONFIG_UPROBES) && + esr_brk_comment(esr) == UPROBES_BRK_IMM && + uprobe_brk_handler(regs, esr) == DBG_HOOK_HANDLED) + return; -void unregister_break_hook(struct break_hook *hook) -{ - spin_lock(&break_hook_lock); - list_del_rcu(&hook->node); - spin_unlock(&break_hook_lock); - synchronize_rcu(); + send_user_sigtrap(TRAP_BRKPT); } -static int call_break_hook(struct pt_regs *regs, unsigned int esr) +void do_el1_brk64(unsigned long esr, struct pt_regs *regs) { - struct break_hook *hook; - int (*fn)(struct pt_regs *regs, unsigned int esr) = NULL; - - rcu_read_lock(); - list_for_each_entry_rcu(hook, &break_hook, node) - if ((esr & hook->esr_mask) == hook->esr_val) - fn = hook->fn; - rcu_read_unlock(); + if (call_el1_break_hook(regs, esr) == DBG_HOOK_HANDLED) + return; - return fn ? fn(regs, esr) : DBG_HOOK_ERROR; + die("Oops - BRK", regs, esr); } -NOKPROBE_SYMBOL(call_break_hook); +NOKPROBE_SYMBOL(do_el1_brk64); -static int brk_handler(unsigned long addr, unsigned int esr, - struct pt_regs *regs) +#ifdef CONFIG_COMPAT +void do_bkpt32(unsigned long esr, struct pt_regs *regs) { - bool handler_found = false; - -#ifdef CONFIG_KPROBES - if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { - if (kprobe_breakpoint_handler(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; - } -#endif - if (!handler_found && call_break_hook(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; - - if (!handler_found && user_mode(regs)) { - send_user_sigtrap(TRAP_BRKPT); - } else if (!handler_found) { - pr_warn("Unexpected kernel BRK exception at EL1\n"); - return -EFAULT; - } - - return 0; + arm64_notify_die("aarch32 BKPT", regs, SIGTRAP, TRAP_BRKPT, regs->pc, esr); } -NOKPROBE_SYMBOL(brk_handler); +#endif /* CONFIG_COMPAT */ -int aarch32_break_handler(struct pt_regs *regs) +bool try_handle_aarch32_break(struct pt_regs *regs) { u32 arm_instr; u16 thumb_instr; @@ -335,7 +285,7 @@ int aarch32_break_handler(struct pt_regs *regs) void __user *pc = (void __user *)instruction_pointer(regs); if (!compat_user_mode(regs)) - return -EFAULT; + return false; if (compat_thumb_mode(regs)) { /* get 16-bit Thumb instruction */ @@ -359,22 +309,12 @@ int aarch32_break_handler(struct pt_regs *regs) } if (!bp) - return -EFAULT; + return false; send_user_sigtrap(TRAP_BRKPT); - return 0; -} -NOKPROBE_SYMBOL(aarch32_break_handler); - -static int __init debug_traps_init(void) -{ - hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP, - TRAP_TRACE, "single-step handler"); - hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP, - TRAP_BRKPT, "ptrace BRK handler"); - return 0; + return true; } -arch_initcall(debug_traps_init); +NOKPROBE_SYMBOL(try_handle_aarch32_break); /* Re-enable single step for syscall restarting. */ void user_rewind_single_step(struct task_struct *task) @@ -383,23 +323,32 @@ void user_rewind_single_step(struct task_struct *task) * If single step is active for this thread, then set SPSR.SS * to 1 to avoid returning to the active-pending state. 
*/ - if (test_ti_thread_flag(task_thread_info(task), TIF_SINGLESTEP)) + if (test_tsk_thread_flag(task, TIF_SINGLESTEP)) set_regs_spsr_ss(task_pt_regs(task)); } NOKPROBE_SYMBOL(user_rewind_single_step); void user_fastforward_single_step(struct task_struct *task) { - if (test_ti_thread_flag(task_thread_info(task), TIF_SINGLESTEP)) + if (test_tsk_thread_flag(task, TIF_SINGLESTEP)) clear_regs_spsr_ss(task_pt_regs(task)); } +void user_regs_reset_single_step(struct user_pt_regs *regs, + struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_SINGLESTEP)) + set_user_regs_spsr_ss(regs); + else + clear_user_regs_spsr_ss(regs); +} + /* Kernel API */ void kernel_enable_single_step(struct pt_regs *regs) { WARN_ON(!irqs_disabled()); set_regs_spsr_ss(regs); - mdscr_write(mdscr_read() | DBG_MDSCR_SS); + mdscr_write(mdscr_read() | MDSCR_EL1_SS); enable_debug_monitors(DBG_ACTIVE_EL1); } NOKPROBE_SYMBOL(kernel_enable_single_step); @@ -407,7 +356,7 @@ NOKPROBE_SYMBOL(kernel_enable_single_step); void kernel_disable_single_step(void) { WARN_ON(!irqs_disabled()); - mdscr_write(mdscr_read() & ~DBG_MDSCR_SS); + mdscr_write(mdscr_read() & ~MDSCR_EL1_SS); disable_debug_monitors(DBG_ACTIVE_EL1); } NOKPROBE_SYMBOL(kernel_disable_single_step); @@ -415,10 +364,20 @@ NOKPROBE_SYMBOL(kernel_disable_single_step); int kernel_active_single_step(void) { WARN_ON(!irqs_disabled()); - return mdscr_read() & DBG_MDSCR_SS; + return mdscr_read() & MDSCR_EL1_SS; } NOKPROBE_SYMBOL(kernel_active_single_step); +void kernel_rewind_single_step(struct pt_regs *regs) +{ + set_regs_spsr_ss(regs); +} + +void kernel_fastforward_single_step(struct pt_regs *regs) +{ + clear_regs_spsr_ss(regs); +} + /* ptrace API */ void user_enable_single_step(struct task_struct *task) { diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S deleted file mode 100644 index 6b9736c3fb56..000000000000 --- a/arch/arm64/kernel/efi-entry.S +++ /dev/null @@ -1,124 +0,0 @@ -/* - * EFI entry point. - * - * Copyright (C) 2013, 2014 Red Hat, Inc. - * Author: Mark Salter <msalter@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ -#include <linux/linkage.h> -#include <linux/init.h> - -#include <asm/assembler.h> - -#define EFI_LOAD_ERROR 0x8000000000000001 - - __INIT - - /* - * We arrive here from the EFI boot manager with: - * - * * CPU in little-endian mode - * * MMU on with identity-mapped RAM - * * Icache and Dcache on - * - * We will most likely be running from some place other than where - * we want to be. The kernel image wants to be placed at TEXT_OFFSET - * from start of RAM. - */ -ENTRY(entry) - /* - * Create a stack frame to save FP/LR with extra space - * for image_addr variable passed to efi_entry(). - */ - stp x29, x30, [sp, #-32]! - mov x29, sp - - /* - * Call efi_entry to do the real work. - * x0 and x1 are already set up by firmware. Current runtime - * address of image is calculated and passed via *image_addr. - * - * unsigned long efi_entry(void *handle, - * efi_system_table_t *sys_table, - * unsigned long *image_addr) ; - */ - adr_l x8, _text - add x2, sp, 16 - str x8, [x2] - bl efi_entry - cmn x0, #1 - b.eq efi_load_fail - - /* - * efi_entry() will have copied the kernel image if necessary and we - * return here with device tree address in x0 and the kernel entry - * point stored at *image_addr. Save those values in registers which - * are callee preserved. 
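do_el0_softstep() and the user_rewind/fastforward helpers above are what back ptrace single-stepping of user tasks: each step ends in a SIGTRAP delivered to the tracer. A minimal tracer sketch using the generic ptrace(2) interface; the child workload and step count are arbitrary:

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t child = fork();

	if (child == 0) {
		/* Child: request tracing, stop, then run a short busy loop. */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		for (volatile int i = 0; i < 100; i++)
			;
		_exit(0);
	}

	waitpid(child, &status, 0);	/* initial SIGSTOP */

	/*
	 * Each PTRACE_SINGLESTEP resumes the child for one instruction,
	 * after which it stops again with SIGTRAP (TRAP_TRACE).
	 */
	for (int step = 0; step < 20 && !WIFEXITED(status); step++) {
		if (ptrace(PTRACE_SINGLESTEP, child, NULL, NULL) == -1) {
			perror("PTRACE_SINGLESTEP");
			break;
		}
		waitpid(child, &status, 0);
	}

	if (!WIFEXITED(status)) {
		kill(child, SIGKILL);
		waitpid(child, &status, 0);
	}
	return 0;
}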
- */ - mov x20, x0 // DTB address - ldr x0, [sp, #16] // relocated _text address - ldr w21, =stext_offset - add x21, x0, x21 - - /* - * Calculate size of the kernel Image (same for original and copy). - */ - adr_l x1, _text - adr_l x2, _edata - sub x1, x2, x1 - - /* - * Flush the copied Image to the PoC, and ensure it is not shadowed by - * stale icache entries from before relocation. - */ - bl __flush_dcache_area - ic ialluis - - /* - * Ensure that the rest of this function (in the original Image) is - * visible when the caches are disabled. The I-cache can't have stale - * entries for the VA range of the current image, so no maintenance is - * necessary. - */ - adr x0, entry - adr x1, entry_end - sub x1, x1, x0 - bl __flush_dcache_area - - /* Turn off Dcache and MMU */ - mrs x0, CurrentEL - cmp x0, #CurrentEL_EL2 - b.ne 1f - mrs x0, sctlr_el2 - bic x0, x0, #1 << 0 // clear SCTLR.M - bic x0, x0, #1 << 2 // clear SCTLR.C - pre_disable_mmu_workaround - msr sctlr_el2, x0 - isb - b 2f -1: - mrs x0, sctlr_el1 - bic x0, x0, #1 << 0 // clear SCTLR.M - bic x0, x0, #1 << 2 // clear SCTLR.C - pre_disable_mmu_workaround - msr sctlr_el1, x0 - isb -2: - /* Jump to kernel entry point */ - mov x0, x20 - mov x1, xzr - mov x2, xzr - mov x3, xzr - br x21 - -efi_load_fail: - mov x0, #EFI_LOAD_ERROR - ldp x29, x30, [sp], #32 - ret - -entry_end: -ENDPROC(entry) diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index 613fc3000677..329e8df9215f 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -1,63 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2013 - 2017 Linaro, Ltd. * Copyright (C) 2013, 2014 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/pe.h> #include <linux/sizes.h> + .macro efi_signature_nop +#ifdef CONFIG_EFI +.L_head: + /* + * This ccmp instruction has no meaningful effect except that + * its opcode forms the magic "MZ" signature required by UEFI. + */ + ccmp x18, #0, #0xd, pl +#else + /* + * Bootloaders may inspect the opcode at the start of the kernel + * image to decide if the kernel is capable of booting via UEFI. + * So put an ordinary NOP here, not the "MZ.." pseudo-nop above. + */ + nop +#endif + .endm + .macro __EFI_PE_HEADER - .long PE_MAGIC -coff_header: +#ifdef CONFIG_EFI + .set .Lpe_header_offset, . 
- .L_head + .long IMAGE_NT_SIGNATURE .short IMAGE_FILE_MACHINE_ARM64 // Machine - .short section_count // NumberOfSections + .short .Lsection_count // NumberOfSections .long 0 // TimeDateStamp .long 0 // PointerToSymbolTable .long 0 // NumberOfSymbols - .short section_table - optional_header // SizeOfOptionalHeader + .short .Lsection_table - .Loptional_header // SizeOfOptionalHeader .short IMAGE_FILE_DEBUG_STRIPPED | \ IMAGE_FILE_EXECUTABLE_IMAGE | \ IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics -optional_header: - .short PE_OPT_MAGIC_PE32PLUS // PE32+ format +.Loptional_header: + .short IMAGE_NT_OPTIONAL_HDR64_MAGIC // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion - .long __initdata_begin - efi_header_end // SizeOfCode + .long __initdata_begin - .Lefi_header_end // SizeOfCode .long __pecoff_data_size // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_entry - _head // AddressOfEntryPoint - .long efi_header_end - _head // BaseOfCode + .long __efistub_efi_pe_entry - .L_head // AddressOfEntryPoint + .long .Lefi_header_end - .L_head // BaseOfCode -extra_header_fields: .quad 0 // ImageBase - .long SZ_4K // SectionAlignment + .long SEGMENT_ALIGN // SectionAlignment .long PECOFF_FILE_ALIGNMENT // FileAlignment .short 0 // MajorOperatingSystemVersion .short 0 // MinorOperatingSystemVersion - .short 0 // MajorImageVersion - .short 0 // MinorImageVersion + .short LINUX_EFISTUB_MAJOR_VERSION // MajorImageVersion + .short LINUX_EFISTUB_MINOR_VERSION // MinorImageVersion .short 0 // MajorSubsystemVersion .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - _head // SizeOfImage + .long _end - .L_head // SizeOfImage // Everything before the kernel image is considered part of the header - .long efi_header_end - _head // SizeOfHeaders + .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem - .short 0 // DllCharacteristics + .short IMAGE_DLLCHARACTERISTICS_NX_COMPAT // DllCharacteristics .quad 0 // SizeOfStackReserve .quad 0 // SizeOfStackCommit .quad 0 // SizeOfHeapReserve .quad 0 // SizeOfHeapCommit .long 0 // LoaderFlags - .long (section_table - .) / 8 // NumberOfRvaAndSizes + .long (.Lsection_table - .) / 8 // NumberOfRvaAndSizes .quad 0 // ExportTable .quad 0 // ImportTable @@ -66,18 +81,56 @@ extra_header_fields: .quad 0 // CertificationTable .quad 0 // BaseRelocationTable +#if defined(CONFIG_DEBUG_EFI) || defined(CONFIG_ARM64_BTI_KERNEL) + .long .Lefi_debug_table - .L_head // DebugTable + .long .Lefi_debug_table_size + + /* + * The debug table is referenced via its Relative Virtual Address (RVA), + * which is only defined for those parts of the image that are covered + * by a section declaration. Since this header is not covered by any + * section, the debug table must be emitted elsewhere. So stick it in + * the .init.rodata section instead. + * + * Note that the payloads themselves are permitted to have zero RVAs, + * which means we can simply put those right after the section headers. 
+ */ + __INITRODATA + + .align 2 +.Lefi_debug_table: #ifdef CONFIG_DEBUG_EFI - .long efi_debug_table - _head // DebugTable - .long efi_debug_table_size + // EFI_IMAGE_DEBUG_DIRECTORY_ENTRY + .long 0 // Characteristics + .long 0 // TimeDateStamp + .short 0 // MajorVersion + .short 0 // MinorVersion + .long IMAGE_DEBUG_TYPE_CODEVIEW // Type + .long .Lefi_debug_entry_size // SizeOfData + .long 0 // RVA + .long .Lefi_debug_entry - .L_head // FileOffset +#endif +#ifdef CONFIG_ARM64_BTI_KERNEL + .long 0 // Characteristics + .long 0 // TimeDateStamp + .short 0 // MajorVersion + .short 0 // MinorVersion + .long IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS // Type + .long 4 // SizeOfData + .long 0 // RVA + .long .Lefi_dll_characteristics_ex - .L_head // FileOffset +#endif + .set .Lefi_debug_table_size, . - .Lefi_debug_table + .previous #endif // Section table -section_table: +.Lsection_table: .ascii ".text\0\0\0" - .long __initdata_begin - efi_header_end // VirtualSize - .long efi_header_end - _head // VirtualAddress - .long __initdata_begin - efi_header_end // SizeOfRawData - .long efi_header_end - _head // PointerToRawData + .long __initdata_begin - .Lefi_header_end // VirtualSize + .long .Lefi_header_end - .L_head // VirtualAddress + .long __initdata_begin - .Lefi_header_end // SizeOfRawData + .long .Lefi_header_end - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -89,9 +142,9 @@ section_table: .ascii ".data\0\0\0" .long __pecoff_data_size // VirtualSize - .long __initdata_begin - _head // VirtualAddress + .long __initdata_begin - .L_head // VirtualAddress .long __pecoff_data_rawsize // SizeOfRawData - .long __initdata_begin - _head // PointerToRawData + .long __initdata_begin - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -101,37 +154,10 @@ section_table: IMAGE_SCN_MEM_READ | \ IMAGE_SCN_MEM_WRITE // Characteristics - .set section_count, (. - section_table) / 40 + .set .Lsection_count, (. - .Lsection_table) / 40 #ifdef CONFIG_DEBUG_EFI - /* - * The debug table is referenced via its Relative Virtual Address (RVA), - * which is only defined for those parts of the image that are covered - * by a section declaration. Since this header is not covered by any - * section, the debug table must be emitted elsewhere. So stick it in - * the .init.rodata section instead. - * - * Note that the EFI debug entry itself may legally have a zero RVA, - * which means we can simply put it right after the section headers. - */ - __INITRODATA - - .align 2 -efi_debug_table: - // EFI_IMAGE_DEBUG_DIRECTORY_ENTRY - .long 0 // Characteristics - .long 0 // TimeDateStamp - .short 0 // MajorVersion - .short 0 // MinorVersion - .long IMAGE_DEBUG_TYPE_CODEVIEW // Type - .long efi_debug_entry_size // SizeOfData - .long 0 // RVA - .long efi_debug_entry - _head // FileOffset - - .set efi_debug_table_size, . - efi_debug_table - .previous - -efi_debug_entry: +.Lefi_debug_entry: // EFI_IMAGE_DEBUG_CODEVIEW_NB10_ENTRY .ascii "NB10" // Signature .long 0 // Unknown @@ -140,16 +166,16 @@ efi_debug_entry: .asciz VMLINUX_PATH - .set efi_debug_entry_size, . - efi_debug_entry + .set .Lefi_debug_entry_size, . - .Lefi_debug_entry +#endif +#ifdef CONFIG_ARM64_BTI_KERNEL +.Lefi_dll_characteristics_ex: + .long IMAGE_DLLCHARACTERISTICS_EX_FORWARD_CFI_COMPAT #endif - /* - * EFI will load .text onwards at the 4k section alignment - * described in the PE/COFF header. 
To ensure that instruction - * sequences using an adrp and a :lo12: immediate will function - * correctly at this alignment, we must ensure that .text is - * placed at a 4k boundary in the Image to begin with. - */ - .align 12 -efi_header_end: + .balign SEGMENT_ALIGN +.Lefi_header_end: +#else + .set .Lpe_header_offset, 0x0 +#endif .endm diff --git a/arch/arm64/kernel/efi-rt-wrapper.S b/arch/arm64/kernel/efi-rt-wrapper.S index 05235ebb336d..e8ae803662cf 100644 --- a/arch/arm64/kernel/efi-rt-wrapper.S +++ b/arch/arm64/kernel/efi-rt-wrapper.S @@ -1,15 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/linkage.h> +#include <asm/assembler.h> -ENTRY(__efi_rt_asm_wrapper) - stp x29, x30, [sp, #-32]! +SYM_FUNC_START(__efi_rt_asm_wrapper) + stp x29, x30, [sp, #-112]! mov x29, sp /* @@ -20,6 +18,22 @@ ENTRY(__efi_rt_asm_wrapper) stp x1, x18, [sp, #16] /* + * Preserve all callee saved registers and preserve the stack pointer + * value at the base of the EFI runtime stack so we can recover from + * synchronous exceptions occurring while executing the firmware + * routines. + */ + stp x19, x20, [sp, #32] + stp x21, x22, [sp, #48] + stp x23, x24, [sp, #64] + stp x25, x26, [sp, #80] + stp x27, x28, [sp, #96] + + ldr_l x16, efi_rt_stack_top + mov sp, x16 + stp x18, x29, [sp, #-16]! + + /* * We are lucky enough that no EFI runtime services take more than * 5 arguments, so all are passed in registers rather than via the * stack. @@ -32,10 +46,42 @@ ENTRY(__efi_rt_asm_wrapper) mov x4, x6 blr x8 + mov x16, sp + mov sp, x29 + str xzr, [x16, #8] // clear recorded task SP value + ldp x1, x2, [sp, #16] cmp x2, x18 - ldp x29, x30, [sp], #32 + ldp x29, x30, [sp], #112 b.ne 0f ret -0: b efi_handle_corrupted_x18 // tail call -ENDPROC(__efi_rt_asm_wrapper) +0: + /* + * With CONFIG_SHADOW_CALL_STACK, the kernel uses x18 to store a + * shadow stack pointer, which we need to restore before returning to + * potentially instrumented code. This is safe because the wrapper is + * called with preemption disabled and a separate shadow stack is used + * for interrupts. + */ +#ifdef CONFIG_SHADOW_CALL_STACK + ldr_l x18, efi_rt_stack_top + ldr x18, [x18, #-16] +#endif + + b efi_handle_corrupted_x18 // tail call +SYM_FUNC_END(__efi_rt_asm_wrapper) + +SYM_CODE_START(__efi_rt_asm_recover) + mov sp, x30 + + ldr_l x16, efi_rt_stack_top // clear recorded task SP value + str xzr, [x16, #-8] + + ldp x19, x20, [sp, #32] + ldp x21, x22, [sp, #48] + ldp x23, x24, [sp, #64] + ldp x25, x26, [sp, #80] + ldp x27, x28, [sp, #96] + ldp x29, x30, [sp], #112 + ret +SYM_CODE_END(__efi_rt_asm_recover) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 4f9acb5fbe97..a81cb4aa4738 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -1,42 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Extensible Firmware Interface * * Based on Extensible Firmware Interface Specification version 2.4 * * Copyright (C) 2013, 2014 Linaro Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
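efi_signature_nop and the __EFI_PE_HEADER macro above let the Image double as a PE/COFF executable: the first instruction encodes the "MZ" magic and .Lpe_header_offset records where the PE header lives. A small sketch that checks an Image file for those markers, assuming the documented arm64 Image header layout with the "ARM\x64" magic at offset 56 and the PE header offset stored in the reserved field at 0x3c:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	unsigned char hdr[64];
	uint32_t pe_off;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <arm64 Image>\n", argv[0]);
		return 1;
	}
	f = fopen(argv[1], "rb");
	if (!f) {
		perror(argv[1]);
		return 1;
	}
	if (fread(hdr, 1, sizeof(hdr), f) != sizeof(hdr)) {
		fprintf(stderr, "%s: short read\n", argv[1]);
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("MZ signature : %s\n", memcmp(hdr, "MZ", 2) ? "absent" : "present");
	printf("ARM64 magic  : %s\n", memcmp(hdr + 56, "ARM\x64", 4) ? "absent" : "present");

	/* Reserved field at 0x3c, assumed to hold the PE header offset as set up above. */
	memcpy(&pe_off, hdr + 0x3c, sizeof(pe_off));
	printf("PE header at : %#x (0 when built without CONFIG_EFI)\n", (unsigned int)pe_off);
	return 0;
}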
- * */ #include <linux/efi.h> #include <linux/init.h> +#include <linux/kmemleak.h> +#include <linux/kthread.h> +#include <linux/screen_info.h> +#include <linux/vmalloc.h> #include <asm/efi.h> +#include <asm/stacktrace.h> +#include <asm/vmap_stack.h> + +static bool region_is_misaligned(const efi_memory_desc_t *md) +{ + if (PAGE_SIZE == EFI_PAGE_SIZE) + return false; + return !PAGE_ALIGNED(md->phys_addr) || + !PAGE_ALIGNED(md->num_pages << EFI_PAGE_SHIFT); +} /* * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be * executable, everything else can be mapped with the XN bits * set. Also take the new (optional) RO/XP bits into account. */ -static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) +static __init ptdesc_t create_mapping_protection(efi_memory_desc_t *md) { u64 attr = md->attribute; u32 type = md->type; - if (type == EFI_MEMORY_MAPPED_IO) - return PROT_DEVICE_nGnRE; + if (type == EFI_MEMORY_MAPPED_IO) { + pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE); + + if (arm64_is_protected_mmio(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT)) + prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); + return pgprot_val(prot); + } + + if (region_is_misaligned(md)) { + static bool __initdata code_is_misaligned; - if (WARN_ONCE(!PAGE_ALIGNED(md->phys_addr), - "UEFI Runtime regions are not aligned to 64 KB -- buggy firmware?")) /* - * If the region is not aligned to the page size of the OS, we - * can not use strict permissions, since that would also affect - * the mapping attributes of the adjacent regions. + * Regions that are not aligned to the OS page size cannot be + * mapped with strict permissions, as those might interfere + * with the permissions that are needed by the adjacent + * region's mapping. However, if we haven't encountered any + * misaligned runtime code regions so far, we can safely use + * non-executable permissions for non-code regions. */ - return pgprot_val(PAGE_KERNEL_EXEC); + code_is_misaligned |= (type == EFI_RUNTIME_SERVICES_CODE); + + return code_is_misaligned ? pgprot_val(PAGE_KERNEL_EXEC) + : pgprot_val(PAGE_KERNEL); + } /* R-- */ if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) == @@ -57,28 +83,22 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) return pgprot_val(PAGE_KERNEL_EXEC); } -/* we will fill this structure from the stub, so don't put it in .bss */ -struct screen_info screen_info __section(.data); - int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { - pteval_t prot_val = create_mapping_protection(md); + ptdesc_t prot_val = create_mapping_protection(md); bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_RUNTIME_SERVICES_DATA); - if (!PAGE_ALIGNED(md->phys_addr) || - !PAGE_ALIGNED(md->num_pages << EFI_PAGE_SHIFT)) { - /* - * If the end address of this region is not aligned to page - * size, the mapping is rounded up, and may end up sharing a - * page frame with the next UEFI memory region. If we create - * a block entry now, we may need to split it again when mapping - * the next region, and support for that is going to be removed - * from the MMU routines. So avoid block mappings altogether in - * that case. - */ + /* + * If this region is not aligned to the page size used by the OS, the + * mapping will be rounded outwards, and may end up sharing a page + * frame with an adjacent runtime memory region. 
Given that the page + * table descriptor covering the shared page will be rewritten when the + * adjacent region gets mapped, we must avoid block mappings here so we + * don't have to worry about splitting them when that happens. + */ + if (region_is_misaligned(md)) page_mappings_only = true; - } create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, @@ -86,26 +106,39 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) return 0; } -static int __init set_permissions(pte_t *ptep, pgtable_t token, - unsigned long addr, void *data) +struct set_perm_data { + const efi_memory_desc_t *md; + bool has_bti; +}; + +static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { - efi_memory_desc_t *md = data; - pte_t pte = READ_ONCE(*ptep); + struct set_perm_data *spd = data; + const efi_memory_desc_t *md = spd->md; + pte_t pte = __ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); if (md->attribute & EFI_MEMORY_XP) pte = set_pte_bit(pte, __pgprot(PTE_PXN)); - set_pte(ptep, pte); + else if (system_supports_bti_kernel() && spd->has_bti) + pte = set_pte_bit(pte, __pgprot(PTE_GP)); + __set_pte(ptep, pte); return 0; } int __init efi_set_mapping_permissions(struct mm_struct *mm, - efi_memory_desc_t *md) + efi_memory_desc_t *md, + bool has_bti) { + struct set_perm_data data = { md, has_bti }; + BUG_ON(md->type != EFI_RUNTIME_SERVICES_CODE && md->type != EFI_RUNTIME_SERVICES_DATA); + if (region_is_misaligned(md)) + return 0; + /* * Calling apply_to_page_range() is only safe on regions that are * guaranteed to be mapped down to pages. Since we are only called @@ -115,7 +148,7 @@ int __init efi_set_mapping_permissions(struct mm_struct *mm, */ return apply_to_page_range(mm, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, - set_permissions, md); + set_permissions, &data); } /* @@ -132,3 +165,99 @@ asmlinkage efi_status_t efi_handle_corrupted_x18(efi_status_t s, const char *f) pr_err_ratelimited(FW_BUG "register x18 corrupted by EFI %s\n", f); return s; } + +void arch_efi_call_virt_setup(void) +{ + efi_runtime_assert_lock_held(); + + if (preemptible() && (current->flags & PF_KTHREAD)) { + /* + * Disable migration to ensure that a preempted EFI runtime + * service call will be resumed on the same CPU. This avoids + * potential issues with EFI runtime calls that are preempted + * while polling for an asynchronous completion of a secure + * firmware call, which may not permit the CPU to change. + */ + migrate_disable(); + kthread_use_mm(&efi_mm); + } else { + efi_virtmap_load(); + } + + /* + * Enable access to the valid TTBR0_EL1 and invoke the errata + * workaround directly since there is no return from exception when + * invoking the EFI run-time services. + */ + uaccess_ttbr0_enable(); + post_ttbr_update_workaround(); + + __efi_fpsimd_begin(); +} + +void arch_efi_call_virt_teardown(void) +{ + __efi_fpsimd_end(); + + /* + * Defer the switch to the current thread's TTBR0_EL1 until + * uaccess_enable(). Do so before efi_virtmap_unload() updates the + * saved TTBR0 value, so the userland page tables are not activated + * inadvertently over the back of an exception. 
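region_is_misaligned() above exists because UEFI always describes memory in 4 KiB EFI pages, while the kernel may be built with 16 KiB or 64 KiB pages, so a runtime region can start or end inside an OS page. A standalone restatement of the same check, with the OS page size passed in explicitly for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EFI_PAGE_SHIFT	12	/* UEFI page size is fixed at 4 KiB */

static bool region_is_misaligned(uint64_t phys_addr, uint64_t num_efi_pages,
				 uint64_t os_page_size)
{
	uint64_t size = num_efi_pages << EFI_PAGE_SHIFT;

	/* OS and EFI page sizes match: every EFI region is OS-page aligned. */
	if (os_page_size == (1ULL << EFI_PAGE_SHIFT))
		return false;

	return (phys_addr & (os_page_size - 1)) || (size & (os_page_size - 1));
}

int main(void)
{
	/* A 20 KiB region at 0x82001000 is fine with 4 KiB pages ... */
	printf("%d\n", region_is_misaligned(0x82001000ULL, 5, 4096));
	/* ... but misaligned when the kernel runs with 64 KiB pages. */
	printf("%d\n", region_is_misaligned(0x82001000ULL, 5, 65536));
	return 0;
}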
+ */ + uaccess_ttbr0_disable(); + + if (preemptible() && (current->flags & PF_KTHREAD)) { + kthread_unuse_mm(&efi_mm); + migrate_enable(); + } else { + efi_virtmap_unload(); + } +} + +asmlinkage u64 *efi_rt_stack_top __ro_after_init; + +asmlinkage efi_status_t __efi_rt_asm_recover(void); + +bool efi_runtime_fixup_exception(struct pt_regs *regs, const char *msg) +{ + /* Check whether the exception occurred while running the firmware */ + if (!current_in_efi() || regs->pc >= TASK_SIZE_64) + return false; + + pr_err(FW_BUG "Unable to handle %s in EFI runtime service\n", msg); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + + regs->regs[0] = EFI_ABORTED; + regs->regs[30] = efi_rt_stack_top[-1]; + regs->pc = (u64)__efi_rt_asm_recover; + + if (IS_ENABLED(CONFIG_SHADOW_CALL_STACK)) + regs->regs[18] = efi_rt_stack_top[-2]; + + return true; +} + +/* EFI requires 8 KiB of stack space for runtime services */ +static_assert(THREAD_SIZE >= SZ_8K); + +static int __init arm64_efi_rt_init(void) +{ + void *p; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return 0; + + p = arch_alloc_vmap_stack(THREAD_SIZE, NUMA_NO_NODE); + if (!p) { + pr_warn("Failed to allocate EFI runtime stack\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return -ENOMEM; + } + + kmemleak_not_leak(p); + efi_rt_stack_top = p + THREAD_SIZE; + return 0; +} +core_initcall(arm64_efi_rt_init); diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c new file mode 100644 index 000000000000..b735f4c2fe5e --- /dev/null +++ b/arch/arm64/kernel/elfcore.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/coredump.h> +#include <linux/elfcore.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +#include <asm/cpufeature.h> +#include <asm/mte.h> + +#define for_each_mte_vma(cprm, i, m) \ + if (system_supports_mte()) \ + for (i = 0, m = cprm->vma_meta; \ + i < cprm->vma_count; \ + i++, m = cprm->vma_meta + i) \ + if (m->flags & VM_MTE) + +static unsigned long mte_vma_tag_dump_size(struct core_vma_metadata *m) +{ + return (m->dump_size >> PAGE_SHIFT) * MTE_PAGE_TAG_STORAGE; +} + +/* Derived from dump_user_range(); start/end must be page-aligned */ +static int mte_dump_tag_range(struct coredump_params *cprm, + unsigned long start, unsigned long len) +{ + int ret = 1; + unsigned long addr; + void *tags = NULL; + int locked = 0; + + for (addr = start; addr < start + len; addr += PAGE_SIZE) { + struct page *page = get_dump_page(addr, &locked); + + /* + * get_dump_page() returns NULL when encountering an empty + * page table entry that would otherwise have been filled with + * the zero page. Skip the equivalent tag dump which would + * have been all zeros. + */ + if (!page) { + dump_skip(cprm, MTE_PAGE_TAG_STORAGE); + continue; + } + + /* + * Pages mapped in user space as !pte_access_permitted() (e.g. + * PROT_EXEC only) may not have the PG_mte_tagged flag set. 
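mte_vma_tag_dump_size() above converts a VMA's dump size into the number of tag bytes written to the core file. The arithmetic below assumes the usual MTE parameters (one 4-bit tag per 16-byte granule, packed two tags per byte) and 4 KiB pages; it is a back-of-the-envelope sketch, not the kernel's MTE_PAGE_TAG_STORAGE definition itself:

#include <stdint.h>
#include <stdio.h>

#define MTE_GRANULE_SIZE	16ULL	/* assumed: bytes covered by one 4-bit tag */

/* Packed tag bytes needed for one page: two 4-bit tags per byte. */
static uint64_t mte_page_tag_storage(uint64_t page_size)
{
	return page_size / MTE_GRANULE_SIZE / 2;
}

int main(void)
{
	uint64_t page_size = 4096;	/* assumed 4 KiB pages for the example */
	uint64_t vma_dump_size = 6 * page_size;

	/* Mirrors mte_vma_tag_dump_size(): pages dumped times per-page tag storage. */
	printf("tag bytes for a %llu-byte dump: %llu\n",
	       (unsigned long long)vma_dump_size,
	       (unsigned long long)((vma_dump_size / page_size) *
				    mte_page_tag_storage(page_size)));
	return 0;
}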
+ */ + if (!page_mte_tagged(page)) { + put_page(page); + dump_skip(cprm, MTE_PAGE_TAG_STORAGE); + continue; + } + + if (!tags) { + tags = mte_allocate_tag_storage(); + if (!tags) { + put_page(page); + ret = 0; + break; + } + } + + mte_save_page_tags(page_address(page), tags); + put_page(page); + if (!dump_emit(cprm, tags, MTE_PAGE_TAG_STORAGE)) { + ret = 0; + break; + } + } + + if (tags) + mte_free_tag_storage(tags); + + return ret; +} + +Elf_Half elf_core_extra_phdrs(struct coredump_params *cprm) +{ + int i; + struct core_vma_metadata *m; + int vma_count = 0; + + for_each_mte_vma(cprm, i, m) + vma_count++; + + return vma_count; +} + +int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) +{ + int i; + struct core_vma_metadata *m; + + for_each_mte_vma(cprm, i, m) { + struct elf_phdr phdr; + + phdr.p_type = PT_AARCH64_MEMTAG_MTE; + phdr.p_offset = offset; + phdr.p_vaddr = m->start; + phdr.p_paddr = 0; + phdr.p_filesz = mte_vma_tag_dump_size(m); + phdr.p_memsz = m->end - m->start; + offset += phdr.p_filesz; + phdr.p_flags = 0; + phdr.p_align = 0; + + if (!dump_emit(cprm, &phdr, sizeof(phdr))) + return 0; + } + + return 1; +} + +size_t elf_core_extra_data_size(struct coredump_params *cprm) +{ + int i; + struct core_vma_metadata *m; + size_t data_size = 0; + + for_each_mte_vma(cprm, i, m) + data_size += mte_vma_tag_dump_size(m); + + return data_size; +} + +int elf_core_write_extra_data(struct coredump_params *cprm) +{ + int i; + struct core_vma_metadata *m; + + for_each_mte_vma(cprm, i, m) { + if (!mte_dump_tag_range(cprm, m->start, m->dump_size)) + return 0; + } + + return 1; +} diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c new file mode 100644 index 000000000000..3625797e9ee8 --- /dev/null +++ b/arch/arm64/kernel/entry-common.c @@ -0,0 +1,1002 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Exception handling code + * + * Copyright (C) 2019 ARM Ltd. + */ + +#include <linux/context_tracking.h> +#include <linux/irq-entry-common.h> +#include <linux/kasan.h> +#include <linux/linkage.h> +#include <linux/livepatch.h> +#include <linux/lockdep.h> +#include <linux/ptrace.h> +#include <linux/resume_user_mode.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/thread_info.h> + +#include <asm/cpufeature.h> +#include <asm/daifflags.h> +#include <asm/esr.h> +#include <asm/exception.h> +#include <asm/irq_regs.h> +#include <asm/kprobes.h> +#include <asm/mmu.h> +#include <asm/processor.h> +#include <asm/sdei.h> +#include <asm/stacktrace.h> +#include <asm/sysreg.h> +#include <asm/system_misc.h> + +/* + * Handle IRQ/context state management when entering from kernel mode. + * Before this function is called it is not safe to call regular kernel code, + * instrumentable code, or any code which may trigger an exception. + */ +static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) +{ + irqentry_state_t state; + + state = irqentry_enter(regs); + mte_check_tfsr_entry(); + mte_disable_tco_entry(current); + + return state; +} + +/* + * Handle IRQ/context state management when exiting to kernel mode. + * After this function returns it is not safe to call regular kernel code, + * instrumentable code, or any code which may trigger an exception. + */ +static void noinstr exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) +{ + mte_check_tfsr_exit(); + irqentry_exit(regs, state); +} + +/* + * Handle IRQ/context state management when entering from user mode. 
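elf_core_write_extra_phdrs() above emits one PT_AARCH64_MEMTAG_MTE program header per tagged VMA. A small sketch that lists those headers from a core file; the PT_AARCH64_MEMTAG_MTE value (PT_LOPROC + 0x2) is defined locally in case the system elf.h lacks it, and a 64-bit core with e_phentsize equal to sizeof(Elf64_Phdr) is assumed:

#include <elf.h>
#include <stdio.h>

#ifndef PT_AARCH64_MEMTAG_MTE
#define PT_AARCH64_MEMTAG_MTE 0x70000002	/* assumed: PT_LOPROC + 0x2 */
#endif

int main(int argc, char **argv)
{
	Elf64_Ehdr ehdr;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <core file>\n", argv[0]);
		return 1;
	}
	f = fopen(argv[1], "rb");
	if (!f) {
		perror(argv[1]);
		return 1;
	}
	if (fread(&ehdr, sizeof(ehdr), 1, f) != 1 ||
	    fseek(f, (long)ehdr.e_phoff, SEEK_SET) != 0) {
		fprintf(stderr, "%s: not a readable ELF file\n", argv[1]);
		fclose(f);
		return 1;
	}

	for (int i = 0; i < ehdr.e_phnum; i++) {
		Elf64_Phdr phdr;

		if (fread(&phdr, sizeof(phdr), 1, f) != 1)
			break;
		/* p_vaddr/p_filesz/p_memsz match what the kernel fills in above. */
		if (phdr.p_type == PT_AARCH64_MEMTAG_MTE)
			printf("MTE tags for %#llx: %llu tag bytes covering %llu bytes\n",
			       (unsigned long long)phdr.p_vaddr,
			       (unsigned long long)phdr.p_filesz,
			       (unsigned long long)phdr.p_memsz);
	}
	fclose(f);
	return 0;
}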
+ * Before this function is called it is not safe to call regular kernel code, + * instrumentable code, or any code which may trigger an exception. + */ +static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + mte_disable_tco_entry(current); +} + +/* + * Handle IRQ/context state management when exiting to user mode. + * After this function returns it is not safe to call regular kernel code, + * instrumentable code, or any code which may trigger an exception. + */ + +static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) +{ + local_irq_disable(); + exit_to_user_mode_prepare_legacy(regs); + local_daif_mask(); + mte_check_tfsr_exit(); + exit_to_user_mode(); +} + +asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) +{ + arm64_exit_to_user_mode(regs); +} + +/* + * Handle IRQ/context state management when entering a debug exception from + * kernel mode. Before this function is called it is not safe to call regular + * kernel code, instrumentable code, or any code which may trigger an exception. + */ +static noinstr irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) +{ + irqentry_state_t state; + + state.lockdep = lockdep_hardirqs_enabled(); + + lockdep_hardirqs_off(CALLER_ADDR0); + ct_nmi_enter(); + + trace_hardirqs_off_finish(); + + return state; +} + +/* + * Handle IRQ/context state management when exiting a debug exception from + * kernel mode. After this function returns it is not safe to call regular + * kernel code, instrumentable code, or any code which may trigger an exception. + */ +static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, + irqentry_state_t state) +{ + if (state.lockdep) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + } + + ct_nmi_exit(); + if (state.lockdep) + lockdep_hardirqs_on(CALLER_ADDR0); +} + +static void do_interrupt_handler(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + if (on_thread_stack()) + call_on_irq_stack(regs, handler); + else + handler(regs); + + set_irq_regs(old_regs); +} + +extern void (*handle_arch_irq)(struct pt_regs *); +extern void (*handle_arch_fiq)(struct pt_regs *); + +static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, + unsigned long esr) +{ + irqentry_nmi_enter(regs); + + console_verbose(); + + pr_crit("Unhandled %s exception on CPU%d, ESR 0x%016lx -- %s\n", + vector, smp_processor_id(), esr, + esr_get_class_string(esr)); + + __show_regs(regs); + panic("Unhandled exception"); +} + +#define UNHANDLED(el, regsize, vector) \ +asmlinkage void noinstr el##_##regsize##_##vector##_handler(struct pt_regs *regs) \ +{ \ + const char *desc = #regsize "-bit " #el " " #vector; \ + __panic_unhandled(regs, desc, read_sysreg(esr_el1)); \ +} + +#ifdef CONFIG_ARM64_ERRATUM_1463225 +static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); + +static void cortex_a76_erratum_1463225_svc_handler(void) +{ + u64 reg, val; + + if (!unlikely(test_thread_flag(TIF_SINGLESTEP))) + return; + + if (!unlikely(this_cpu_has_cap(ARM64_WORKAROUND_1463225))) + return; + + __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 1); + reg = read_sysreg(mdscr_el1); + val = reg | MDSCR_EL1_SS | MDSCR_EL1_KDE; + write_sysreg(val, mdscr_el1); + asm volatile("msr daifclr, #8"); + isb(); + + /* We will have taken a single-step exception by this point */ + + write_sysreg(reg, mdscr_el1); + __this_cpu_write(__in_cortex_a76_erratum_1463225_wa, 0); +} + +static 
__always_inline bool +cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) +{ + if (!__this_cpu_read(__in_cortex_a76_erratum_1463225_wa)) + return false; + + /* + * We've taken a dummy step exception from the kernel to ensure + * that interrupts are re-enabled on the syscall path. Return back + * to cortex_a76_erratum_1463225_svc_handler() with debug exceptions + * masked so that we can safely restore the mdscr and get on with + * handling the syscall. + */ + regs->pstate |= PSR_D_BIT; + return true; +} +#else /* CONFIG_ARM64_ERRATUM_1463225 */ +static void cortex_a76_erratum_1463225_svc_handler(void) { } +static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_ARM64_ERRATUM_1463225 */ + +/* + * As per the ABI exit SME streaming mode and clear the SVE state not + * shared with FPSIMD on syscall entry. + */ +static inline void fpsimd_syscall_enter(void) +{ + /* Ensure PSTATE.SM is clear, but leave PSTATE.ZA as-is. */ + if (system_supports_sme()) + sme_smstop_sm(); + + /* + * The CPU is not in streaming mode. If non-streaming SVE is not + * supported, there is no SVE state that needs to be discarded. + */ + if (!system_supports_sve()) + return; + + if (test_thread_flag(TIF_SVE)) { + unsigned int sve_vq_minus_one; + + sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1; + sve_flush_live(true, sve_vq_minus_one); + } + + /* + * Any live non-FPSIMD SVE state has been zeroed. Allow + * fpsimd_save_user_state() to lazily discard SVE state until either + * the live state is unbound or fpsimd_syscall_exit() is called. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_FPSIMD); +} + +static __always_inline void fpsimd_syscall_exit(void) +{ + if (!system_supports_sve()) + return; + + /* + * The current task's user FPSIMD/SVE/SME state is now bound to this + * CPU. The fpsimd_last_state.to_save value is either: + * + * - FP_STATE_FPSIMD, if the state has not been reloaded on this CPU + * since fpsimd_syscall_enter(). + * + * - FP_STATE_CURRENT, if the state has been reloaded on this CPU at + * any point. + * + * Reset this to FP_STATE_CURRENT to stop lazy discarding. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_CURRENT); +} + +/* + * In debug exception context, we explicitly disable preemption despite + * having interrupts disabled. + * This serves two purposes: it makes it much less likely that we would + * accidentally schedule in exception context and it will force a warning + * if we somehow manage to schedule by accident. + */ +static void debug_exception_enter(struct pt_regs *regs) +{ + preempt_disable(); + + /* This code is a bit fragile. Test it. 
*/ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work"); +} +NOKPROBE_SYMBOL(debug_exception_enter); + +static void debug_exception_exit(struct pt_regs *regs) +{ + preempt_enable_no_resched(); +} +NOKPROBE_SYMBOL(debug_exception_exit); + +UNHANDLED(el1t, 64, sync) +UNHANDLED(el1t, 64, irq) +UNHANDLED(el1t, 64, fiq) +UNHANDLED(el1t, 64, error) + +static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_mem_abort(far, esr, regs); + local_daif_mask(); + exit_to_kernel_mode(regs, state); +} + +static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_sp_pc_abort(far, esr, regs); + local_daif_mask(); + exit_to_kernel_mode(regs, state); +} + +static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_undef(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs, state); +} + +static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_bti(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs, state); +} + +static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_gcs(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs, state); +} + +static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_mops(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs, state); +} + +static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + debug_exception_enter(regs); + do_breakpoint(esr, regs); + debug_exception_exit(regs); + arm64_exit_el1_dbg(regs, state); +} + +static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + if (!cortex_a76_erratum_1463225_debug_handler(regs)) { + debug_exception_enter(regs); + /* + * After handling a breakpoint, we suspend the breakpoint + * and use single-step to move to the next instruction. + * If we are stepping a suspended breakpoint there's nothing more to do: + * the single-step is complete. 
+ */ + if (!try_step_suspended_breakpoints(regs)) + do_el1_softstep(esr, regs); + debug_exception_exit(regs); + } + arm64_exit_el1_dbg(regs, state); +} + +static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr) +{ + /* Watchpoints are the only debug exception to write FAR_EL1 */ + unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + debug_exception_enter(regs); + do_watchpoint(far, esr, regs); + debug_exception_exit(regs); + arm64_exit_el1_dbg(regs, state); +} + +static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); + debug_exception_enter(regs); + do_el1_brk64(esr, regs); + debug_exception_exit(regs); + arm64_exit_el1_dbg(regs, state); +} + +static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) +{ + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_fpac(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs, state); +} + +asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_DABT_CUR: + case ESR_ELx_EC_IABT_CUR: + el1_abort(regs, esr); + break; + /* + * We don't handle ESR_ELx_EC_SP_ALIGN, since we will have hit a + * recursive exception when trying to push the initial pt_regs. + */ + case ESR_ELx_EC_PC_ALIGN: + el1_pc(regs, esr); + break; + case ESR_ELx_EC_SYS64: + case ESR_ELx_EC_UNKNOWN: + el1_undef(regs, esr); + break; + case ESR_ELx_EC_BTI: + el1_bti(regs, esr); + break; + case ESR_ELx_EC_GCS: + el1_gcs(regs, esr); + break; + case ESR_ELx_EC_MOPS: + el1_mops(regs, esr); + break; + case ESR_ELx_EC_BREAKPT_CUR: + el1_breakpt(regs, esr); + break; + case ESR_ELx_EC_SOFTSTP_CUR: + el1_softstp(regs, esr); + break; + case ESR_ELx_EC_WATCHPT_CUR: + el1_watchpt(regs, esr); + break; + case ESR_ELx_EC_BRK64: + el1_brk64(regs, esr); + break; + case ESR_ELx_EC_FPAC: + el1_fpac(regs, esr); + break; + default: + __panic_unhandled(regs, "64-bit el1h sync", esr); + } +} + +static __always_inline void __el1_pnmi(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + irqentry_state_t state; + + state = irqentry_nmi_enter(regs); + do_interrupt_handler(regs, handler); + irqentry_nmi_exit(regs, state); +} + +static __always_inline void __el1_irq(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); + + irq_enter_rcu(); + do_interrupt_handler(regs, handler); + irq_exit_rcu(); + + exit_to_kernel_mode(regs, state); +} +static void noinstr el1_interrupt(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + write_sysreg(DAIF_PROCCTX_NOIRQ, daif); + + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && regs_irqs_disabled(regs)) + __el1_pnmi(regs, handler); + else + __el1_irq(regs, handler); +} + +asmlinkage void noinstr el1h_64_irq_handler(struct pt_regs *regs) +{ + el1_interrupt(regs, handle_arch_irq); +} + +asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) +{ + el1_interrupt(regs, handle_arch_fiq); +} + +asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + irqentry_state_t state; + + local_daif_restore(DAIF_ERRCTX); + state = irqentry_nmi_enter(regs); + do_serror(regs, esr); + irqentry_nmi_exit(regs, state); +} + +static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = 
read_sysreg(far_el1); + + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_mem_abort(far, esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = read_sysreg(far_el1); + + /* + * We've taken an instruction abort from userspace and not yet + * re-enabled IRQs. If the address is a kernel address, apply + * BP hardening prior to enabling IRQs and pre-emption. + */ + if (!is_ttbr0_addr(far)) + arm64_apply_bp_hardening(); + + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_mem_abort(far, esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_fpsimd_acc(esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sve_acc(esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sme_acc(esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_fpsimd_exc(esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_sys(esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) +{ + unsigned long far = read_sysreg(far_el1); + + if (!is_ttbr0_addr(instruction_pointer(regs))) + arm64_apply_bp_hardening(); + + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sp_pc_abort(far, esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_sp_pc_abort(regs->sp, esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_undef(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_undef(regs, esr); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_bti(struct pt_regs *regs) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_bti(regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_mops(regs, esr); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_gcs(regs, esr); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + bad_el0_sync(regs, 0, esr); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr) +{ + if (!is_ttbr0_addr(regs->pc)) + arm64_apply_bp_hardening(); + + arm64_enter_from_user_mode(regs); + debug_exception_enter(regs); + 
do_breakpoint(esr, regs); + debug_exception_exit(regs); + local_daif_restore(DAIF_PROCCTX); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) +{ + bool step_done; + + if (!is_ttbr0_addr(regs->pc)) + arm64_apply_bp_hardening(); + + arm64_enter_from_user_mode(regs); + /* + * After handling a breakpoint, we suspend the breakpoint + * and use single-step to move to the next instruction. + * If we are stepping a suspended breakpoint there's nothing more to do: + * the single-step is complete. + */ + step_done = try_step_suspended_breakpoints(regs); + local_daif_restore(DAIF_PROCCTX); + if (!step_done) + do_el0_softstep(esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr) +{ + /* Watchpoints are the only debug exception to write FAR_EL1 */ + unsigned long far = read_sysreg(far_el1); + + arm64_enter_from_user_mode(regs); + debug_exception_enter(regs); + do_watchpoint(far, esr, regs); + debug_exception_exit(regs); + local_daif_restore(DAIF_PROCCTX); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_brk64(esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_svc(struct pt_regs *regs) +{ + arm64_enter_from_user_mode(regs); + cortex_a76_erratum_1463225_svc_handler(); + fpsimd_syscall_enter(); + local_daif_restore(DAIF_PROCCTX); + do_el0_svc(regs); + arm64_exit_to_user_mode(regs); + fpsimd_syscall_exit(); +} + +static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_fpac(regs, esr); + arm64_exit_to_user_mode(regs); +} + +asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_SVC64: + el0_svc(regs); + break; + case ESR_ELx_EC_DABT_LOW: + el0_da(regs, esr); + break; + case ESR_ELx_EC_IABT_LOW: + el0_ia(regs, esr); + break; + case ESR_ELx_EC_FP_ASIMD: + el0_fpsimd_acc(regs, esr); + break; + case ESR_ELx_EC_SVE: + el0_sve_acc(regs, esr); + break; + case ESR_ELx_EC_SME: + el0_sme_acc(regs, esr); + break; + case ESR_ELx_EC_FP_EXC64: + el0_fpsimd_exc(regs, esr); + break; + case ESR_ELx_EC_SYS64: + case ESR_ELx_EC_WFx: + el0_sys(regs, esr); + break; + case ESR_ELx_EC_SP_ALIGN: + el0_sp(regs, esr); + break; + case ESR_ELx_EC_PC_ALIGN: + el0_pc(regs, esr); + break; + case ESR_ELx_EC_UNKNOWN: + el0_undef(regs, esr); + break; + case ESR_ELx_EC_BTI: + el0_bti(regs); + break; + case ESR_ELx_EC_MOPS: + el0_mops(regs, esr); + break; + case ESR_ELx_EC_GCS: + el0_gcs(regs, esr); + break; + case ESR_ELx_EC_BREAKPT_LOW: + el0_breakpt(regs, esr); + break; + case ESR_ELx_EC_SOFTSTP_LOW: + el0_softstp(regs, esr); + break; + case ESR_ELx_EC_WATCHPT_LOW: + el0_watchpt(regs, esr); + break; + case ESR_ELx_EC_BRK64: + el0_brk64(regs, esr); + break; + case ESR_ELx_EC_FPAC: + el0_fpac(regs, esr); + break; + default: + el0_inv(regs, esr); + } +} + +static void noinstr el0_interrupt(struct pt_regs *regs, + void (*handler)(struct pt_regs *)) +{ + arm64_enter_from_user_mode(regs); + + write_sysreg(DAIF_PROCCTX_NOIRQ, daif); + + if (regs->pc & BIT(55)) + arm64_apply_bp_hardening(); + + irq_enter_rcu(); + do_interrupt_handler(regs, handler); + irq_exit_rcu(); + + arm64_exit_to_user_mode(regs); +} + +static void noinstr 
__el0_irq_handler_common(struct pt_regs *regs) +{ + el0_interrupt(regs, handle_arch_irq); +} + +asmlinkage void noinstr el0t_64_irq_handler(struct pt_regs *regs) +{ + __el0_irq_handler_common(regs); +} + +static void noinstr __el0_fiq_handler_common(struct pt_regs *regs) +{ + el0_interrupt(regs, handle_arch_fiq); +} + +asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) +{ + __el0_fiq_handler_common(regs); +} + +static void noinstr __el0_error_handler_common(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + irqentry_state_t state; + + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_ERRCTX); + state = irqentry_nmi_enter(regs); + do_serror(regs, esr); + irqentry_nmi_exit(regs, state); + local_daif_restore(DAIF_PROCCTX); + arm64_exit_to_user_mode(regs); +} + +asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) +{ + __el0_error_handler_common(regs); +} + +#ifdef CONFIG_COMPAT +static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_cp15(esr, regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_svc_compat(struct pt_regs *regs) +{ + arm64_enter_from_user_mode(regs); + cortex_a76_erratum_1463225_svc_handler(); + local_daif_restore(DAIF_PROCCTX); + do_el0_svc_compat(regs); + arm64_exit_to_user_mode(regs); +} + +static void noinstr el0_bkpt32(struct pt_regs *regs, unsigned long esr) +{ + arm64_enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_bkpt32(esr, regs); + arm64_exit_to_user_mode(regs); +} + +asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + + switch (ESR_ELx_EC(esr)) { + case ESR_ELx_EC_SVC32: + el0_svc_compat(regs); + break; + case ESR_ELx_EC_DABT_LOW: + el0_da(regs, esr); + break; + case ESR_ELx_EC_IABT_LOW: + el0_ia(regs, esr); + break; + case ESR_ELx_EC_FP_ASIMD: + el0_fpsimd_acc(regs, esr); + break; + case ESR_ELx_EC_FP_EXC32: + el0_fpsimd_exc(regs, esr); + break; + case ESR_ELx_EC_PC_ALIGN: + el0_pc(regs, esr); + break; + case ESR_ELx_EC_UNKNOWN: + case ESR_ELx_EC_CP14_MR: + case ESR_ELx_EC_CP14_LS: + case ESR_ELx_EC_CP14_64: + el0_undef(regs, esr); + break; + case ESR_ELx_EC_CP15_32: + case ESR_ELx_EC_CP15_64: + el0_cp15(regs, esr); + break; + case ESR_ELx_EC_BREAKPT_LOW: + el0_breakpt(regs, esr); + break; + case ESR_ELx_EC_SOFTSTP_LOW: + el0_softstp(regs, esr); + break; + case ESR_ELx_EC_WATCHPT_LOW: + el0_watchpt(regs, esr); + break; + case ESR_ELx_EC_BKPT32: + el0_bkpt32(regs, esr); + break; + default: + el0_inv(regs, esr); + } +} + +asmlinkage void noinstr el0t_32_irq_handler(struct pt_regs *regs) +{ + __el0_irq_handler_common(regs); +} + +asmlinkage void noinstr el0t_32_fiq_handler(struct pt_regs *regs) +{ + __el0_fiq_handler_common(regs); +} + +asmlinkage void noinstr el0t_32_error_handler(struct pt_regs *regs) +{ + __el0_error_handler_common(regs); +} +#else /* CONFIG_COMPAT */ +UNHANDLED(el0t, 32, sync) +UNHANDLED(el0t, 32, irq) +UNHANDLED(el0t, 32, fiq) +UNHANDLED(el0t, 32, error) +#endif /* CONFIG_COMPAT */ + +asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) +{ + unsigned long esr = read_sysreg(esr_el1); + unsigned long far = read_sysreg(far_el1); + + irqentry_nmi_enter(regs); + panic_bad_stack(regs, esr, far); +} + +#ifdef CONFIG_ARM_SDE_INTERFACE +asmlinkage noinstr unsigned long +__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) +{ + irqentry_state_t state; + 
unsigned long ret; + + /* + * We didn't take an exception to get here, so the HW hasn't + * set/cleared bits in PSTATE that we may rely on. + * + * The original SDEI spec (ARM DEN 0054A) can be read ambiguously as to + * whether PSTATE bits are inherited unchanged or generated from + * scratch, and the TF-A implementation always clears PAN and always + * clears UAO. There are no other known implementations. + * + * Subsequent revisions (ARM DEN 0054B) follow the usual rules for how + * PSTATE is modified upon architectural exceptions, and so PAN is + * either inherited or set per SCTLR_ELx.SPAN, and UAO is always + * cleared. + * + * We must explicitly reset PAN to the expected state, including + * clearing it when the host isn't using it, in case a VM had it set. + */ + if (system_uses_hw_pan()) + set_pstate_pan(1); + else if (cpu_has_pan()) + set_pstate_pan(0); + + state = irqentry_nmi_enter(regs); + ret = do_sdei_event(regs, arg); + irqentry_nmi_exit(regs, state); + + return ret; +} +#endif /* CONFIG_ARM_SDE_INTERFACE */ diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 12d4958e6429..6325db1a2179 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -1,20 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * FP/SIMD state saving and restoring * * Copyright (C) 2012 ARM Ltd. * Author: Catalin Marinas <catalin.marinas@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/linkage.h> @@ -27,34 +16,119 @@ * * x0 - pointer to struct fpsimd_state */ -ENTRY(fpsimd_save_state) +SYM_FUNC_START(fpsimd_save_state) fpsimd_save x0, 8 ret -ENDPROC(fpsimd_save_state) +SYM_FUNC_END(fpsimd_save_state) /* * Load the FP registers. * * x0 - pointer to struct fpsimd_state */ -ENTRY(fpsimd_load_state) +SYM_FUNC_START(fpsimd_load_state) fpsimd_restore x0, 8 ret -ENDPROC(fpsimd_load_state) +SYM_FUNC_END(fpsimd_load_state) #ifdef CONFIG_ARM64_SVE -ENTRY(sve_save_state) - sve_save 0, x1, 2 + +/* + * Save the SVE state + * + * x0 - pointer to buffer for state + * x1 - pointer to storage for FPSR + * x2 - Save FFR if non-zero + */ +SYM_FUNC_START(sve_save_state) + sve_save 0, x1, x2, 3 ret -ENDPROC(sve_save_state) +SYM_FUNC_END(sve_save_state) -ENTRY(sve_load_state) - sve_load 0, x1, x2, 3, x4 +/* + * Load the SVE state + * + * x0 - pointer to buffer for state + * x1 - pointer to storage for FPSR + * x2 - Restore FFR if non-zero + */ +SYM_FUNC_START(sve_load_state) + sve_load 0, x1, x2, 4 ret -ENDPROC(sve_load_state) +SYM_FUNC_END(sve_load_state) -ENTRY(sve_get_vl) +SYM_FUNC_START(sve_get_vl) _sve_rdvl 0, 1 ret -ENDPROC(sve_get_vl) +SYM_FUNC_END(sve_get_vl) + +SYM_FUNC_START(sve_set_vq) + sve_load_vq x0, x1, x2 + ret +SYM_FUNC_END(sve_set_vq) + +/* + * Zero all SVE registers but the first 128-bits of each vector + * + * VQ must already be configured by caller, any further updates of VQ + * will need to ensure that the register state remains valid. + * + * x0 = include FFR? 
+ * x1 = VQ - 1 + */ +SYM_FUNC_START(sve_flush_live) + cbz x1, 1f // A VQ-1 of 0 is 128 bits so no extra Z state + sve_flush_z +1: sve_flush_p + tbz x0, #0, 2f + sve_flush_ffr +2: ret +SYM_FUNC_END(sve_flush_live) + #endif /* CONFIG_ARM64_SVE */ + +#ifdef CONFIG_ARM64_SME + +SYM_FUNC_START(sme_get_vl) + _sme_rdsvl 0, 1 + ret +SYM_FUNC_END(sme_get_vl) + +SYM_FUNC_START(sme_set_vq) + sme_load_vq x0, x1, x2 + ret +SYM_FUNC_END(sme_set_vq) + +/* + * Save the ZA and ZT state + * + * x0 - pointer to buffer for state + * x1 - number of ZT registers to save + */ +SYM_FUNC_START(sme_save_state) + _sme_rdsvl 2, 1 // x2 = VL/8 + sme_save_za 0, x2, 12 // Leaves x0 pointing to the end of ZA + + cbz x1, 1f + _str_zt 0 +1: + ret +SYM_FUNC_END(sme_save_state) + +/* + * Load the ZA and ZT state + * + * x0 - pointer to buffer for state + * x1 - number of ZT registers to save + */ +SYM_FUNC_START(sme_load_state) + _sme_rdsvl 2, 1 // x2 = VL/8 + sme_load_za 0, x2, 12 // Leaves x0 pointing to the end of ZA + + cbz x1, 1f + _ldr_zt 0 +1: + ret +SYM_FUNC_END(sme_load_state) + +#endif /* CONFIG_ARM64_SME */ diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 81b8eb5c4633..025140caafe7 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -1,19 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * arch/arm64/kernel/entry-ftrace.S * * Copyright (C) 2013 Linaro Limited * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/linkage.h> +#include <linux/cfi_types.h> +#include <asm/asm-offsets.h> #include <asm/assembler.h> #include <asm/ftrace.h> #include <asm/insn.h> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS +/* + * Due to -fpatchable-function-entry=2, the compiler has placed two NOPs before + * the regular function prologue. For an enabled callsite, ftrace_init_nop() and + * ftrace_make_call() have patched those NOPs to: + * + * MOV X9, LR + * BL ftrace_caller + * + * Each instrumented function follows the AAPCS, so here x0-x8 and x18-x30 are + * live (x18 holds the Shadow Call Stack pointer), and x9-x17 are safe to + * clobber. + * + * We save the callsite's context into a struct ftrace_regs before invoking any + * ftrace callbacks. So that we can get a sensible backtrace, we create frame + * records for the callsite and the ftrace entry assembly. This is not + * sufficient for reliable stacktrace: until we create the callsite stack + * record, its caller is missing from the LR and existing chain of frame + * records. + */ +SYM_CODE_START(ftrace_caller) + bti c + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + /* + * The literal pointer to the ops is at an 8-byte aligned boundary + * which is either 12 or 16 bytes before the BL instruction in the call + * site. See ftrace_call_adjust() for details. + * + * Therefore here the LR points at `literal + 16` or `literal + 20`, + * and we can find the address of the literal in either case by + * aligning to an 8-byte boundary and subtracting 16. We do the + * alignment first as this allows us to fold the subtraction into the + * LDR. + */ + bic x11, x30, 0x7 + ldr x11, [x11, #-(4 * AARCH64_INSN_SIZE)] // op + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * If the op has a direct call, handle it immediately without + * saving/restoring registers. 
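The align-then-subtract trick in the CALL_OPS block above can be checked in isolation: with the ops literal 8-byte aligned and the LR sitting at either literal + 16 or literal + 20, clearing the low three bits of the LR and then stepping back 16 bytes always lands on the literal. A minimal sketch under exactly those assumptions (the addresses and the helper name are illustrative):

#include <assert.h>
#include <stdint.h>

/* Mirrors the "bic x11, x30, 0x7" + "ldr x11, [x11, #-16]" address math. */
static uintptr_t ftrace_ops_literal(uintptr_t lr)
{
	return (lr & ~(uintptr_t)0x7) - 16;
}

int main(void)
{
	uintptr_t literal = 0x1000;	/* 8-byte aligned, as the comment requires */

	assert(ftrace_ops_literal(literal + 16) == literal);	/* literal 12 bytes before the BL */
	assert(ftrace_ops_literal(literal + 20) == literal);	/* literal 16 bytes before the BL */
	return 0;
}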
+ */ + ldr x17, [x11, #FTRACE_OPS_DIRECT_CALL] // op->direct_call + cbnz x17, ftrace_caller_direct +#endif +#endif + + /* Save original SP */ + mov x10, sp + + /* Make room for ftrace regs, plus two frame records */ + sub sp, sp, #(FREGS_SIZE + 32) + + /* Save function arguments */ + stp x0, x1, [sp, #FREGS_X0] + stp x2, x3, [sp, #FREGS_X2] + stp x4, x5, [sp, #FREGS_X4] + stp x6, x7, [sp, #FREGS_X6] + str x8, [sp, #FREGS_X8] + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + str xzr, [sp, #FREGS_DIRECT_TRAMP] +#endif + + /* Save the callsite's FP, LR, SP */ + str x29, [sp, #FREGS_FP] + str x9, [sp, #FREGS_LR] + str x10, [sp, #FREGS_SP] + + /* Save the PC after the ftrace callsite */ + str x30, [sp, #FREGS_PC] + + /* Create a frame record for the callsite above the ftrace regs */ + stp x29, x9, [sp, #FREGS_SIZE + 16] + add x29, sp, #FREGS_SIZE + 16 + + /* Create our frame record above the ftrace regs */ + stp x29, x30, [sp, #FREGS_SIZE] + add x29, sp, #FREGS_SIZE + + /* Prepare arguments for the tracer func */ + sub x0, x30, #AARCH64_INSN_SIZE // ip (callsite's BL insn) + mov x1, x9 // parent_ip (callsite's LR) + mov x3, sp // regs + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + mov x2, x11 // op + ldr x4, [x2, #FTRACE_OPS_FUNC] // op->func + blr x4 // op->func(ip, parent_ip, op, regs) + +#else + ldr_l x2, function_trace_op // op + +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) + bl ftrace_stub // func(ip, parent_ip, op, regs) +#endif + +/* + * At the callsite x0-x8 and x19-x30 were live. Any C code will have preserved + * x19-x29 per the AAPCS, and we created frame records upon entry, so we need + * to restore x0-x8, x29, and x30. + */ + /* Restore function arguments */ + ldp x0, x1, [sp, #FREGS_X0] + ldp x2, x3, [sp, #FREGS_X2] + ldp x4, x5, [sp, #FREGS_X4] + ldp x6, x7, [sp, #FREGS_X6] + ldr x8, [sp, #FREGS_X8] + + /* Restore the callsite's FP */ + ldr x29, [sp, #FREGS_FP] + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + ldr x17, [sp, #FREGS_DIRECT_TRAMP] + cbnz x17, ftrace_caller_direct_late +#endif + + /* Restore the callsite's LR and PC */ + ldr x30, [sp, #FREGS_LR] + ldr x9, [sp, #FREGS_PC] + + /* Restore the callsite's SP */ + add sp, sp, #FREGS_SIZE + 32 + + ret x9 + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_INNER_LABEL(ftrace_caller_direct_late, SYM_L_LOCAL) + /* + * Head to a direct trampoline in x17 after having run other tracers. + * The ftrace_regs are live, and x0-x8 and FP have been restored. The + * LR, PC, and SP have not been restored. + */ + + /* + * Restore the callsite's LR and PC matching the trampoline calling + * convention. + */ + ldr x9, [sp, #FREGS_LR] + ldr x30, [sp, #FREGS_PC] + + /* Restore the callsite's SP */ + add sp, sp, #FREGS_SIZE + 32 + +SYM_INNER_LABEL(ftrace_caller_direct, SYM_L_LOCAL) + /* + * Head to a direct trampoline in x17. + * + * We use `BR X17` as this can safely land on a `BTI C` or `PACIASP` in + * the trampoline, and will not unbalance any return stack. 
+ */ + br x17 +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ +SYM_CODE_END(ftrace_caller) + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_CODE_START(ftrace_stub_direct_tramp) + bti c + mov x10, x30 + mov x30, x9 + ret x10 +SYM_CODE_END(ftrace_stub_direct_tramp) +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +#else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ + /* * Gcc with -pg will put the following code in the beginning of each function: * mov x0, x30 @@ -86,53 +252,15 @@ add \reg, \reg, #8 .endm -#ifndef CONFIG_DYNAMIC_FTRACE -/* - * void _mcount(unsigned long return_address) - * @return_address: return address to instrumented function - * - * This function makes calls, if enabled, to: - * - tracer function to probe instrumented function's entry, - * - ftrace_graph_caller to set up an exit hook - */ -ENTRY(_mcount) - mcount_enter - - ldr_l x2, ftrace_trace_function - adr x0, ftrace_stub - cmp x0, x2 // if (ftrace_trace_function - b.eq skip_ftrace_call // != ftrace_stub) { - - mcount_get_pc x0 // function's pc - mcount_get_lr x1 // function's lr (= parent's pc) - blr x2 // (*ftrace_trace_function)(pc, lr); - -skip_ftrace_call: // } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - ldr_l x2, ftrace_graph_return - cmp x0, x2 // if ((ftrace_graph_return - b.ne ftrace_graph_caller // != ftrace_stub) - - ldr_l x2, ftrace_graph_entry // || (ftrace_graph_entry - adr_l x0, ftrace_graph_entry_stub // != ftrace_graph_entry_stub)) - cmp x0, x2 - b.ne ftrace_graph_caller // ftrace_graph_caller(); -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - mcount_exit -ENDPROC(_mcount) -EXPORT_SYMBOL(_mcount) -NOKPROBE(_mcount) - -#else /* CONFIG_DYNAMIC_FTRACE */ /* * _mcount() is used to build the kernel with -pg option, but all the branch * instructions to _mcount() are replaced to NOP initially at kernel start up, * and later on, NOP to branch to ftrace_caller() when enabled or branch to * NOP when disabled per-function base. */ -ENTRY(_mcount) +SYM_FUNC_START(_mcount) ret -ENDPROC(_mcount) +SYM_FUNC_END(_mcount) EXPORT_SYMBOL(_mcount) NOKPROBE(_mcount) @@ -145,29 +273,24 @@ NOKPROBE(_mcount) * - tracer function to probe instrumented function's entry, * - ftrace_graph_caller to set up an exit hook */ -ENTRY(ftrace_caller) +SYM_FUNC_START(ftrace_caller) mcount_enter mcount_get_pc0 x0 // function's pc mcount_get_lr x1 // function's lr -GLOBAL(ftrace_call) // tracer(pc, lr); +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) // tracer(pc, lr); nop // This will be replaced with "bl xxx" // where xxx can be any kind of tracer. #ifdef CONFIG_FUNCTION_GRAPH_TRACER -GLOBAL(ftrace_graph_call) // ftrace_graph_caller(); +SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) // ftrace_graph_caller(); nop // If enabled, this will be replaced // "b ftrace_graph_caller" #endif mcount_exit -ENDPROC(ftrace_caller) -#endif /* CONFIG_DYNAMIC_FTRACE */ - -ENTRY(ftrace_stub) - ret -ENDPROC(ftrace_stub) +SYM_FUNC_END(ftrace_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* @@ -179,14 +302,25 @@ ENDPROC(ftrace_stub) * the call stack in order to intercept instrumented function's return path * and run return_to_handler() later on its exit. 
*/ -ENTRY(ftrace_graph_caller) +SYM_FUNC_START(ftrace_graph_caller) mcount_get_pc x0 // function's pc mcount_get_lr_addr x1 // pointer to function's saved lr mcount_get_parent_fp x2 // parent's fp bl prepare_ftrace_return // prepare_ftrace_return(pc, &lr, fp) mcount_exit -ENDPROC(ftrace_graph_caller) +SYM_FUNC_END(ftrace_graph_caller) +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ + +SYM_TYPED_FUNC_START(ftrace_stub) + ret +SYM_FUNC_END(ftrace_stub) + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +SYM_TYPED_FUNC_START(ftrace_stub_graph) + ret +SYM_FUNC_END(ftrace_stub_graph) /* * void return_to_handler(void) @@ -194,25 +328,30 @@ ENDPROC(ftrace_graph_caller) * Run ftrace_return_to_handler() before going back to parent. * @fp is checked against the value passed by ftrace_graph_caller(). */ -ENTRY(return_to_handler) - /* save return value regs */ - sub sp, sp, #64 - stp x0, x1, [sp] - stp x2, x3, [sp, #16] - stp x4, x5, [sp, #32] - stp x6, x7, [sp, #48] - - mov x0, x29 // parent's fp - bl ftrace_return_to_handler// addr = ftrace_return_to_hander(fp); - mov x30, x0 // restore the original return address - - /* restore return value regs */ - ldp x0, x1, [sp] - ldp x2, x3, [sp, #16] - ldp x4, x5, [sp, #32] - ldp x6, x7, [sp, #48] - add sp, sp, #64 +SYM_CODE_START(return_to_handler) + /* Make room for ftrace_regs */ + sub sp, sp, #FREGS_SIZE + + /* Save return value regs */ + stp x0, x1, [sp, #FREGS_X0] + stp x2, x3, [sp, #FREGS_X2] + stp x4, x5, [sp, #FREGS_X4] + stp x6, x7, [sp, #FREGS_X6] + + /* Save the callsite's FP */ + str x29, [sp, #FREGS_FP] + + mov x0, sp + bl ftrace_return_to_handler // addr = ftrace_return_to_hander(fregs); + mov x30, x0 // restore the original return address + + /* Restore return value regs */ + ldp x0, x1, [sp, #FREGS_X0] + ldp x2, x3, [sp, #FREGS_X2] + ldp x4, x5, [sp, #FREGS_X4] + ldp x6, x7, [sp, #FREGS_X6] + add sp, sp, #FREGS_SIZE ret -END(return_to_handler) +SYM_CODE_END(return_to_handler) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 0ec0c46b2c0c..f8018b5c1f9a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -1,21 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Low-level exception handling code * * Copyright (C) 2012 ARM Ltd. * Authors: Catalin Marinas <catalin.marinas@arm.com> * Will Deacon <will.deacon@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/arm-smccc.h> @@ -25,6 +14,8 @@ #include <asm/alternative.h> #include <asm/assembler.h> #include <asm/asm-offsets.h> +#include <asm/asm_pointer_auth.h> +#include <asm/bug.h> #include <asm/cpufeature.h> #include <asm/errno.h> #include <asm/esr.h> @@ -33,68 +24,48 @@ #include <asm/mmu.h> #include <asm/processor.h> #include <asm/ptrace.h> +#include <asm/scs.h> +#include <asm/stacktrace/frame.h> #include <asm/thread_info.h> #include <asm/asm-uaccess.h> #include <asm/unistd.h> -/* - * Context tracking subsystem. 
Used to instrument transitions - * between user and kernel mode. - */ - .macro ct_user_exit -#ifdef CONFIG_CONTEXT_TRACKING - bl context_tracking_user_exit -#endif - .endm - - .macro ct_user_enter -#ifdef CONFIG_CONTEXT_TRACKING - bl context_tracking_user_enter -#endif - .endm - .macro clear_gp_regs .irp n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 mov x\n, xzr .endr .endm -/* - * Bad Abort numbers - *----------------- - */ -#define BAD_SYNC 0 -#define BAD_IRQ 1 -#define BAD_FIQ 2 -#define BAD_ERROR 3 - - .macro kernel_ventry, el, label, regsize = 64 + .macro kernel_ventry, el:req, ht:req, regsize:req, label:req .align 7 -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -alternative_if ARM64_UNMAP_KERNEL_AT_EL0 +.Lventry_start\@: .if \el == 0 + /* + * This must be the first instruction of the EL0 vector entries. It is + * skipped by the trampoline vectors, to trigger the cleanup. + */ + b .Lskip_tramp_vectors_cleanup\@ .if \regsize == 64 mrs x30, tpidrro_el0 msr tpidrro_el0, xzr .else mov x30, xzr .endif +.Lskip_tramp_vectors_cleanup\@: .endif -alternative_else_nop_endif -#endif - sub sp, sp, #S_FRAME_SIZE -#ifdef CONFIG_VMAP_STACK + sub sp, sp, #PT_REGS_SIZE /* * Test whether the SP has overflowed, without corrupting a GPR. - * Task and IRQ stacks are aligned to (1 << THREAD_SHIFT). + * Task and IRQ stacks are aligned so that SP & (1 << THREAD_SHIFT) + * should always be zero. */ add sp, sp, x0 // sp' = sp + x0 sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp tbnz x0, #THREAD_SHIFT, 0f sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp - b el\()\el\()_\label + b el\el\ht\()_\regsize\()_\label 0: /* @@ -103,7 +74,7 @@ alternative_else_nop_endif * userspace, and can clobber EL0 registers to free up GPRs. */ - /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + /* Stash the original SP (minus PT_REGS_SIZE) in tpidr_el0. */ msr tpidr_el0, x0 /* Recover the original x0 value and stash it in tpidrro_el0 */ @@ -125,21 +96,24 @@ alternative_else_nop_endif /* We were already on the overflow stack. Restore sp/x0 and carry on. */ sub sp, sp, x0 mrs x0, tpidrro_el0 -#endif - b el\()\el\()_\label + b el\el\ht\()_\regsize\()_\label +.org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm - .macro tramp_alias, dst, sym - mov_q \dst, TRAMP_VALIAS - add \dst, \dst, #(\sym - .entry.tramp.text) + .macro tramp_alias, dst, sym + .set .Lalias\@, TRAMP_VALIAS + \sym - .entry.tramp.text + movz \dst, :abs_g2_s:.Lalias\@ + movk \dst, :abs_g1_nc:.Lalias\@ + movk \dst, :abs_g0_nc:.Lalias\@ .endm - // This macro corrupts x0-x3. It is the caller's duty - // to save/restore them if required. + /* + * This macro corrupts x0-x3. It is the caller's duty to save/restore + * them if required. 
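The add/sub/tbnz sequence in kernel_ventry above boils down to a single bit test on sp - PT_REGS_SIZE: the stacks are THREAD_SIZE (1 << THREAD_SHIFT) bytes and aligned so that a valid SP never has bit THREAD_SHIFT set, so that bit becomes set as soon as the new pt_regs frame would dip below the stack base. A minimal sketch of the check, assuming 16K stacks aligned to 2 * THREAD_SIZE (as with vmap'd stacks); PT_REGS_SIZE here is an illustrative value:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define THREAD_SHIFT	14UL			/* assumed: 16K kernel stacks */
#define THREAD_SIZE	(1UL << THREAD_SHIFT)
#define PT_REGS_SIZE	0x150UL			/* illustrative only */

/* Mirrors "tbnz x0, #THREAD_SHIFT, 0f" taken on sp after sp -= PT_REGS_SIZE. */
static bool stack_overflowed(uintptr_t entry_sp)
{
	return (entry_sp - PT_REGS_SIZE) & THREAD_SIZE;
}

int main(void)
{
	uintptr_t base = 0x100000000UL;		/* aligned to 2 * THREAD_SIZE */

	assert(!stack_overflowed(base + THREAD_SIZE));		/* empty stack: plenty of room */
	assert(!stack_overflowed(base + PT_REGS_SIZE + 16));	/* just enough room for pt_regs */
	assert(stack_overflowed(base + PT_REGS_SIZE - 16));	/* frame would run past the base */
	return 0;
}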
+ */ .macro apply_ssbd, state, tmp1, tmp2 -#ifdef CONFIG_ARM64_SSBD -alternative_cb arm64_enable_wa2_handling - b .L__asm_ssbd_skip\@ +alternative_cb ARM64_ALWAYS_SYSTEM, spectre_v4_patch_fw_mitigation_enable + b .L__asm_ssbd_skip\@ // Patched to NOP alternative_cb_end ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1 cbz \tmp2, .L__asm_ssbd_skip\@ @@ -147,14 +121,83 @@ alternative_cb_end tbnz \tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@ mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2 mov w1, #\state -alternative_cb arm64_update_smccc_conduit +alternative_cb ARM64_ALWAYS_SYSTEM, smccc_patch_fw_mitigation_conduit nop // Patched to SMC/HVC #0 alternative_cb_end .L__asm_ssbd_skip\@: + .endm + + /* Check for MTE asynchronous tag check faults */ + .macro check_mte_async_tcf, tmp, ti_flags, thread_sctlr +#ifdef CONFIG_ARM64_MTE + .arch_extension lse +alternative_if_not ARM64_MTE + b 1f +alternative_else_nop_endif + /* + * Asynchronous tag check faults are only possible in ASYNC (2) or + * ASYM (3) modes. In each of these modes bit 1 of SCTLR_EL1.TCF0 is + * set, so skip the check if it is unset. + */ + tbz \thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f + mrs_s \tmp, SYS_TFSRE0_EL1 + tbz \tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f + /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */ + mov \tmp, #_TIF_MTE_ASYNC_FAULT + add \ti_flags, tsk, #TSK_TI_FLAGS + stset \tmp, [\ti_flags] +1: +#endif + .endm + + /* Clear the MTE asynchronous tag check faults */ + .macro clear_mte_async_tcf thread_sctlr +#ifdef CONFIG_ARM64_MTE +alternative_if ARM64_MTE + /* See comment in check_mte_async_tcf above. */ + tbz \thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f + dsb ish + msr_s SYS_TFSRE0_EL1, xzr +1: +alternative_else_nop_endif +#endif + .endm + + .macro mte_set_gcr, mte_ctrl, tmp +#ifdef CONFIG_ARM64_MTE + ubfx \tmp, \mte_ctrl, #MTE_CTRL_GCR_USER_EXCL_SHIFT, #16 + orr \tmp, \tmp, #SYS_GCR_EL1_RRND + msr_s SYS_GCR_EL1, \tmp +#endif + .endm + + .macro mte_set_kernel_gcr, tmp, tmp2 +#ifdef CONFIG_KASAN_HW_TAGS +alternative_cb ARM64_ALWAYS_SYSTEM, kasan_hw_tags_enable + b 1f +alternative_cb_end + mov \tmp, KERNEL_GCR_EL1 + msr_s SYS_GCR_EL1, \tmp +1: +#endif + .endm + + .macro mte_set_user_gcr, tsk, tmp, tmp2 +#ifdef CONFIG_KASAN_HW_TAGS +alternative_cb ARM64_ALWAYS_SYSTEM, kasan_hw_tags_enable + b 1f +alternative_cb_end + ldr \tmp, [\tsk, #THREAD_MTE_CTRL] + + mte_set_gcr \tmp, \tmp2 +1: #endif .endm .macro kernel_entry, el, regsize = 64 + .if \el == 0 + alternative_insn nop, SET_PSTATE_DIT(1), ARM64_HAS_DIT + .endif .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 .endif @@ -177,61 +220,85 @@ alternative_cb_end .if \el == 0 clear_gp_regs mrs x21, sp_el0 - ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear, - ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug - disable_step_tsk x19, x20 // exceptions when scheduling. + ldr_this_cpu tsk, __entry_task, x20 + msr sp_el0, tsk + + /* + * Ensure MDSCR_EL1.SS is clear, since we can unmask debug exceptions + * when scheduling. + */ + ldr x19, [tsk, #TSK_TI_FLAGS] + disable_step_tsk x19, x20 + + /* Check for asynchronous tag check faults in user space */ + ldr x0, [tsk, THREAD_SCTLR_USER] + check_mte_async_tcf x22, x23, x0 + +#ifdef CONFIG_ARM64_PTR_AUTH +alternative_if ARM64_HAS_ADDRESS_AUTH + /* + * Enable IA for in-kernel PAC if the task had it disabled. Although + * this could be implemented with an unconditional MRS which would avoid + * a load, this was measured to be slower on Cortex-A75 and Cortex-A76. 
+ * + * Install the kernel IA key only if IA was enabled in the task. If IA + * was disabled on kernel exit then we would have left the kernel IA + * installed so there is no need to install it again. + */ + tbz x0, SCTLR_ELx_ENIA_SHIFT, 1f + __ptrauth_keys_install_kernel_nosync tsk, x20, x22, x23 + b 2f +1: + mrs x0, sctlr_el1 + orr x0, x0, SCTLR_ELx_ENIA + msr sctlr_el1, x0 +2: +alternative_else_nop_endif +#endif apply_ssbd 1, x22, x23 + mte_set_kernel_gcr x22, x23 + + /* + * Any non-self-synchronizing system register updates required for + * kernel entry should be placed before this point. + */ +alternative_if ARM64_MTE + isb + b 1f +alternative_else_nop_endif +alternative_if ARM64_HAS_ADDRESS_AUTH + isb +alternative_else_nop_endif +1: + + scs_load_current .else - add x21, sp, #S_FRAME_SIZE - get_thread_info tsk - /* Save the task's original addr_limit and set USER_DS */ - ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] - str x20, [sp, #S_ORIG_ADDR_LIMIT] - mov x20, #USER_DS - str x20, [tsk, #TSK_TI_ADDR_LIMIT] - /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ + add x21, sp, #PT_REGS_SIZE + get_current_task tsk .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] /* - * In order to be able to dump the contents of struct pt_regs at the - * time the exception was taken (in case we attempt to walk the call - * stack later), chain it together with the stack frames. + * Create a metadata frame record. The unwinder will use this to + * identify and unwind exception boundaries. */ - .if \el == 0 stp xzr, xzr, [sp, #S_STACKFRAME] + .if \el == 0 + mov x0, #FRAME_META_TYPE_FINAL .else - stp x29, x22, [sp, #S_STACKFRAME] + mov x0, #FRAME_META_TYPE_PT_REGS .endif + str x0, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Set the TTBR0 PAN bit in SPSR. When the exception is taken from - * EL0, there is no need to check the state of TTBR0_EL1 since - * accesses are always enabled. - * Note that the meaning of this bit differs from the ARMv8.1 PAN - * feature as all TTBR0_EL1 accesses are disabled, not just those to - * user mappings. - */ -alternative_if ARM64_HAS_PAN - b 1f // skip TTBR0 PAN +alternative_if_not ARM64_HAS_PAN + bl __swpan_entry_el\el alternative_else_nop_endif - - .if \el != 0 - mrs x21, ttbr0_el1 - tst x21, #TTBR_ASID_MASK // Check for the reserved ASID - orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR - b.eq 1f // TTBR0 access already disabled - and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR - .endif - - __uaccess_ttbr0_disable x21 -1: #endif stp x22, x23, [sp, #S_PC] @@ -242,16 +309,23 @@ alternative_else_nop_endif str w21, [sp, #S_SYSCALLNO] .endif - /* - * Set sp_el0 to current thread_info. - */ - .if \el == 0 - msr sp_el0, tsk - .endif +#ifdef CONFIG_ARM64_PSEUDO_NMI +alternative_if_not ARM64_HAS_GIC_PRIO_MASKING + b .Lskip_pmr_save\@ +alternative_else_nop_endif + + mrs_s x20, SYS_ICC_PMR_EL1 + str w20, [sp, #S_PMR] + mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET + msr_s SYS_ICC_PMR_EL1, x20 + +.Lskip_pmr_save\@: +#endif /* * Registers that may be useful after this macro is invoked: * + * x20 - ICC_PMR_EL1 * x21 - aborted SP * x22 - aborted PC * x23 - aborted PSTATE @@ -261,48 +335,30 @@ alternative_else_nop_endif .macro kernel_exit, el .if \el != 0 disable_daif - - /* Restore the task's original addr_limit. 
*/ - ldr x20, [sp, #S_ORIG_ADDR_LIMIT] - str x20, [tsk, #TSK_TI_ADDR_LIMIT] - - /* No need to restore UAO, it will be restored from SPSR_EL1 */ .endif - ldp x21, x22, [sp, #S_PC] // load ELR, SPSR - .if \el == 0 - ct_user_enter - .endif +#ifdef CONFIG_ARM64_PSEUDO_NMI +alternative_if_not ARM64_HAS_GIC_PRIO_MASKING + b .Lskip_pmr_restore\@ +alternative_else_nop_endif -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR - * PAN bit checking. - */ -alternative_if ARM64_HAS_PAN - b 2f // skip TTBR0 PAN + ldr w20, [sp, #S_PMR] + msr_s SYS_ICC_PMR_EL1, x20 + + /* Ensure priority change is seen by redistributor */ +alternative_if_not ARM64_HAS_GIC_PRIO_RELAXED_SYNC + dsb sy alternative_else_nop_endif - .if \el != 0 - tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set - .endif +.Lskip_pmr_restore\@: +#endif - __uaccess_ttbr0_enable x0, x1 + ldp x21, x22, [sp, #S_PC] // load ELR, SPSR - .if \el == 0 - /* - * Enable errata workarounds only if returning to user. The only - * workaround currently required for TTBR0_EL1 changes are for the - * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache - * corruption). - */ - bl post_ttbr_update_workaround - .endif -1: - .if \el != 0 - and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit - .endif -2: +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +alternative_if_not ARM64_HAS_PAN + bl __swpan_exit_el\el +alternative_else_nop_endif #endif .if \el == 0 @@ -322,6 +378,34 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif 3: + scs_save tsk + + /* Ignore asynchronous tag check faults in the uaccess routines */ + ldr x0, [tsk, THREAD_SCTLR_USER] + clear_mte_async_tcf x0 + +#ifdef CONFIG_ARM64_PTR_AUTH +alternative_if ARM64_HAS_ADDRESS_AUTH + /* + * IA was enabled for in-kernel PAC. Disable it now if needed, or + * alternatively install the user's IA. All other per-task keys and + * SCTLR bits were updated on task switch. + * + * No kernel C function calls after this. 
+ */ + tbz x0, SCTLR_ELx_ENIA_SHIFT, 1f + __ptrauth_keys_install_user tsk, x0, x1, x2 + b 2f +1: + mrs x0, sctlr_el1 + bic x0, x0, SCTLR_ELx_ENIA + msr sctlr_el1, x0 +2: +alternative_else_nop_endif +#endif + + mte_set_user_gcr tsk, x0, x1 + apply_ssbd 0, x0, x1 .endif @@ -342,70 +426,88 @@ alternative_else_nop_endif ldp x24, x25, [sp, #16 * 12] ldp x26, x27, [sp, #16 * 13] ldp x28, x29, [sp, #16 * 14] - ldr lr, [sp, #S_LR] - add sp, sp, #S_FRAME_SIZE // restore sp .if \el == 0 -alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - bne 4f - msr far_el1, x30 - tramp_alias x30, tramp_exit_native - br x30 -4: - tramp_alias x30, tramp_exit_compat - br x30 + alternative_insn "b .L_skip_tramp_exit_\@", nop, ARM64_UNMAP_KERNEL_AT_EL0 + + msr far_el1, x29 + + ldr_this_cpu x30, this_cpu_vector, x29 + tramp_alias x29, tramp_exit + msr vbar_el1, x30 // install vector table + ldr lr, [sp, #S_LR] // restore x30 + add sp, sp, #PT_REGS_SIZE // restore sp + br x29 + +.L_skip_tramp_exit_\@: #endif + .endif + + ldr lr, [sp, #S_LR] + add sp, sp, #PT_REGS_SIZE // restore sp + + .if \el == 0 + /* This must be after the last explicit memory access */ +alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD + tlbi vale1, xzr + dsb nsh +alternative_else_nop_endif .else - eret + /* Ensure any device/NC reads complete */ + alternative_insn nop, "dmb sy", ARM64_WORKAROUND_1508412 .endif + + eret sb .endm - .macro irq_stack_entry - mov x19, sp // preserve the original sp +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. + */ +SYM_CODE_START_LOCAL(__swpan_entry_el1) + mrs x21, ttbr0_el1 + tst x21, #TTBR_ASID_MASK // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR +SYM_INNER_LABEL(__swpan_entry_el0, SYM_L_LOCAL) + __uaccess_ttbr0_disable x21 +1: ret +SYM_CODE_END(__swpan_entry_el1) /* - * Compare sp with the base of the task stack. - * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack, - * and should switch to the irq stack. + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. */ - ldr x25, [tsk, TSK_STACK] - eor x25, x25, x19 - and x25, x25, #~(THREAD_SIZE - 1) - cbnz x25, 9998f - - ldr_this_cpu x25, irq_stack_ptr, x26 - mov x26, #IRQ_STACK_SIZE - add x26, x25, x26 - - /* switch to the irq stack */ - mov sp, x26 -9998: - .endm +SYM_CODE_START_LOCAL(__swpan_exit_el1) + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + __uaccess_ttbr0_enable x0, x1 +1: and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + ret +SYM_CODE_END(__swpan_exit_el1) +SYM_CODE_START_LOCAL(__swpan_exit_el0) + __uaccess_ttbr0_enable x0, x1 /* - * x19 should be preserved between irq_stack_entry and - * irq_stack_exit. + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). 
*/ - .macro irq_stack_exit - mov sp, x19 - .endm + b post_ttbr_update_workaround +SYM_CODE_END(__swpan_exit_el0) +#endif /* GPRs used by entry code */ tsk .req x28 // current thread_info -/* - * Interrupt handling. - */ - .macro irq_handler - ldr_l x1, handle_arch_irq - mov x0, sp - irq_stack_entry - blr x1 - irq_stack_exit - .endm - .text /* @@ -414,53 +516,46 @@ tsk .req x28 // current thread_info .pushsection ".entry.text", "ax" .align 11 -ENTRY(vectors) - kernel_ventry 1, sync_invalid // Synchronous EL1t - kernel_ventry 1, irq_invalid // IRQ EL1t - kernel_ventry 1, fiq_invalid // FIQ EL1t - kernel_ventry 1, error_invalid // Error EL1t - - kernel_ventry 1, sync // Synchronous EL1h - kernel_ventry 1, irq // IRQ EL1h - kernel_ventry 1, fiq_invalid // FIQ EL1h - kernel_ventry 1, error // Error EL1h - - kernel_ventry 0, sync // Synchronous 64-bit EL0 - kernel_ventry 0, irq // IRQ 64-bit EL0 - kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0 - kernel_ventry 0, error // Error 64-bit EL0 - -#ifdef CONFIG_COMPAT - kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0 - kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0 - kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0 - kernel_ventry 0, error_compat, 32 // Error 32-bit EL0 -#else - kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0 - kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0 - kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0 - kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0 -#endif -END(vectors) - -#ifdef CONFIG_VMAP_STACK +SYM_CODE_START(vectors) + kernel_ventry 1, t, 64, sync // Synchronous EL1t + kernel_ventry 1, t, 64, irq // IRQ EL1t + kernel_ventry 1, t, 64, fiq // FIQ EL1t + kernel_ventry 1, t, 64, error // Error EL1t + + kernel_ventry 1, h, 64, sync // Synchronous EL1h + kernel_ventry 1, h, 64, irq // IRQ EL1h + kernel_ventry 1, h, 64, fiq // FIQ EL1h + kernel_ventry 1, h, 64, error // Error EL1h + + kernel_ventry 0, t, 64, sync // Synchronous 64-bit EL0 + kernel_ventry 0, t, 64, irq // IRQ 64-bit EL0 + kernel_ventry 0, t, 64, fiq // FIQ 64-bit EL0 + kernel_ventry 0, t, 64, error // Error 64-bit EL0 + + kernel_ventry 0, t, 32, sync // Synchronous 32-bit EL0 + kernel_ventry 0, t, 32, irq // IRQ 32-bit EL0 + kernel_ventry 0, t, 32, fiq // FIQ 32-bit EL0 + kernel_ventry 0, t, 32, error // Error 32-bit EL0 +SYM_CODE_END(vectors) + +SYM_CODE_START_LOCAL(__bad_stack) /* * We detected an overflow in kernel_ventry, which switched to the * overflow stack. Stash the exception regs, and head to our overflow * handler. */ -__bad_stack: + /* Restore the original x0 value */ mrs x0, tpidrro_el0 /* * Store the original GPRs to the new stack. The orginal SP (minus - * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry. + * PT_REGS_SIZE) was stashed in tpidr_el0 by kernel_ventry. 
*/ - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE kernel_entry 1 mrs x0, tpidr_el0 - add x0, x0, #S_FRAME_SIZE + add x0, x0, #PT_REGS_SIZE str x0, [sp, #S_SP] /* Stash the regs for handle_bad_stack */ @@ -469,463 +564,64 @@ __bad_stack: /* Time to die */ bl handle_bad_stack ASM_BUG() -#endif /* CONFIG_VMAP_STACK */ +SYM_CODE_END(__bad_stack) -/* - * Invalid mode handlers - */ - .macro inv_entry, el, reason, regsize = 64 + + .macro entry_handler el:req, ht:req, regsize:req, label:req +SYM_CODE_START_LOCAL(el\el\ht\()_\regsize\()_\label) kernel_entry \el, \regsize mov x0, sp - mov x1, #\reason - mrs x2, esr_el1 - bl bad_mode - ASM_BUG() + bl el\el\ht\()_\regsize\()_\label\()_handler + .if \el == 0 + b ret_to_user + .else + b ret_to_kernel + .endif +SYM_CODE_END(el\el\ht\()_\regsize\()_\label) .endm -el0_sync_invalid: - inv_entry 0, BAD_SYNC -ENDPROC(el0_sync_invalid) - -el0_irq_invalid: - inv_entry 0, BAD_IRQ -ENDPROC(el0_irq_invalid) - -el0_fiq_invalid: - inv_entry 0, BAD_FIQ -ENDPROC(el0_fiq_invalid) - -el0_error_invalid: - inv_entry 0, BAD_ERROR -ENDPROC(el0_error_invalid) - -#ifdef CONFIG_COMPAT -el0_fiq_invalid_compat: - inv_entry 0, BAD_FIQ, 32 -ENDPROC(el0_fiq_invalid_compat) -#endif - -el1_sync_invalid: - inv_entry 1, BAD_SYNC -ENDPROC(el1_sync_invalid) - -el1_irq_invalid: - inv_entry 1, BAD_IRQ -ENDPROC(el1_irq_invalid) - -el1_fiq_invalid: - inv_entry 1, BAD_FIQ -ENDPROC(el1_fiq_invalid) - -el1_error_invalid: - inv_entry 1, BAD_ERROR -ENDPROC(el1_error_invalid) - /* - * EL1 mode handlers. + * Early exception handlers */ - .align 6 -el1_sync: - kernel_entry 1 - mrs x1, esr_el1 // read the syndrome register - lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class - cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1 - b.eq el1_da - cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1 - b.eq el1_ia - cmp x24, #ESR_ELx_EC_SYS64 // configurable trap - b.eq el1_undef - cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception - b.eq el1_sp_pc - cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception - b.eq el1_sp_pc - cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL1 - b.eq el1_undef - cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1 - b.ge el1_dbg - b el1_inv - -el1_ia: - /* - * Fall through to the Data abort case - */ -el1_da: - /* - * Data abort handling - */ - mrs x3, far_el1 - inherit_daif pstate=x23, tmp=x2 - clear_address_tag x0, x3 - mov x2, sp // struct pt_regs - bl do_mem_abort - + entry_handler 1, t, 64, sync + entry_handler 1, t, 64, irq + entry_handler 1, t, 64, fiq + entry_handler 1, t, 64, error + + entry_handler 1, h, 64, sync + entry_handler 1, h, 64, irq + entry_handler 1, h, 64, fiq + entry_handler 1, h, 64, error + + entry_handler 0, t, 64, sync + entry_handler 0, t, 64, irq + entry_handler 0, t, 64, fiq + entry_handler 0, t, 64, error + + entry_handler 0, t, 32, sync + entry_handler 0, t, 32, irq + entry_handler 0, t, 32, fiq + entry_handler 0, t, 32, error + +SYM_CODE_START_LOCAL(ret_to_kernel) kernel_exit 1 -el1_sp_pc: - /* - * Stack or PC alignment exception handling - */ - mrs x0, far_el1 - inherit_daif pstate=x23, tmp=x2 - mov x2, sp - bl do_sp_pc_abort - ASM_BUG() -el1_undef: - /* - * Undefined instruction - */ - inherit_daif pstate=x23, tmp=x2 - mov x0, sp - bl do_undefinstr - kernel_exit 1 -el1_dbg: - /* - * Debug exception handling - */ - cmp x24, #ESR_ELx_EC_BRK64 // if BRK64 - cinc x24, x24, eq // set bit '0' - tbz x24, #0, el1_inv // EL1 only - mrs x0, far_el1 - mov x2, sp // struct pt_regs - bl do_debug_exception - kernel_exit 1 
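Both the removed el1_sync dispatcher above (lsr x24, x1, #ESR_ELx_EC_SHIFT followed by the cmp/b.eq chain) and its replacement, the switch on ESR_ELx_EC(esr) in el1h_64_sync_handler(), key off the same field: the exception class in ESR_ELx bits [31:26]. A minimal sketch of that decode; the two EC values shown are just examples:

#include <assert.h>

#define ESR_ELx_EC_SHIFT	26
#define ESR_ELx_EC_MASK		(0x3FUL << ESR_ELx_EC_SHIFT)
#define ESR_ELx_EC(esr)		(((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT)

#define ESR_ELx_EC_SVC64	0x15UL	/* SVC executed in AArch64 state */
#define ESR_ELx_EC_DABT_CUR	0x25UL	/* data abort taken from the current EL */

int main(void)
{
	unsigned long esr_svc = (ESR_ELx_EC_SVC64 << ESR_ELx_EC_SHIFT) | 0x1;

	assert(ESR_ELx_EC(esr_svc) == ESR_ELx_EC_SVC64);
	assert(ESR_ELx_EC(ESR_ELx_EC_DABT_CUR << ESR_ELx_EC_SHIFT) == ESR_ELx_EC_DABT_CUR);
	return 0;
}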
-el1_inv: - // TODO: add support for undefined instructions in kernel mode - inherit_daif pstate=x23, tmp=x2 - mov x0, sp - mov x2, x1 - mov x1, #BAD_SYNC - bl bad_mode - ASM_BUG() -ENDPROC(el1_sync) - - .align 6 -el1_irq: - kernel_entry 1 - enable_da_f -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif - - irq_handler - -#ifdef CONFIG_PREEMPT - ldr x24, [tsk, #TSK_TI_PREEMPT] // get preempt count - cbnz x24, 1f // preempt count != 0 - bl el1_preempt -1: -#endif -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on -#endif - kernel_exit 1 -ENDPROC(el1_irq) - -#ifdef CONFIG_PREEMPT -el1_preempt: - mov x24, lr -1: bl preempt_schedule_irq // irq en/disable is done inside - ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS - tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? - ret x24 -#endif - -/* - * EL0 mode handlers. - */ - .align 6 -el0_sync: - kernel_entry 0 - mrs x25, esr_el1 // read the syndrome register - lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class - cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state - b.eq el0_svc - cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0 - b.eq el0_da - cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0 - b.eq el0_ia - cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access - b.eq el0_fpsimd_acc - cmp x24, #ESR_ELx_EC_SVE // SVE access - b.eq el0_sve_acc - cmp x24, #ESR_ELx_EC_FP_EXC64 // FP/ASIMD exception - b.eq el0_fpsimd_exc - cmp x24, #ESR_ELx_EC_SYS64 // configurable trap - ccmp x24, #ESR_ELx_EC_WFx, #4, ne - b.eq el0_sys - cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception - b.eq el0_sp_pc - cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception - b.eq el0_sp_pc - cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0 - b.eq el0_undef - cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0 - b.ge el0_dbg - b el0_inv - -#ifdef CONFIG_COMPAT - .align 6 -el0_sync_compat: - kernel_entry 0, 32 - mrs x25, esr_el1 // read the syndrome register - lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class - cmp x24, #ESR_ELx_EC_SVC32 // SVC in 32-bit state - b.eq el0_svc_compat - cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0 - b.eq el0_da - cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0 - b.eq el0_ia - cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access - b.eq el0_fpsimd_acc - cmp x24, #ESR_ELx_EC_FP_EXC32 // FP/ASIMD exception - b.eq el0_fpsimd_exc - cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception - b.eq el0_sp_pc - cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0 - b.eq el0_undef - cmp x24, #ESR_ELx_EC_CP15_32 // CP15 MRC/MCR trap - b.eq el0_cp15 - cmp x24, #ESR_ELx_EC_CP15_64 // CP15 MRRC/MCRR trap - b.eq el0_cp15 - cmp x24, #ESR_ELx_EC_CP14_MR // CP14 MRC/MCR trap - b.eq el0_undef - cmp x24, #ESR_ELx_EC_CP14_LS // CP14 LDC/STC trap - b.eq el0_undef - cmp x24, #ESR_ELx_EC_CP14_64 // CP14 MRRC/MCRR trap - b.eq el0_undef - cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0 - b.ge el0_dbg - b el0_inv -el0_svc_compat: - mov x0, sp - bl el0_svc_compat_handler - b ret_to_user - - .align 6 -el0_irq_compat: - kernel_entry 0, 32 - b el0_irq_naked - -el0_error_compat: - kernel_entry 0, 32 - b el0_error_naked - -el0_cp15: - /* - * Trapped CP15 (MRC, MCR, MRRC, MCRR) instructions - */ - enable_daif - ct_user_exit - mov x0, x25 - mov x1, sp - bl do_cp15instr - b ret_to_user -#endif +SYM_CODE_END(ret_to_kernel) -el0_da: - /* - * Data abort handling - */ - mrs x26, far_el1 - enable_daif - ct_user_exit - clear_address_tag x0, x26 - mov x1, x25 - mov x2, sp - bl do_mem_abort - b ret_to_user -el0_ia: - 
/* - * Instruction abort handling - */ - mrs x26, far_el1 - enable_da_f -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif - ct_user_exit - mov x0, x26 - mov x1, x25 - mov x2, sp - bl do_el0_ia_bp_hardening - b ret_to_user -el0_fpsimd_acc: - /* - * Floating Point or Advanced SIMD access - */ - enable_daif - ct_user_exit - mov x0, x25 - mov x1, sp - bl do_fpsimd_acc - b ret_to_user -el0_sve_acc: - /* - * Scalable Vector Extension access - */ - enable_daif - ct_user_exit - mov x0, x25 - mov x1, sp - bl do_sve_acc - b ret_to_user -el0_fpsimd_exc: - /* - * Floating Point, Advanced SIMD or SVE exception - */ - enable_daif - ct_user_exit - mov x0, x25 - mov x1, sp - bl do_fpsimd_exc - b ret_to_user -el0_sp_pc: - /* - * Stack or PC alignment exception handling - */ - mrs x26, far_el1 - enable_da_f -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif - ct_user_exit - mov x0, x26 - mov x1, x25 - mov x2, sp - bl do_sp_pc_abort - b ret_to_user -el0_undef: - /* - * Undefined instruction - */ - enable_daif - ct_user_exit - mov x0, sp - bl do_undefinstr - b ret_to_user -el0_sys: - /* - * System instructions, for trapped cache maintenance instructions - */ - enable_daif - ct_user_exit - mov x0, x25 - mov x1, sp - bl do_sysinstr - b ret_to_user -el0_dbg: - /* - * Debug exception handling - */ - tbnz x24, #0, el0_inv // EL0 only - mrs x0, far_el1 - mov x1, x25 - mov x2, sp - bl do_debug_exception - enable_daif - ct_user_exit - b ret_to_user -el0_inv: - enable_daif - ct_user_exit - mov x0, sp - mov x1, #BAD_SYNC - mov x2, x25 - bl bad_el0_sync - b ret_to_user -ENDPROC(el0_sync) - - .align 6 -el0_irq: - kernel_entry 0 -el0_irq_naked: - enable_da_f -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif - - ct_user_exit -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR - tbz x22, #55, 1f - bl do_el0_irq_bp_hardening -1: -#endif - irq_handler - -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on -#endif - b ret_to_user -ENDPROC(el0_irq) - -el1_error: - kernel_entry 1 - mrs x1, esr_el1 - enable_dbg - mov x0, sp - bl do_serror - kernel_exit 1 -ENDPROC(el1_error) - -el0_error: - kernel_entry 0 -el0_error_naked: - mrs x1, esr_el1 - enable_dbg - mov x0, sp - bl do_serror - enable_daif - ct_user_exit - b ret_to_user -ENDPROC(el0_error) - -/* - * Ok, we need to do extra processing, enter the slow path. - */ -work_pending: - mov x0, sp // 'regs' - bl do_notify_resume -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on // enabled while in userspace -#endif - ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step - b finish_ret_to_user -/* - * "slow" syscall return path. - */ -ret_to_user: - disable_daif - ldr x1, [tsk, #TSK_TI_FLAGS] - and x2, x1, #_TIF_WORK_MASK - cbnz x2, work_pending -finish_ret_to_user: - enable_step_tsk x1, x2 -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK - bl stackleak_erase +SYM_CODE_START_LOCAL(ret_to_user) + ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step + enable_step_tsk x19, x2 +#ifdef CONFIG_KSTACK_ERASE + bl stackleak_erase_on_task_stack #endif kernel_exit 0 -ENDPROC(ret_to_user) - -/* - * SVC handler. - */ - .align 6 -el0_svc: - mov x0, sp - bl el0_svc_handler - b ret_to_user -ENDPROC(el0_svc) +SYM_CODE_END(ret_to_user) .popsection // .entry.text -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -/* - * Exception vectors trampoline. 
- */ - .pushsection ".entry.tramp.text", "ax" - + // Move from tramp_pg_dir to swapper_pg_dir .macro tramp_map_kernel, tmp mrs \tmp, ttbr1_el1 - add \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) + add \tmp, \tmp, #TRAMP_SWAPPER_OFFSET bic \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 @@ -942,9 +638,10 @@ alternative_else_nop_endif #endif /* CONFIG_QCOM_FALKOR_ERRATUM_1003 */ .endm + // Move from swapper_pg_dir to tramp_pg_dir .macro tramp_unmap_kernel, tmp mrs \tmp, ttbr1_el1 - sub \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) + sub \tmp, \tmp, #TRAMP_SWAPPER_OFFSET orr \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp /* @@ -954,12 +651,57 @@ alternative_else_nop_endif */ .endm - .macro tramp_ventry, regsize = 64 + .macro tramp_data_read_var dst, var +#ifdef CONFIG_RELOCATABLE + ldr \dst, .L__tramp_data_\var + .ifndef .L__tramp_data_\var + .pushsection ".entry.tramp.rodata", "a", %progbits + .align 3 +.L__tramp_data_\var: + .quad \var + .popsection + .endif +#else + /* + * As !RELOCATABLE implies !RANDOMIZE_BASE the address is always a + * compile time constant (and hence not secret and not worth hiding). + * + * As statically allocated kernel code and data always live in the top + * 47 bits of the address space we can sign-extend bit 47 and avoid an + * instruction to load the upper 16 bits (which must be 0xFFFF). + */ + movz \dst, :abs_g2_s:\var + movk \dst, :abs_g1_nc:\var + movk \dst, :abs_g0_nc:\var +#endif + .endm + +#define BHB_MITIGATION_NONE 0 +#define BHB_MITIGATION_LOOP 1 +#define BHB_MITIGATION_FW 2 +#define BHB_MITIGATION_INSN 3 + + .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + + .if \bhb == BHB_MITIGATION_LOOP + /* + * This sequence must appear before the first indirect branch. i.e. the + * ret out of tramp_ventry. It appears here because x30 is free. + */ + __mitigate_spectre_bhb_loop x30 + .endif // \bhb == BHB_MITIGATION_LOOP + + .if \bhb == BHB_MITIGATION_INSN + clearbhb + isb + .endif // \bhb == BHB_MITIGATION_INSN + + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy * entry onto the return stack and using a RET instruction to @@ -969,65 +711,106 @@ alternative_else_nop_endif b . 2: tramp_map_kernel x30 -#ifdef CONFIG_RANDOMIZE_BASE - adr x30, tramp_vectors + PAGE_SIZE alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 - ldr x30, [x30] -#else - ldr x30, =vectors -#endif - prfm plil1strm, [x30, #(1b - tramp_vectors)] + tramp_data_read_var x30, vectors +alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM + prfm plil1strm, [x30, #(1b - \vector_start)] +alternative_else_nop_endif + msr vbar_el1, x30 - add x30, x30, #(1b - tramp_vectors) isb - ret - .endm + .else + adr_l x30, vectors + .endif // \kpti == 1 - .macro tramp_exit, regsize = 64 - adr x30, tramp_vectors - msr vbar_el1, x30 - tramp_unmap_kernel x30 - .if \regsize == 64 - mrs x30, far_el1 - .endif - eret - sb + .if \bhb == BHB_MITIGATION_FW + /* + * The firmware sequence must appear before the first indirect branch. + * i.e. the ret out of tramp_ventry. But it also needs the stack to be + * mapped to save/restore the registers the SMC clobbers. + */ + __mitigate_spectre_bhb_fw + .endif // \bhb == BHB_MITIGATION_FW + + add x30, x30, #(1b - \vector_start + 4) + ret +.org 1b + 128 // Did we overflow the ventry slot? 
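[Editorial aside: the tramp_data_read_var comment above relies on the fact that statically allocated kernel symbols sit in the upper VA range, so bits [63:48] can be recovered by sign-extending bit 47 instead of loading them. A minimal standalone C illustration of that arithmetic follows; the address value is made up and nothing here is kernel code.]

#include <stdint.h>
#include <stdio.h>

/* Recover a kernel VA from its low 48 bits by sign-extending bit 47. */
static uint64_t sign_extend_bit47(uint64_t low48)
{
	return (uint64_t)(((int64_t)(low48 << 16)) >> 16);
}

int main(void)
{
	uint64_t va = 0xffff800010010000ULL;     /* hypothetical kernel VA */
	uint64_t low48 = va & 0xffffffffffffULL; /* what movz/movk encode  */

	/* Prints the original VA: the top 16 bits come back from bit 47. */
	printf("%#llx\n", (unsigned long long)sign_extend_bit47(low48));
	return 0;
}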
.endm - .align 11 -ENTRY(tramp_vectors) + .macro generate_tramp_vector, kpti, bhb +.Lvector_start\@: .space 0x400 - tramp_ventry - tramp_ventry - tramp_ventry - tramp_ventry + .rept 4 + tramp_ventry .Lvector_start\@, 64, \kpti, \bhb + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, \kpti, \bhb + .endr + .endm - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 -END(tramp_vectors) +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + * The order must match __bp_harden_el1_vectors and the + * arm64_bp_harden_el1_vectors enum. + */ + .pushsection ".entry.tramp.text", "ax" + .align 11 +SYM_CODE_START_LOCAL_NOALIGN(tramp_vectors) +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_INSN +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE +SYM_CODE_END(tramp_vectors) + +SYM_CODE_START_LOCAL(tramp_exit) + tramp_unmap_kernel x29 + mrs x29, far_el1 // restore x29 + eret + sb +SYM_CODE_END(tramp_exit) + .popsection // .entry.tramp.text +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ -ENTRY(tramp_exit_native) - tramp_exit -END(tramp_exit_native) +/* + * Exception vectors for spectre mitigations on entry from EL1 when + * kpti is not in use. + */ + .macro generate_el1_vector, bhb +.Lvector_start\@: + kernel_ventry 1, t, 64, sync // Synchronous EL1t + kernel_ventry 1, t, 64, irq // IRQ EL1t + kernel_ventry 1, t, 64, fiq // FIQ EL1h + kernel_ventry 1, t, 64, error // Error EL1t + + kernel_ventry 1, h, 64, sync // Synchronous EL1h + kernel_ventry 1, h, 64, irq // IRQ EL1h + kernel_ventry 1, h, 64, fiq // FIQ EL1h + kernel_ventry 1, h, 64, error // Error EL1h + + .rept 4 + tramp_ventry .Lvector_start\@, 64, 0, \bhb + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, 0, \bhb + .endr + .endm -ENTRY(tramp_exit_compat) - tramp_exit 32 -END(tramp_exit_compat) +/* The order must match tramp_vecs and the arm64_bp_harden_el1_vectors enum. */ + .pushsection ".entry.text", "ax" + .align 11 +SYM_CODE_START(__bp_harden_el1_vectors) +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_el1_vector bhb=BHB_MITIGATION_LOOP + generate_el1_vector bhb=BHB_MITIGATION_FW + generate_el1_vector bhb=BHB_MITIGATION_INSN +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ +SYM_CODE_END(__bp_harden_el1_vectors) + .popsection - .ltorg - .popsection // .entry.tramp.text -#ifdef CONFIG_RANDOMIZE_BASE - .pushsection ".rodata", "a" - .align PAGE_SHIFT - .globl __entry_tramp_data_start -__entry_tramp_data_start: - .quad vectors - .popsection // .rodata -#endif /* CONFIG_RANDOMIZE_BASE */ -#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* * Register switch for AArch64. The callee-saved registers need to be saved @@ -1037,7 +820,8 @@ __entry_tramp_data_start: * Previous and next are guaranteed not to be the same. * */ -ENTRY(cpu_switch_to) +SYM_FUNC_START(cpu_switch_to) + save_and_disable_daif x11 mov x10, #THREAD_CPU_CONTEXT add x8, x0, x10 mov x9, sp @@ -1058,23 +842,67 @@ ENTRY(cpu_switch_to) ldr lr, [x8] mov sp, x9 msr sp_el0, x1 + ptrauth_keys_install_kernel x1, x8, x9, x10 + scs_save x0 + scs_load_current + restore_irq x11 ret -ENDPROC(cpu_switch_to) +SYM_FUNC_END(cpu_switch_to) NOKPROBE(cpu_switch_to) /* * This is how we return from a fork. 
*/ -ENTRY(ret_from_fork) +SYM_CODE_START(ret_from_fork) bl schedule_tail cbz x19, 1f // not a kernel thread mov x0, x20 blr x19 -1: get_thread_info tsk +1: get_current_task tsk + mov x0, sp + bl asm_exit_to_user_mode b ret_to_user -ENDPROC(ret_from_fork) +SYM_CODE_END(ret_from_fork) NOKPROBE(ret_from_fork) +/* + * void call_on_irq_stack(struct pt_regs *regs, + * void (*func)(struct pt_regs *)); + * + * Calls func(regs) using this CPU's irq stack and shadow irq stack. + */ +SYM_FUNC_START(call_on_irq_stack) + save_and_disable_daif x9 +#ifdef CONFIG_SHADOW_CALL_STACK + get_current_task x16 + scs_save x16 + ldr_this_cpu scs_sp, irq_shadow_call_stack_ptr, x17 +#endif + + /* Create a frame record to save our LR and SP (implicit in FP) */ + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr_this_cpu x16, irq_stack_ptr, x17 + + /* Move to the new stack and call the function there */ + add sp, x16, #IRQ_STACK_SIZE + restore_irq x9 + blr x1 + + save_and_disable_daif x9 + /* + * Restore the SP from the FP, and restore the FP and LR from the frame + * record. + */ + mov sp, x29 + ldp x29, x30, [sp], #16 + scs_load_current + restore_irq x9 + ret +SYM_FUNC_END(call_on_irq_stack) +NOKPROBE(call_on_irq_stack) + #ifdef CONFIG_ARM_SDE_INTERFACE #include <asm/sdei.h> @@ -1099,9 +927,8 @@ NOKPROBE(ret_from_fork) * This clobbers x4, __sdei_handler() will restore this from firmware's * copy. */ -.ltorg .pushsection ".entry.tramp.text", "ax" -ENTRY(__sdei_asm_entry_trampoline) +SYM_CODE_START(__sdei_asm_entry_trampoline) mrs x4, ttbr1_el1 tbz x4, #USER_ASID_BIT, 1f @@ -1110,20 +937,12 @@ ENTRY(__sdei_asm_entry_trampoline) mov x4, xzr /* - * Use reg->interrupted_regs.addr_limit to remember whether to unmap - * the kernel on exit. + * Remember whether to unmap the kernel on exit. */ -1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] - -#ifdef CONFIG_RANDOMIZE_BASE - adr x4, tramp_vectors + PAGE_SIZE - add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler - ldr x4, [x4] -#else - ldr x4, =__sdei_asm_handler -#endif +1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] + tramp_data_read_var x4, __sdei_asm_handler br x4 -ENDPROC(__sdei_asm_entry_trampoline) +SYM_CODE_END(__sdei_asm_entry_trampoline) NOKPROBE(__sdei_asm_entry_trampoline) /* @@ -1133,23 +952,16 @@ NOKPROBE(__sdei_asm_entry_trampoline) * x2: exit_mode * x4: struct sdei_registered_event argument from registration time. */ -ENTRY(__sdei_asm_exit_trampoline) - ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] +SYM_CODE_START(__sdei_asm_exit_trampoline) + ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] cbnz x4, 1f tramp_unmap_kernel tmp=x4 1: sdei_handler_exit exit_mode=x2 -ENDPROC(__sdei_asm_exit_trampoline) +SYM_CODE_END(__sdei_asm_exit_trampoline) NOKPROBE(__sdei_asm_exit_trampoline) - .ltorg .popsection // .entry.tramp.text -#ifdef CONFIG_RANDOMIZE_BASE -.pushsection ".rodata", "a" -__sdei_asm_trampoline_next_handler: - .quad __sdei_asm_handler -.popsection // .rodata -#endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* @@ -1165,7 +977,7 @@ __sdei_asm_trampoline_next_handler: * follow SMC-CC. We save (or retrieve) all the registers as the handler may * want them. 
*/ -ENTRY(__sdei_asm_handler) +SYM_CODE_START(__sdei_asm_handler) stp x2, x3, [x1, #SDEI_EVENT_INTREGS + S_PC] stp x4, x5, [x1, #SDEI_EVENT_INTREGS + 16 * 2] stp x6, x7, [x1, #SDEI_EVENT_INTREGS + 16 * 3] @@ -1185,13 +997,19 @@ ENTRY(__sdei_asm_handler) mov x19, x1 -#ifdef CONFIG_VMAP_STACK + /* Store the registered-event for crash_smp_send_stop() */ + ldrb w4, [x19, #SDEI_EVENT_PRIORITY] + cbnz w4, 1f + adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6 + b 2f +1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6 +2: str x19, [x5] + /* * entry.S may have been using sp as a scratch register, find whether * this is a normal or critical event and switch to the appropriate * stack for this CPU. */ - ldrb w4, [x19, #SDEI_EVENT_PRIORITY] cbnz w4, 1f ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6 b 2f @@ -1199,6 +1017,14 @@ ENTRY(__sdei_asm_handler) 2: mov x6, #SDEI_STACK_SIZE add x5, x5, x6 mov sp, x5 + +#ifdef CONFIG_SHADOW_CALL_STACK + /* Use a separate shadow call stack for normal and critical events */ + cbnz w4, 3f + ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal_ptr, tmp=x6 + b 4f +3: ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical_ptr, tmp=x6 +4: #endif /* @@ -1232,14 +1058,24 @@ ENTRY(__sdei_asm_handler) mov sp, x1 mov x1, x0 // address to complete_and_resume - /* x0 = (x0 <= 1) ? EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */ - cmp x0, #1 + /* x0 = (x0 <= SDEI_EV_FAILED) ? + * EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME + */ + cmp x0, #SDEI_EV_FAILED mov_q x2, SDEI_1_0_FN_SDEI_EVENT_COMPLETE mov_q x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME csel x0, x2, x3, ls ldr_l x2, sdei_exit_mode + /* Clear the registered-event seen by crash_smp_send_stop() */ + ldrb w3, [x4, #SDEI_EVENT_PRIORITY] + cbnz w3, 1f + adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6 + b 2f +1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6 +2: str xzr, [x5] + alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 sdei_handler_exit exit_mode=x2 alternative_else_nop_endif @@ -1248,6 +1084,17 @@ alternative_else_nop_endif tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline br x5 #endif -ENDPROC(__sdei_asm_handler) +SYM_CODE_END(__sdei_asm_handler) NOKPROBE(__sdei_asm_handler) + +SYM_CODE_START(__sdei_handler_abort) + mov_q x0, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME + adr x1, 1f + ldr_l x2, sdei_exit_mode + sdei_handler_exit exit_mode=x2 + // exit the handler and jump to the next instruction. + // Exit will stomp x0-x17, PSTATE, ELR_ELx, and SPSR_ELx. +1: ret +SYM_CODE_END(__sdei_handler_abort) +NOKPROBE(__sdei_handler_abort) #endif /* CONFIG_ARM_SDE_INTERFACE */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 5ebe73b69961..c154f72634e0 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1,29 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * FP/SIMD context switching and fault handling * * Copyright (C) 2012 ARM Ltd. * Author: Catalin Marinas <catalin.marinas@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/bitmap.h> +#include <linux/bitops.h> #include <linux/bottom_half.h> #include <linux/bug.h> #include <linux/cache.h> #include <linux/compat.h> +#include <linux/compiler.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> +#include <linux/ctype.h> #include <linux/kernel.h> #include <linux/linkage.h> #include <linux/irqflags.h> @@ -38,16 +30,20 @@ #include <linux/slab.h> #include <linux/stddef.h> #include <linux/sysctl.h> +#include <linux/swab.h> #include <asm/esr.h> +#include <asm/exception.h> #include <asm/fpsimd.h> #include <asm/cpufeature.h> #include <asm/cputype.h> +#include <asm/neon.h> #include <asm/processor.h> #include <asm/simd.h> #include <asm/sigcontext.h> #include <asm/sysreg.h> #include <asm/traps.h> +#include <asm/virt.h> #define FPEXC_IOF (1 << 0) #define FPEXC_DZF (1 << 1) @@ -83,14 +79,19 @@ * indicate whether or not the userland FPSIMD state of the current task is * present in the registers. The flag is set unless the FPSIMD registers of this * CPU currently contain the most recent userland FPSIMD state of the current - * task. + * task. If the task is behaving as a VMM, then this is will be managed by + * KVM which will clear it to indicate that the vcpu FPSIMD state is currently + * loaded on the CPU, allowing the state to be saved if a FPSIMD-aware + * softirq kicks in. Upon vcpu_put(), KVM will save the vcpu FP state and + * flag the register state as invalid. * - * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may - * save the task's FPSIMD context back to task_struct from softirq context. - * To prevent this from racing with the manipulation of the task's FPSIMD state - * from task context and thereby corrupting the state, it is necessary to - * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE - * flag with local_bh_disable() unless softirqs are already masked. + * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may be + * called from softirq context, which will save the task's FPSIMD context back + * to task_struct. To prevent this from racing with the manipulation of the + * task's FPSIMD state from task context and thereby corrupting the state, it + * is necessary to protect any manipulation of a task's fpsimd_state or + * TIF_FOREIGN_FPSTATE flag with get_cpu_fpsimd_context(), which will suspend + * softirq servicing entirely until put_cpu_fpsimd_context() is called. * * For a certain task, the sequence may look something like this: * - the task gets scheduled in; if both the task's fpsimd_cpu field @@ -117,71 +118,196 @@ * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so * whatever is in the FPSIMD registers is not saved to memory, but discarded. 
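[Editorial aside: the scheme described above can be summarised as a toy, single-file C model. A CPU remembers which task's FPSIMD state is live in its registers, a task remembers which CPU last held its state, and a reload only happens when the two records no longer match. All names below are illustrative stand-ins, not the kernel's data structures.]

#include <stdbool.h>
#include <stdio.h>

struct toy_task {
	int fpsimd_cpu;		/* CPU that last held this task's state */
	bool foreign_fpstate;	/* stands in for TIF_FOREIGN_FPSTATE */
};

struct toy_cpu {
	struct toy_task *owner;	/* task whose state is live in the registers */
};

/* Called when @tsk is about to return to userspace on CPU @cpu_id. */
static void toy_restore(struct toy_cpu *cpu, int cpu_id, struct toy_task *tsk)
{
	if (cpu->owner == tsk && tsk->fpsimd_cpu == cpu_id) {
		tsk->foreign_fpstate = false;	/* registers already hold it */
		return;
	}

	/* ...reload tsk's saved state into the registers here... */
	cpu->owner = tsk;
	tsk->fpsimd_cpu = cpu_id;
	tsk->foreign_fpstate = false;
}

int main(void)
{
	struct toy_cpu cpu0 = { 0 };
	struct toy_task a = { .fpsimd_cpu = -1, .foreign_fpstate = true };

	toy_restore(&cpu0, 0, &a);	/* first return: reload needed */
	toy_restore(&cpu0, 0, &a);	/* second return: nothing to do */
	printf("owner matches: %d\n", cpu0.owner == &a);
	return 0;
}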
*/ -struct fpsimd_last_state_struct { - struct user_fpsimd_state *st; + +DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); + +__ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = { +#ifdef CONFIG_ARM64_SVE + [ARM64_VEC_SVE] = { + .type = ARM64_VEC_SVE, + .name = "SVE", + .min_vl = SVE_VL_MIN, + .max_vl = SVE_VL_MIN, + .max_virtualisable_vl = SVE_VL_MIN, + }, +#endif +#ifdef CONFIG_ARM64_SME + [ARM64_VEC_SME] = { + .type = ARM64_VEC_SME, + .name = "SME", + }, +#endif +}; + +static unsigned int vec_vl_inherit_flag(enum vec_type type) +{ + switch (type) { + case ARM64_VEC_SVE: + return TIF_SVE_VL_INHERIT; + case ARM64_VEC_SME: + return TIF_SME_VL_INHERIT; + default: + WARN_ON_ONCE(1); + return 0; + } +} + +struct vl_config { + int __default_vl; /* Default VL for tasks */ }; -static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); +static struct vl_config vl_config[ARM64_VEC_MAX]; -/* Default VL for tasks that don't set it explicitly: */ -static int sve_default_vl = -1; +static inline int get_default_vl(enum vec_type type) +{ + return READ_ONCE(vl_config[type].__default_vl); +} #ifdef CONFIG_ARM64_SVE -/* Maximum supported vector length across all CPUs (initially poisoned) */ -int __ro_after_init sve_max_vl = SVE_VL_MIN; -/* Set of available vector lengths, as vq_to_bit(vq): */ -static __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); -static void __percpu *efi_sve_state; +static inline int get_sve_default_vl(void) +{ + return get_default_vl(ARM64_VEC_SVE); +} + +static inline void set_default_vl(enum vec_type type, int val) +{ + WRITE_ONCE(vl_config[type].__default_vl, val); +} + +static inline void set_sve_default_vl(int val) +{ + set_default_vl(ARM64_VEC_SVE, val); +} + +static u8 *efi_sve_state; #else /* ! CONFIG_ARM64_SVE */ /* Dummy declaration for code that will be optimised out: */ -extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); -extern void __percpu *efi_sve_state; +extern u8 *efi_sve_state; #endif /* ! CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int get_sme_default_vl(void) +{ + return get_default_vl(ARM64_VEC_SME); +} + +static void set_sme_default_vl(int val) +{ + set_default_vl(ARM64_VEC_SME, val); +} + +static void sme_free(struct task_struct *); + +#else + +static inline void sme_free(struct task_struct *t) { } + +#endif + +static void fpsimd_bind_task_to_cpu(void); + /* - * Call __sve_free() directly only if you know task can't be scheduled - * or preempted. + * Claim ownership of the CPU FPSIMD context for use by the calling context. + * + * The caller may freely manipulate the FPSIMD context metadata until + * put_cpu_fpsimd_context() is called. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. */ -static void __sve_free(struct task_struct *task) +static void get_cpu_fpsimd_context(void) { - kfree(task->thread.sve_state); - task->thread.sve_state = NULL; + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + /* + * The softirq subsystem lacks a true unmask/mask API, and + * re-enabling softirq processing using local_bh_enable() will + * not only unmask softirqs, it will also result in immediate + * delivery of any pending softirqs. 
+ * This is undesirable when running with IRQs disabled, but in + * that case, there is no need to mask softirqs in the first + * place, so only bother doing so when IRQs are enabled. + */ + if (!irqs_disabled()) + local_bh_disable(); + } else { + preempt_disable(); + } } -static void sve_free(struct task_struct *task) +/* + * Release the CPU FPSIMD context. + * + * Must be called from a context in which get_cpu_fpsimd_context() was + * previously called, with no call to put_cpu_fpsimd_context() in the + * meantime. + */ +static void put_cpu_fpsimd_context(void) { - WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (!irqs_disabled()) + local_bh_enable(); + } else { + preempt_enable(); + } +} - __sve_free(task); +unsigned int task_get_vl(const struct task_struct *task, enum vec_type type) +{ + return task->thread.vl[type]; +} + +void task_set_vl(struct task_struct *task, enum vec_type type, + unsigned long vl) +{ + task->thread.vl[type] = vl; +} + +unsigned int task_get_vl_onexec(const struct task_struct *task, + enum vec_type type) +{ + return task->thread.vl_onexec[type]; +} + +void task_set_vl_onexec(struct task_struct *task, enum vec_type type, + unsigned long vl) +{ + task->thread.vl_onexec[type] = vl; } /* + * TIF_SME controls whether a task can use SME without trapping while + * in userspace, when TIF_SME is set then we must have storage + * allocated in sve_state and sme_state to store the contents of both ZA + * and the SVE registers for both streaming and non-streaming modes. + * + * If both SVCR.ZA and SVCR.SM are disabled then at any point we + * may disable TIF_SME and reenable traps. + */ + + +/* * TIF_SVE controls whether a task can use SVE without trapping while - * in userspace, and also the way a task's FPSIMD/SVE state is stored - * in thread_struct. + * in userspace, and also (together with TIF_SME) the way a task's + * FPSIMD/SVE state is stored in thread_struct. * * The kernel uses this flag to track whether a user task is actively * using SVE, and therefore whether full SVE register state needs to * be tracked. If not, the cheaper FPSIMD context handling code can * be used instead of the more costly SVE equivalents. * - * * TIF_SVE set: + * * TIF_SVE or SVCR.SM set: * * The task can execute SVE instructions while in userspace without * trapping to the kernel. * - * When stored, Z0-Z31 (incorporating Vn in bits[127:0] or the - * corresponding Zn), P0-P15 and FFR are encoded in in - * task->thread.sve_state, formatted appropriately for vector - * length task->thread.sve_vl. - * - * task->thread.sve_state must point to a valid buffer at least - * sve_state_size(task) bytes in size. - * * During any syscall, the kernel may optionally clear TIF_SVE and * discard the vector state except for the FPSIMD subset. * @@ -191,7 +317,15 @@ static void sve_free(struct task_struct *task) * do_sve_acc() to be called, which does some preparation and then * sets TIF_SVE. * - * When stored, FPSIMD registers V0-V31 are encoded in + * During any syscall, the kernel may optionally clear TIF_SVE and + * discard the vector state except for the FPSIMD subset. 
+ * + * The data will be stored in one of two formats: + * + * * FPSIMD only - FP_STATE_FPSIMD: + * + * When the FPSIMD only state stored task->thread.fp_type is set to + * FP_STATE_FPSIMD, the FPSIMD registers V0-V31 are encoded in * task->thread.uw.fpsimd_state; bits [max : 128] for each of Z0-Z31 are * logically zero but not stored anywhere; P0-P15 and FFR are not * stored and have unspecified values from userspace's point of @@ -199,7 +333,23 @@ static void sve_free(struct task_struct *task) * but userspace is discouraged from relying on this. * * task->thread.sve_state does not need to be non-NULL, valid or any - * particular size: it must not be dereferenced. + * particular size: it must not be dereferenced and any data stored + * there should be considered stale and not referenced. + * + * * SVE state - FP_STATE_SVE: + * + * When the full SVE state is stored task->thread.fp_type is set to + * FP_STATE_SVE and Z0-Z31 (incorporating Vn in bits[127:0] or the + * corresponding Zn), P0-P15 and FFR are encoded in in + * task->thread.sve_state, formatted appropriately for vector + * length task->thread.sve_vl or, if SVCR.SM is set, + * task->thread.sme_vl. The storage for the vector registers in + * task->thread.uw.fpsimd_state should be ignored. + * + * task->thread.sve_state must point to a valid buffer at least + * sve_state_size(task) bytes in size. The data stored in + * task->thread.uw.fpsimd_state.vregs should be considered stale + * and not referenced. * * * FPSR and FPCR are always stored in task->thread.uw.fpsimd_state * irrespective of whether TIF_SVE is clear or set, since these are @@ -212,69 +362,160 @@ static void sve_free(struct task_struct *task) * This function should be called only when the FPSIMD/SVE state in * thread_struct is known to be up to date, when preparing to enter * userspace. - * - * Softirqs (and preemption) must be disabled. */ static void task_fpsimd_load(void) { - WARN_ON(!in_softirq() && !irqs_disabled()); + bool restore_sve_regs = false; + bool restore_ffr; + + WARN_ON(!system_supports_fpsimd()); + WARN_ON(preemptible()); + WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE)); + + if (system_supports_sve() || system_supports_sme()) { + switch (current->thread.fp_type) { + case FP_STATE_FPSIMD: + /* Stop tracking SVE for this task until next use. */ + clear_thread_flag(TIF_SVE); + break; + case FP_STATE_SVE: + if (!thread_sm_enabled(¤t->thread)) + WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE)); + + if (test_thread_flag(TIF_SVE)) + sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); + + restore_sve_regs = true; + restore_ffr = true; + break; + default: + /* + * This indicates either a bug in + * fpsimd_save_user_state() or memory corruption, we + * should always record an explicit format + * when we save. We always at least have the + * memory allocated for FPSIMD registers so + * try that and hope for the best. 
+ */ + WARN_ON_ONCE(1); + clear_thread_flag(TIF_SVE); + break; + } + } + + /* Restore SME, override SVE register configuration if needed */ + if (system_supports_sme()) { + unsigned long sme_vl = task_get_sme_vl(current); + + /* Ensure VL is set up for restoring data */ + if (test_thread_flag(TIF_SME)) + sme_set_vq(sve_vq_from_vl(sme_vl) - 1); - if (system_supports_sve() && test_thread_flag(TIF_SVE)) + write_sysreg_s(current->thread.svcr, SYS_SVCR); + + if (thread_za_enabled(¤t->thread)) + sme_load_state(current->thread.sme_state, + system_supports_sme2()); + + if (thread_sm_enabled(¤t->thread)) + restore_ffr = system_supports_fa64(); + } + + if (system_supports_fpmr()) + write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); + + if (restore_sve_regs) { + WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE); sve_load_state(sve_pffr(¤t->thread), ¤t->thread.uw.fpsimd_state.fpsr, - sve_vq_from_vl(current->thread.sve_vl) - 1); - else + restore_ffr); + } else { + WARN_ON_ONCE(current->thread.fp_type != FP_STATE_FPSIMD); fpsimd_load_state(¤t->thread.uw.fpsimd_state); + } } /* * Ensure FPSIMD/SVE storage in memory for the loaded context is up to - * date with respect to the CPU registers. - * - * Softirqs (and preemption) must be disabled. + * date with respect to the CPU registers. Note carefully that the + * current context is the context last bound to the CPU stored in + * last, if KVM is involved this may be the guest VM context rather + * than the host thread for the VM pointed to by current. This means + * that we must always reference the state storage via last rather + * than via current, if we are saving KVM state then it will have + * ensured that the type of registers to save is set in last->to_save. */ -void fpsimd_save(void) +static void fpsimd_save_user_state(void) { - struct user_fpsimd_state *st = __this_cpu_read(fpsimd_last_state.st); + struct cpu_fp_state const *last = + this_cpu_ptr(&fpsimd_last_state); /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */ + bool save_sve_regs = false; + bool save_ffr; + unsigned int vl; - WARN_ON(!in_softirq() && !irqs_disabled()); + WARN_ON(!system_supports_fpsimd()); + WARN_ON(preemptible()); - if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { - if (system_supports_sve() && test_thread_flag(TIF_SVE)) { - if (WARN_ON(sve_get_vl() != current->thread.sve_vl)) { - /* - * Can't save the user regs, so current would - * re-enter user with corrupt state. - * There's no way to recover, so kill it: - */ - force_signal_inject(SIGKILL, SI_KERNEL, 0); - return; - } + if (test_thread_flag(TIF_FOREIGN_FPSTATE)) + return; - sve_save_state(sve_pffr(¤t->thread), &st->fpsr); - } else - fpsimd_save_state(st); + if (system_supports_fpmr()) + *(last->fpmr) = read_sysreg_s(SYS_FPMR); + + /* + * Save SVE state if it is live. + * + * The syscall ABI discards live SVE state at syscall entry. When + * entering a syscall, fpsimd_syscall_enter() sets to_save to + * FP_STATE_FPSIMD to allow the SVE state to be lazily discarded until + * either new SVE state is loaded+bound or fpsimd_syscall_exit() is + * called prior to a return to userspace. + */ + if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE)) || + last->to_save == FP_STATE_SVE) { + save_sve_regs = true; + save_ffr = true; + vl = last->sve_vl; } -} -/* - * Helpers to translate bit indices in sve_vq_map to VQ values (and - * vice versa). This allows find_next_bit() to be used to find the - * _maximum_ VQ not exceeding a certain value. 
- */ + if (system_supports_sme()) { + u64 *svcr = last->svcr; -static unsigned int vq_to_bit(unsigned int vq) -{ - return SVE_VQ_MAX - vq; -} + *svcr = read_sysreg_s(SYS_SVCR); -static unsigned int bit_to_vq(unsigned int bit) -{ - if (WARN_ON(bit >= SVE_VQ_MAX)) - bit = SVE_VQ_MAX - 1; + if (*svcr & SVCR_ZA_MASK) + sme_save_state(last->sme_state, + system_supports_sme2()); + + /* If we are in streaming mode override regular SVE. */ + if (*svcr & SVCR_SM_MASK) { + save_sve_regs = true; + save_ffr = system_supports_fa64(); + vl = last->sme_vl; + } + } - return SVE_VQ_MAX - bit; + if (IS_ENABLED(CONFIG_ARM64_SVE) && save_sve_regs) { + /* Get the configured VL from RDVL, will account for SM */ + if (WARN_ON(sve_get_vl() != vl)) { + /* + * Can't save the user regs, so current would + * re-enter user with corrupt state. + * There's no way to recover, so kill it: + */ + force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); + return; + } + + sve_save_state((char *)last->sve_state + + sve_ffr_offset(vl), + &last->st->fpsr, save_ffr); + *last->fp_type = FP_STATE_SVE; + } else { + fpsimd_save_state(last->st); + *last->fp_type = FP_STATE_FPSIMD; + } } /* @@ -283,33 +524,38 @@ static unsigned int bit_to_vq(unsigned int bit) * If things go wrong there's a bug somewhere, but try to fall back to a * safe choice. */ -static unsigned int find_supported_vector_length(unsigned int vl) +static unsigned int find_supported_vector_length(enum vec_type type, + unsigned int vl) { + struct vl_info *info = &vl_info[type]; int bit; - int max_vl = sve_max_vl; + int max_vl = info->max_vl; if (WARN_ON(!sve_vl_valid(vl))) - vl = SVE_VL_MIN; + vl = info->min_vl; if (WARN_ON(!sve_vl_valid(max_vl))) - max_vl = SVE_VL_MIN; + max_vl = info->min_vl; if (vl > max_vl) vl = max_vl; + if (vl < info->min_vl) + vl = info->min_vl; - bit = find_next_bit(sve_vq_map, SVE_VQ_MAX, - vq_to_bit(sve_vq_from_vl(vl))); - return sve_vl_from_vq(bit_to_vq(bit)); + bit = find_next_bit(info->vq_map, SVE_VQ_MAX, + __vq_to_bit(sve_vq_from_vl(vl))); + return sve_vl_from_vq(__bit_to_vq(bit)); } -#ifdef CONFIG_SYSCTL +#if defined(CONFIG_ARM64_SVE) && defined(CONFIG_SYSCTL) -static int sve_proc_do_default_vl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int vec_proc_do_default_vl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { + struct vl_info *info = table->extra1; + enum vec_type type = info->type; int ret; - int vl = sve_default_vl; + int vl = get_default_vl(type); struct ctl_table tmp_table = { .data = &vl, .maxlen = sizeof(vl), @@ -321,22 +567,22 @@ static int sve_proc_do_default_vl(struct ctl_table *table, int write, /* Writing -1 has the special meaning "set to max": */ if (vl == -1) - vl = sve_max_vl; + vl = info->max_vl; if (!sve_vl_valid(vl)) return -EINVAL; - sve_default_vl = find_supported_vector_length(vl); + set_default_vl(type, find_supported_vector_length(type, vl)); return 0; } -static struct ctl_table sve_default_vl_table[] = { +static const struct ctl_table sve_default_vl_table[] = { { .procname = "sve_default_vector_length", .mode = 0644, - .proc_handler = sve_proc_do_default_vl, + .proc_handler = vec_proc_do_default_vl, + .extra1 = &vl_info[ARM64_VEC_SVE], }, - { } }; static int __init sve_sysctl_init(void) @@ -348,38 +594,88 @@ static int __init sve_sysctl_init(void) return 0; } -#else /* ! CONFIG_SYSCTL */ +#else /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ static int __init sve_sysctl_init(void) { return 0; } -#endif /* ! 
CONFIG_SYSCTL */ +#endif /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */ + +#if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL) +static const struct ctl_table sme_default_vl_table[] = { + { + .procname = "sme_default_vector_length", + .mode = 0644, + .proc_handler = vec_proc_do_default_vl, + .extra1 = &vl_info[ARM64_VEC_SME], + }, +}; + +static int __init sme_sysctl_init(void) +{ + if (system_supports_sme()) + if (!register_sysctl("abi", sme_default_vl_table)) + return -EINVAL; + + return 0; +} + +#else /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */ +static int __init sme_sysctl_init(void) { return 0; } +#endif /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */ #define ZREG(sve_state, vq, n) ((char *)(sve_state) + \ (SVE_SIG_ZREG_OFFSET(vq, n) - SVE_SIG_REGS_OFFSET)) +#ifdef CONFIG_CPU_BIG_ENDIAN +static __uint128_t arm64_cpu_to_le128(__uint128_t x) +{ + u64 a = swab64(x); + u64 b = swab64(x >> 64); + + return ((__uint128_t)a << 64) | b; +} +#else +static __uint128_t arm64_cpu_to_le128(__uint128_t x) +{ + return x; +} +#endif + +#define arm64_le128_to_cpu(x) arm64_cpu_to_le128(x) + +static void __fpsimd_to_sve(void *sst, struct user_fpsimd_state const *fst, + unsigned int vq) +{ + unsigned int i; + __uint128_t *p; + + for (i = 0; i < SVE_NUM_ZREGS; ++i) { + p = (__uint128_t *)ZREG(sst, vq, i); + *p = arm64_cpu_to_le128(fst->vregs[i]); + } +} + /* * Transfer the FPSIMD state in task->thread.uw.fpsimd_state to * task->thread.sve_state. * * Task can be a non-runnable task, or current. In the latter case, - * softirqs (and preemption) must be disabled. + * the caller must have ownership of the cpu FPSIMD context before calling + * this function. * task->thread.sve_state must point to at least sve_state_size(task) * bytes of allocated kernel memory. * task->thread.uw.fpsimd_state must be up to date before calling this * function. */ -static void fpsimd_to_sve(struct task_struct *task) +static inline void fpsimd_to_sve(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - unsigned int i; - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) return; - vq = sve_vq_from_vl(task->thread.sve_vl); - for (i = 0; i < 32; ++i) - memcpy(ZREG(sst, vq, i), &fst->vregs[i], - sizeof(fst->vregs[i])); + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); + __fpsimd_to_sve(sst, fst, vq); } /* @@ -387,36 +683,64 @@ static void fpsimd_to_sve(struct task_struct *task) * task->thread.uw.fpsimd_state. * * Task can be a non-runnable task, or current. In the latter case, - * softirqs (and preemption) must be disabled. + * the caller must have ownership of the cpu FPSIMD context before calling + * this function. * task->thread.sve_state must point to at least sve_state_size(task) * bytes of allocated kernel memory. * task->thread.sve_state must be up to date before calling this function. 
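[Editorial aside: a sketch of the layout conversion performed by __fpsimd_to_sve() above, with the kernel's ZREG()/SVE_SIG_* helpers replaced by explicit arithmetic. Each Z register occupies vq * 16 bytes in the SVE save area, and the corresponding 128-bit V register is its low 16 bytes. Endianness handling (arm64_cpu_to_le128) is omitted, and the toy_* names are assumptions for illustration only.]

#include <stdint.h>
#include <string.h>

#define TOY_NUM_ZREGS	32
#define TOY_VQ_BYTES	16	/* one quadword granule of a Z register */

void toy_fpsimd_to_sve(uint8_t *sve_area,
		       const uint8_t vregs[TOY_NUM_ZREGS][16],
		       unsigned int vq)
{
	unsigned int i;

	for (i = 0; i < TOY_NUM_ZREGS; i++) {
		uint8_t *zreg = sve_area + (size_t)i * vq * TOY_VQ_BYTES;

		memcpy(zreg, vregs[i], 16);	/* Vn -> low 128 bits of Zn */
	}
}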
*/ -static void sve_to_fpsimd(struct task_struct *task) +static inline void sve_to_fpsimd(struct task_struct *task) { - unsigned int vq; + unsigned int vq, vl; void const *sst = task->thread.sve_state; struct user_fpsimd_state *fst = &task->thread.uw.fpsimd_state; unsigned int i; + __uint128_t const *p; - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) return; - vq = sve_vq_from_vl(task->thread.sve_vl); - for (i = 0; i < 32; ++i) - memcpy(&fst->vregs[i], ZREG(sst, vq, i), - sizeof(fst->vregs[i])); + vl = thread_get_cur_vl(&task->thread); + vq = sve_vq_from_vl(vl); + for (i = 0; i < SVE_NUM_ZREGS; ++i) { + p = (__uint128_t const *)ZREG(sst, vq, i); + fst->vregs[i] = arm64_le128_to_cpu(*p); + } } -#ifdef CONFIG_ARM64_SVE +static inline void __fpsimd_zero_vregs(struct user_fpsimd_state *fpsimd) +{ + memset(&fpsimd->vregs, 0, sizeof(fpsimd->vregs)); +} /* - * Return how many bytes of memory are required to store the full SVE - * state for task, given task's currently configured vector length. + * Simulate the effects of an SMSTOP SM instruction. */ -size_t sve_state_size(struct task_struct const *task) +void task_smstop_sm(struct task_struct *task) +{ + if (!thread_sm_enabled(&task->thread)) + return; + + __fpsimd_zero_vregs(&task->thread.uw.fpsimd_state); + task->thread.uw.fpsimd_state.fpsr = 0x0800009f; + if (system_supports_fpmr()) + task->thread.uw.fpmr = 0; + + task->thread.svcr &= ~SVCR_SM_MASK; + task->thread.fp_type = FP_STATE_FPSIMD; +} + +void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) { - return SVE_SIG_REGS_SIZE(sve_vq_from_vl(task->thread.sve_vl)); + write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, + SYS_SCTLR_EL1); +} + +#ifdef CONFIG_ARM64_SVE +static void sve_free(struct task_struct *task) +{ + kfree(task->thread.sve_state); + task->thread.sve_state = NULL; } /* @@ -429,147 +753,154 @@ size_t sve_state_size(struct task_struct const *task) * do_sve_acc() case, there is no ABI requirement to hide stale data * written previously be task. */ -void sve_alloc(struct task_struct *task) +void sve_alloc(struct task_struct *task, bool flush) { if (task->thread.sve_state) { - memset(task->thread.sve_state, 0, sve_state_size(current)); + if (flush) + memset(task->thread.sve_state, 0, + sve_state_size(task)); return; } /* This is a small allocation (maximum ~8KB) and Should Not Fail. */ task->thread.sve_state = kzalloc(sve_state_size(task), GFP_KERNEL); - - /* - * If future SVE revisions can have larger vectors though, - * this may cease to be true: - */ - BUG_ON(!task->thread.sve_state); -} - - -/* - * Ensure that task->thread.sve_state is up to date with respect to - * the user task, irrespective of when SVE is in use or not. - * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void fpsimd_sync_to_sve(struct task_struct *task) -{ - if (!test_tsk_thread_flag(task, TIF_SVE)) - fpsimd_to_sve(task); } /* - * Ensure that task->thread.uw.fpsimd_state is up to date with respect to - * the user task, irrespective of whether SVE is in use or not. + * Ensure that task->thread.uw.fpsimd_state is up to date with respect to the + * task's currently effective FPSIMD/SVE state. * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. 
+ * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void sve_sync_to_fpsimd(struct task_struct *task) +void fpsimd_sync_from_effective_state(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_SVE)) + if (task->thread.fp_type == FP_STATE_SVE) sve_to_fpsimd(task); } /* - * Ensure that task->thread.sve_state is up to date with respect to - * the task->thread.uw.fpsimd_state. + * Ensure that the task's currently effective FPSIMD/SVE state is up to date + * with respect to task->thread.uw.fpsimd_state, zeroing any effective + * non-FPSIMD (S)SVE state. * - * This should only be called by ptrace to merge new FPSIMD register - * values into a task for which SVE is currently active. - * task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - * task->thread.uw.fpsimd_state must already have been initialised with - * the new FPSIMD register values to be merged in. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void sve_sync_from_fpsimd_zeropad(struct task_struct *task) +void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - unsigned int i; - if (!test_tsk_thread_flag(task, TIF_SVE)) + if (task->thread.fp_type != FP_STATE_SVE) return; - vq = sve_vq_from_vl(task->thread.sve_vl); + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); memset(sst, 0, SVE_SIG_REGS_SIZE(vq)); - - for (i = 0; i < 32; ++i) - memcpy(ZREG(sst, vq, i), &fst->vregs[i], - sizeof(fst->vregs[i])); + __fpsimd_to_sve(sst, fst, vq); } -int sve_set_vector_length(struct task_struct *task, - unsigned long vl, unsigned long flags) +static int change_live_vector_length(struct task_struct *task, + enum vec_type type, + unsigned long vl) { - if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT | - PR_SVE_SET_VL_ONEXEC)) - return -EINVAL; + unsigned int sve_vl = task_get_sve_vl(task); + unsigned int sme_vl = task_get_sme_vl(task); + void *sve_state = NULL, *sme_state = NULL; - if (!sve_vl_valid(vl)) - return -EINVAL; + if (type == ARM64_VEC_SME) + sme_vl = vl; + else + sve_vl = vl; /* - * Clamp to the maximum vector length that VL-agnostic SVE code can - * work with. A flag may be assigned in the future to allow setting - * of larger vector lengths without confusing older software. + * Allocate the new sve_state and sme_state before freeing the old + * copies so that allocation failure can be handled without needing to + * mutate the task's state in any way. + * + * Changes to the SVE vector length must not discard live ZA state or + * clear PSTATE.ZA, as userspace code which is unaware of the AAPCS64 + * ZA lazy saving scheme may attempt to change the SVE vector length + * while unsaved/dormant ZA state exists. 
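[Editorial aside: the comment above describes an allocate-before-mutate pattern: the replacement buffers are obtained first, so an allocation failure leaves the task's existing state untouched. A minimal C sketch of the same pattern follows; toy_ctx and toy_resize are made-up names, not kernel interfaces.]

#include <stdlib.h>

struct toy_ctx {
	void *state;
	size_t size;
};

int toy_resize(struct toy_ctx *ctx, size_t new_size)
{
	void *new_state = calloc(1, new_size);

	if (!new_state)
		return -1;	/* ctx is still fully valid */

	/* Point of no return: swap in the new buffer, then free the old. */
	free(ctx->state);
	ctx->state = new_state;
	ctx->size = new_size;
	return 0;
}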
*/ - if (vl > SVE_VL_ARCH_MAX) - vl = SVE_VL_ARCH_MAX; - - vl = find_supported_vector_length(vl); + sve_state = kzalloc(__sve_state_size(sve_vl, sme_vl), GFP_KERNEL); + if (!sve_state) + goto out_mem; + + if (type == ARM64_VEC_SME) { + sme_state = kzalloc(__sme_state_size(sme_vl), GFP_KERNEL); + if (!sme_state) + goto out_mem; + } - if (flags & (PR_SVE_VL_INHERIT | - PR_SVE_SET_VL_ONEXEC)) - task->thread.sve_vl_onexec = vl; + if (task == current) + fpsimd_save_and_flush_current_state(); else - /* Reset VL to system default on next exec: */ - task->thread.sve_vl_onexec = 0; - - /* Only actually set the VL if not deferred: */ - if (flags & PR_SVE_SET_VL_ONEXEC) - goto out; - - if (vl == task->thread.sve_vl) - goto out; + fpsimd_flush_task_state(task); /* - * To ensure the FPSIMD bits of the SVE vector registers are preserved, - * write any live register state back to task_struct, and convert to a - * non-SVE thread. + * Always preserve PSTATE.SM and the effective FPSIMD state, zeroing + * other SVE state. */ - if (task == current) { - local_bh_disable(); + fpsimd_sync_from_effective_state(task); + task_set_vl(task, type, vl); + kfree(task->thread.sve_state); + task->thread.sve_state = sve_state; + fpsimd_sync_to_effective_state_zeropad(task); - fpsimd_save(); - set_thread_flag(TIF_FOREIGN_FPSTATE); + if (type == ARM64_VEC_SME) { + task->thread.svcr &= ~SVCR_ZA_MASK; + kfree(task->thread.sme_state); + task->thread.sme_state = sme_state; } - fpsimd_flush_task_state(task); - if (test_and_clear_tsk_thread_flag(task, TIF_SVE)) - sve_to_fpsimd(task); + return 0; - if (task == current) - local_bh_enable(); +out_mem: + kfree(sve_state); + kfree(sme_state); + return -ENOMEM; +} + +int vec_set_vector_length(struct task_struct *task, enum vec_type type, + unsigned long vl, unsigned long flags) +{ + bool onexec = flags & PR_SVE_SET_VL_ONEXEC; + bool inherit = flags & PR_SVE_VL_INHERIT; + + if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT | + PR_SVE_SET_VL_ONEXEC)) + return -EINVAL; + + if (!sve_vl_valid(vl)) + return -EINVAL; /* - * Force reallocation of task SVE state to the correct size - * on next use: + * Clamp to the maximum vector length that VL-agnostic code + * can work with. A flag may be assigned in the future to + * allow setting of larger vector lengths without confusing + * older software. */ - sve_free(task); + if (vl > VL_ARCH_MAX) + vl = VL_ARCH_MAX; - task->thread.sve_vl = vl; + vl = find_supported_vector_length(type, vl); + + if (!onexec && vl != task_get_vl(task, type)) { + if (change_live_vector_length(task, type, vl)) + return -ENOMEM; + } -out: - update_tsk_thread_flag(task, TIF_SVE_VL_INHERIT, + if (onexec || inherit) + task_set_vl_onexec(task, type, vl); + else + /* Reset VL to system default on next exec: */ + task_set_vl_onexec(task, type, 0); + + update_tsk_thread_flag(task, vec_vl_inherit_flag(type), flags & PR_SVE_VL_INHERIT); return 0; @@ -577,20 +908,21 @@ out: /* * Encode the current vector length and flags for return. - * This is only required for prctl(): ptrace has separate fields + * This is only required for prctl(): ptrace has separate fields. + * SVE and SME use the same bits for _ONEXEC and _INHERIT. * - * flags are as for sve_set_vector_length(). + * flags are as for vec_set_vector_length(). 
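[Editorial aside: the encoding described above is what userspace sees through prctl(PR_SVE_SET_VL)/prctl(PR_SVE_GET_VL). A minimal userspace sketch follows; the fallback constants mirror include/uapi/linux/prctl.h, the 32-byte request is an arbitrary example value, and the kernel will clamp it to a supported vector length.]

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SVE_SET_VL
#define PR_SVE_SET_VL		50
#define PR_SVE_GET_VL		51
#define PR_SVE_VL_LEN_MASK	0xffff
#define PR_SVE_VL_INHERIT	(1 << 17)
#endif

int main(void)
{
	/* Ask for a 32-byte (256-bit) VL, inherited by child processes. */
	int ret = prctl(PR_SVE_SET_VL, 32 | PR_SVE_VL_INHERIT, 0, 0, 0);

	if (ret < 0) {
		perror("PR_SVE_SET_VL");	/* e.g. no SVE support */
		return 1;
	}

	/* The return value encodes the VL actually granted plus flag bits. */
	printf("granted VL: %d bytes\n", ret & PR_SVE_VL_LEN_MASK);
	return 0;
}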
*/ -static int sve_prctl_status(unsigned long flags) +static int vec_prctl_status(enum vec_type type, unsigned long flags) { int ret; if (flags & PR_SVE_SET_VL_ONEXEC) - ret = current->thread.sve_vl_onexec; + ret = task_get_vl_onexec(current, type); else - ret = current->thread.sve_vl; + ret = task_get_vl(current, type); - if (test_thread_flag(TIF_SVE_VL_INHERIT)) + if (test_thread_flag(vec_vl_inherit_flag(type))) ret |= PR_SVE_VL_INHERIT; return ret; @@ -605,145 +937,208 @@ int sve_set_current_vl(unsigned long arg) vl = arg & PR_SVE_VL_LEN_MASK; flags = arg & ~vl; - if (!system_supports_sve()) + if (!system_supports_sve() || is_compat_task()) return -EINVAL; - ret = sve_set_vector_length(current, vl, flags); + ret = vec_set_vector_length(current, ARM64_VEC_SVE, vl, flags); if (ret) return ret; - return sve_prctl_status(flags); + return vec_prctl_status(ARM64_VEC_SVE, flags); } /* PR_SVE_GET_VL */ int sve_get_current_vl(void) { - if (!system_supports_sve()) + if (!system_supports_sve() || is_compat_task()) return -EINVAL; - return sve_prctl_status(0); + return vec_prctl_status(ARM64_VEC_SVE, 0); } -/* - * Bitmap for temporary storage of the per-CPU set of supported vector lengths - * during secondary boot. - */ -static DECLARE_BITMAP(sve_secondary_vq_map, SVE_VQ_MAX); +#ifdef CONFIG_ARM64_SME +/* PR_SME_SET_VL */ +int sme_set_current_vl(unsigned long arg) +{ + unsigned long vl, flags; + int ret; + + vl = arg & PR_SME_VL_LEN_MASK; + flags = arg & ~vl; -static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + ret = vec_set_vector_length(current, ARM64_VEC_SME, vl, flags); + if (ret) + return ret; + + return vec_prctl_status(ARM64_VEC_SME, flags); +} + +/* PR_SME_GET_VL */ +int sme_get_current_vl(void) +{ + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + return vec_prctl_status(ARM64_VEC_SME, 0); +} +#endif /* CONFIG_ARM64_SME */ + +static void vec_probe_vqs(struct vl_info *info, + DECLARE_BITMAP(map, SVE_VQ_MAX)) { unsigned int vq, vl; - unsigned long zcr; bitmap_zero(map, SVE_VQ_MAX); - zcr = ZCR_ELx_LEN_MASK; - zcr = read_sysreg_s(SYS_ZCR_EL1) & ~zcr; - for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) { - write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1); /* self-syncing */ - vl = sve_get_vl(); + write_vl(info->type, vq - 1); /* self-syncing */ + + switch (info->type) { + case ARM64_VEC_SVE: + vl = sve_get_vl(); + break; + case ARM64_VEC_SME: + vl = sme_get_vl(); + break; + default: + vl = 0; + break; + } + + /* Minimum VL identified? */ + if (sve_vq_from_vl(vl) > vq) + break; + vq = sve_vq_from_vl(vl); /* skip intervening lengths */ - set_bit(vq_to_bit(vq), map); + set_bit(__vq_to_bit(vq), map); } } -void __init sve_init_vq_map(void) +/* + * Initialise the set of known supported VQs for the boot CPU. + * This is called during kernel boot, before secondary CPUs are brought up. + */ +void __init vec_init_vq_map(enum vec_type type) { - sve_probe_vqs(sve_vq_map); + struct vl_info *info = &vl_info[type]; + vec_probe_vqs(info, info->vq_map); + bitmap_copy(info->vq_partial_map, info->vq_map, SVE_VQ_MAX); } /* * If we haven't committed to the set of supported VQs yet, filter out * those not supported by the current CPU. + * This function is called during the bring-up of early secondary CPUs only. 
*/ -void sve_update_vq_map(void) +void vec_update_vq_map(enum vec_type type) { - sve_probe_vqs(sve_secondary_vq_map); - bitmap_and(sve_vq_map, sve_vq_map, sve_secondary_vq_map, SVE_VQ_MAX); + struct vl_info *info = &vl_info[type]; + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + + vec_probe_vqs(info, tmp_map); + bitmap_and(info->vq_map, info->vq_map, tmp_map, SVE_VQ_MAX); + bitmap_or(info->vq_partial_map, info->vq_partial_map, tmp_map, + SVE_VQ_MAX); } -/* Check whether the current CPU supports all VQs in the committed set */ -int sve_verify_vq_map(void) +/* + * Check whether the current CPU supports all VQs in the committed set. + * This function is called during the bring-up of late secondary CPUs only. + */ +int vec_verify_vq_map(enum vec_type type) { - int ret = 0; + struct vl_info *info = &vl_info[type]; + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + unsigned long b; - sve_probe_vqs(sve_secondary_vq_map); - bitmap_andnot(sve_secondary_vq_map, sve_vq_map, sve_secondary_vq_map, - SVE_VQ_MAX); - if (!bitmap_empty(sve_secondary_vq_map, SVE_VQ_MAX)) { - pr_warn("SVE: cpu%d: Required vector length(s) missing\n", - smp_processor_id()); - ret = -EINVAL; + vec_probe_vqs(info, tmp_map); + + bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX); + if (bitmap_intersects(tmp_map, info->vq_map, SVE_VQ_MAX)) { + pr_warn("%s: cpu%d: Required vector length(s) missing\n", + info->name, smp_processor_id()); + return -EINVAL; } - return ret; + if (!IS_ENABLED(CONFIG_KVM) || !is_hyp_mode_available()) + return 0; + + /* + * For KVM, it is necessary to ensure that this CPU doesn't + * support any vector length that guests may have probed as + * unsupported. + */ + + /* Recover the set of supported VQs: */ + bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX); + /* Find VQs supported that are not globally supported: */ + bitmap_andnot(tmp_map, tmp_map, info->vq_map, SVE_VQ_MAX); + + /* Find the lowest such VQ, if any: */ + b = find_last_bit(tmp_map, SVE_VQ_MAX); + if (b >= SVE_VQ_MAX) + return 0; /* no mismatches */ + + /* + * Mismatches above sve_max_virtualisable_vl are fine, since + * no guest is allowed to configure ZCR_EL2.LEN to exceed this: + */ + if (sve_vl_from_vq(__bit_to_vq(b)) <= info->max_virtualisable_vl) { + pr_warn("%s: cpu%d: Unsupported vector length(s) present\n", + info->name, smp_processor_id()); + return -EINVAL; + } + + return 0; } static void __init sve_efi_setup(void) { + int max_vl = 0; + int i; + if (!IS_ENABLED(CONFIG_EFI)) return; + for (i = 0; i < ARRAY_SIZE(vl_info); i++) + max_vl = max(vl_info[i].max_vl, max_vl); + /* * alloc_percpu() warns and prints a backtrace if this goes wrong. * This is evidence of a crippled system and we are returning void, * so no attempt is made to handle this situation here. */ - if (!sve_vl_valid(sve_max_vl)) + if (!sve_vl_valid(max_vl)) goto fail; - efi_sve_state = __alloc_percpu( - SVE_SIG_REGS_SIZE(sve_vq_from_vl(sve_max_vl)), SVE_VQ_BYTES); + efi_sve_state = kmalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), + GFP_KERNEL); if (!efi_sve_state) goto fail; return; fail: - panic("Cannot allocate percpu memory for EFI SVE save/restore"); + panic("Cannot allocate memory for EFI SVE save/restore"); } -/* - * Enable SVE for EL1. - * Intended for use by the cpufeatures code during CPU boot. 
- */ -void sve_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p) +void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p) { write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1); isb(); -} - -/* - * Read the pseudo-ZCR used by cpufeatures to identify the supported SVE - * vector length. - * - * Use only if SVE is present. - * This function clobbers the SVE vector length. - */ -u64 read_zcr_features(void) -{ - u64 zcr; - unsigned int vq_max; - - /* - * Set the maximum possible VL, and write zeroes to all other - * bits to see if they stick. - */ - sve_kernel_enable(NULL); - write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL1); - zcr = read_sysreg_s(SYS_ZCR_EL1); - zcr &= ~(u64)ZCR_ELx_LEN_MASK; /* find sticky 1s outside LEN field */ - vq_max = sve_vq_from_vl(sve_get_vl()); - zcr |= vq_max - 1; /* set LEN field to maximum effective value */ - - return zcr; + write_sysreg_s(0, SYS_ZCR_EL1); } void __init sve_setup(void) { - u64 zcr; + struct vl_info *info = &vl_info[ARM64_VEC_SVE]; + DECLARE_BITMAP(tmp_map, SVE_VQ_MAX); + unsigned long b; + int max_bit; if (!system_supports_sve()) return; @@ -753,29 +1148,43 @@ void __init sve_setup(void) * so sve_vq_map must have at least SVE_VQ_MIN set. * If something went wrong, at least try to patch it up: */ - if (WARN_ON(!test_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map))) - set_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map); + if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map))) + set_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map); - zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); - sve_max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1); - - /* - * Sanity-check that the max VL we determined through CPU features - * corresponds properly to sve_vq_map. If not, do our best: - */ - if (WARN_ON(sve_max_vl != find_supported_vector_length(sve_max_vl))) - sve_max_vl = find_supported_vector_length(sve_max_vl); + max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX); + info->max_vl = sve_vl_from_vq(__bit_to_vq(max_bit)); /* * For the default VL, pick the maximum supported value <= 64. * VL == 64 is guaranteed not to grow the signal frame. */ - sve_default_vl = find_supported_vector_length(64); + set_sve_default_vl(find_supported_vector_length(ARM64_VEC_SVE, 64)); - pr_info("SVE: maximum available vector length %u bytes per vector\n", - sve_max_vl); - pr_info("SVE: default vector length %u bytes per vector\n", - sve_default_vl); + bitmap_andnot(tmp_map, info->vq_partial_map, info->vq_map, + SVE_VQ_MAX); + + b = find_last_bit(tmp_map, SVE_VQ_MAX); + if (b >= SVE_VQ_MAX) + /* No non-virtualisable VLs found */ + info->max_virtualisable_vl = SVE_VQ_MAX; + else if (WARN_ON(b == SVE_VQ_MAX - 1)) + /* No virtualisable VLs? This is architecturally forbidden. */ + info->max_virtualisable_vl = SVE_VQ_MIN; + else /* b + 1 < SVE_VQ_MAX */ + info->max_virtualisable_vl = sve_vl_from_vq(__bit_to_vq(b + 1)); + + if (info->max_virtualisable_vl > info->max_vl) + info->max_virtualisable_vl = info->max_vl; + + pr_info("%s: maximum available vector length %u bytes per vector\n", + info->name, info->max_vl); + pr_info("%s: default vector length %u bytes per vector\n", + info->name, get_sve_default_vl()); + + /* KVM decides whether to support mismatched systems. 
Just warn here: */ + if (sve_max_virtualisable_vl() < sve_max_vl()) + pr_warn("%s: unvirtualisable vector lengths present\n", + info->name); sve_efi_setup(); } @@ -786,61 +1195,284 @@ void __init sve_setup(void) */ void fpsimd_release_task(struct task_struct *dead_task) { - __sve_free(dead_task); + sve_free(dead_task); + sme_free(dead_task); } #endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +/* + * Ensure that task->thread.sme_state is allocated and sufficiently large. + * + * This function should be used only in preparation for replacing + * task->thread.sme_state with new data. The memory is always zeroed + * here to prevent stale data from showing through: this is done in + * the interest of testability and predictability, the architecture + * guarantees that when ZA is enabled it will be zeroed. + */ +void sme_alloc(struct task_struct *task, bool flush) +{ + if (task->thread.sme_state) { + if (flush) + memset(task->thread.sme_state, 0, + sme_state_size(task)); + return; + } + + /* This could potentially be up to 64K. */ + task->thread.sme_state = + kzalloc(sme_state_size(task), GFP_KERNEL); +} + +static void sme_free(struct task_struct *task) +{ + kfree(task->thread.sme_state); + task->thread.sme_state = NULL; +} + +void cpu_enable_sme(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* Set priority for all PEs to architecturally defined minimum */ + write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK, + SYS_SMPRI_EL1); + + /* Allow SME in kernel */ + write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1); + isb(); + + /* Ensure all bits in SMCR are set to known values */ + write_sysreg_s(0, SYS_SMCR_EL1); + + /* Allow EL0 to access TPIDR2 */ + write_sysreg(read_sysreg(SCTLR_EL1) | SCTLR_ELx_ENTP2, SCTLR_EL1); + isb(); +} + +void cpu_enable_sme2(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* This must be enabled after SME */ + BUILD_BUG_ON(ARM64_SME2 <= ARM64_SME); + + /* Allow use of ZT0 */ + write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_EZT0_MASK, + SYS_SMCR_EL1); +} + +void cpu_enable_fa64(const struct arm64_cpu_capabilities *__always_unused p) +{ + /* This must be enabled after SME */ + BUILD_BUG_ON(ARM64_SME_FA64 <= ARM64_SME); + + /* Allow use of FA64 */ + write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK, + SYS_SMCR_EL1); +} + +void __init sme_setup(void) +{ + struct vl_info *info = &vl_info[ARM64_VEC_SME]; + int min_bit, max_bit; + + if (!system_supports_sme()) + return; + + min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); + + /* + * SME doesn't require any particular vector length be + * supported but it does require at least one. We should have + * disabled the feature entirely while bringing up CPUs but + * let's double check here. The bitmap is SVE_VQ_MAP sized for + * sharing with SVE. + */ + WARN_ON(min_bit >= SVE_VQ_MAX); + + info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit)); + + max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX); + info->max_vl = sve_vl_from_vq(__bit_to_vq(max_bit)); + + WARN_ON(info->min_vl > info->max_vl); + + /* + * For the default VL, pick the maximum supported value <= 32 + * (256 bits) if there is one since this is guaranteed not to + * grow the signal frame when in streaming mode, otherwise the + * minimum available VL will be used. 
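Editor's note: the code above and below mixes VL (bytes) and VQ (128-bit quadwords), so here is a quick sketch of the unit conversions it relies on; the real macros come from the uapi SVE headers. The "<= 32" in the comment above therefore means vectors of at most 256 bits, i.e. VQ <= 2:

#define SVE_VQ_BYTES		16	/* one quadword, 128 bits */
#define sve_vl_from_vq(vq)	((vq) * SVE_VQ_BYTES)
#define sve_vq_from_vl(vl)	((vl) / SVE_VQ_BYTES)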
+ */ + set_sme_default_vl(find_supported_vector_length(ARM64_VEC_SME, 32)); + + pr_info("SME: minimum available vector length %u bytes per vector\n", + info->min_vl); + pr_info("SME: maximum available vector length %u bytes per vector\n", + info->max_vl); + pr_info("SME: default vector length %u bytes per vector\n", + get_sme_default_vl()); +} + +void sme_suspend_exit(void) +{ + u64 smcr = 0; + + if (!system_supports_sme()) + return; + + if (system_supports_fa64()) + smcr |= SMCR_ELx_FA64; + if (system_supports_sme2()) + smcr |= SMCR_ELx_EZT0; + + write_sysreg_s(smcr, SYS_SMCR_EL1); + write_sysreg_s(0, SYS_SMPRI_EL1); +} + +#endif /* CONFIG_ARM64_SME */ + +static void sve_init_regs(void) +{ + /* + * Convert the FPSIMD state to SVE, zeroing all the state that + * is not shared with FPSIMD. If (as is likely) the current + * state is live in the registers then do this there and + * update our metadata for the current task including + * disabling the trap, otherwise update our in-memory copy. + * We are guaranteed to not be in streaming mode, we can only + * take a SVE trap when not in streaming mode and we can't be + * in streaming mode when taking a SME trap. + */ + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { + unsigned long vq_minus_one = + sve_vq_from_vl(task_get_sve_vl(current)) - 1; + sve_set_vq(vq_minus_one); + sve_flush_live(true, vq_minus_one); + fpsimd_bind_task_to_cpu(); + } else { + fpsimd_to_sve(current); + current->thread.fp_type = FP_STATE_SVE; + fpsimd_flush_task_state(current); + } +} + /* * Trapped SVE access * * Storage is allocated for the full SVE state, the current FPSIMD - * register contents are migrated across, and TIF_SVE is set so that - * the SVE access trap will be disabled the next time this task - * reaches ret_to_user. + * register contents are migrated across, and the access trap is + * disabled. * - * TIF_SVE should be clear on entry: otherwise, task_fpsimd_load() + * TIF_SVE should be clear on entry: otherwise, fpsimd_restore_current_state() * would have disabled the SVE access trap for userspace during * ret_to_user, making an SVE access trap impossible in that case. */ -asmlinkage void do_sve_acc(unsigned int esr, struct pt_regs *regs) +void do_sve_acc(unsigned long esr, struct pt_regs *regs) { /* Even if we chose not to use SVE, the hardware could still trap: */ if (unlikely(!system_supports_sve()) || WARN_ON(is_compat_task())) { - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } - sve_alloc(current); - - local_bh_disable(); - - fpsimd_save(); - fpsimd_to_sve(current); + sve_alloc(current, true); + if (!current->thread.sve_state) { + force_sig(SIGKILL); + return; + } - /* Force ret_to_user to reload the registers: */ - fpsimd_flush_task_state(current); - set_thread_flag(TIF_FOREIGN_FPSTATE); + get_cpu_fpsimd_context(); if (test_and_set_thread_flag(TIF_SVE)) WARN_ON(1); /* SVE access shouldn't have trapped */ - local_bh_enable(); + /* + * Even if the task can have used streaming mode we can only + * generate SVE access traps in normal SVE mode and + * transitioning out of streaming mode may discard any + * streaming mode state. Always clear the high bits to avoid + * any potential errors tracking what is properly initialised. 
+ */ + sve_init_regs(); + + put_cpu_fpsimd_context(); +} + +/* + * Trapped SME access + * + * Storage is allocated for the full SVE and SME state, the current + * FPSIMD register contents are migrated to SVE if SVE is not already + * active, and the access trap is disabled. + * + * TIF_SME should be clear on entry: otherwise, fpsimd_restore_current_state() + * would have disabled the SME access trap for userspace during + * ret_to_user, making an SME access trap impossible in that case. + */ +void do_sme_acc(unsigned long esr, struct pt_regs *regs) +{ + /* Even if we chose not to use SME, the hardware could still trap: */ + if (unlikely(!system_supports_sme()) || WARN_ON(is_compat_task())) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + /* + * If this not a trap due to SME being disabled then something + * is being used in the wrong mode, report as SIGILL. + */ + if (ESR_ELx_SME_ISS_SMTC(esr) != ESR_ELx_SME_ISS_SMTC_SME_DISABLED) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + sve_alloc(current, false); + sme_alloc(current, true); + if (!current->thread.sve_state || !current->thread.sme_state) { + force_sig(SIGKILL); + return; + } + + get_cpu_fpsimd_context(); + + /* With TIF_SME userspace shouldn't generate any traps */ + if (test_and_set_thread_flag(TIF_SME)) + WARN_ON(1); + + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { + unsigned long vq_minus_one = + sve_vq_from_vl(task_get_sme_vl(current)) - 1; + sme_set_vq(vq_minus_one); + + fpsimd_bind_task_to_cpu(); + } else { + fpsimd_flush_task_state(current); + } + + put_cpu_fpsimd_context(); } /* * Trapped FP/ASIMD access. */ -asmlinkage void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) +void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs) { - /* TODO: implement lazy context saving/restoring */ - WARN_ON(1); + /* Even if we chose not to use FPSIMD, the hardware could still trap: */ + if (!system_supports_fpsimd()) { + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); + return; + } + + /* + * When FPSIMD is enabled, we should never take a trap unless something + * has gone very wrong. + */ + BUG(); } /* * Raise a SIGFPE for the current process. */ -asmlinkage void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) +void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs) { unsigned int si_code = FPE_FLTUNK; @@ -862,6 +1494,57 @@ asmlinkage void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) current); } +static void fpsimd_load_kernel_state(struct task_struct *task) +{ + struct cpu_fp_state *last = this_cpu_ptr(&fpsimd_last_state); + + /* + * Elide the load if this CPU holds the most recent kernel mode + * FPSIMD context of the current task. + */ + if (last->st == task->thread.kernel_fpsimd_state && + task->thread.kernel_fpsimd_cpu == smp_processor_id()) + return; + + fpsimd_load_state(task->thread.kernel_fpsimd_state); +} + +static void fpsimd_save_kernel_state(struct task_struct *task) +{ + struct cpu_fp_state cpu_fp_state = { + .st = task->thread.kernel_fpsimd_state, + .to_save = FP_STATE_FPSIMD, + }; + + BUG_ON(!cpu_fp_state.st); + + fpsimd_save_state(task->thread.kernel_fpsimd_state); + fpsimd_bind_state_to_cpu(&cpu_fp_state); + + task->thread.kernel_fpsimd_cpu = smp_processor_id(); +} + +/* + * Invalidate any task's FPSIMD state that is present on this cpu. + * The FPSIMD context should be acquired with get_cpu_fpsimd_context() + * before calling this function. 
+ */ +static void fpsimd_flush_cpu_state(void) +{ + WARN_ON(!system_supports_fpsimd()); + __this_cpu_write(fpsimd_last_state.st, NULL); + + /* + * Leaving streaming mode enabled will cause issues for any kernel + * NEON and leaving streaming mode or ZA enabled may increase power + * consumption. + */ + if (system_supports_sme()) + sme_smstop(); + + set_thread_flag(TIF_FOREIGN_FPSTATE); +} + void fpsimd_thread_switch(struct task_struct *next) { bool wrong_task, wrong_cpu; @@ -869,73 +1552,111 @@ void fpsimd_thread_switch(struct task_struct *next) if (!system_supports_fpsimd()) return; + WARN_ON_ONCE(!irqs_disabled()); + /* Save unsaved fpsimd state, if any: */ - fpsimd_save(); + if (test_thread_flag(TIF_KERNEL_FPSTATE)) + fpsimd_save_kernel_state(current); + else + fpsimd_save_user_state(); + + if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) { + fpsimd_flush_cpu_state(); + fpsimd_load_kernel_state(next); + } else { + /* + * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's + * state. For kernel threads, FPSIMD registers are never + * loaded with user mode FPSIMD state and so wrong_task and + * wrong_cpu will always be true. + */ + wrong_task = __this_cpu_read(fpsimd_last_state.st) != + &next->thread.uw.fpsimd_state; + wrong_cpu = next->thread.fpsimd_cpu != smp_processor_id(); + + update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, + wrong_task || wrong_cpu); + } +} + +static void fpsimd_flush_thread_vl(enum vec_type type) +{ + int vl, supported_vl; /* - * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's - * state. For kernel threads, FPSIMD registers are never loaded - * and wrong_task and wrong_cpu will always be true. + * Reset the task vector length as required. This is where we + * ensure that all user tasks have a valid vector length + * configured: no kernel task can become a user task without + * an exec and hence a call to this function. By the time the + * first call to this function is made, all early hardware + * probing is complete, so __sve_default_vl should be valid. + * If a bug causes this to go wrong, we make some noise and + * try to fudge thread.sve_vl to a safe value here. */ - wrong_task = __this_cpu_read(fpsimd_last_state.st) != - &next->thread.uw.fpsimd_state; - wrong_cpu = next->thread.fpsimd_cpu != smp_processor_id(); + vl = task_get_vl_onexec(current, type); + if (!vl) + vl = get_default_vl(type); + + if (WARN_ON(!sve_vl_valid(vl))) + vl = vl_info[type].min_vl; + + supported_vl = find_supported_vector_length(type, vl); + if (WARN_ON(supported_vl != vl)) + vl = supported_vl; - update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, - wrong_task || wrong_cpu); + task_set_vl(current, type, vl); + + /* + * If the task is not set to inherit, ensure that the vector + * length will be reset by a subsequent exec: + */ + if (!test_thread_flag(vec_vl_inherit_flag(type))) + task_set_vl_onexec(current, type, 0); } void fpsimd_flush_thread(void) { - int vl, supported_vl; + void *sve_state = NULL; + void *sme_state = NULL; if (!system_supports_fpsimd()) return; - local_bh_disable(); + get_cpu_fpsimd_context(); + fpsimd_flush_task_state(current); memset(¤t->thread.uw.fpsimd_state, 0, sizeof(current->thread.uw.fpsimd_state)); - fpsimd_flush_task_state(current); if (system_supports_sve()) { clear_thread_flag(TIF_SVE); - sve_free(current); - /* - * Reset the task vector length as required. - * This is where we ensure that all user tasks have a valid - * vector length configured: no kernel task can become a user - * task without an exec and hence a call to this function. 
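Editor's note: the wrong_task/wrong_cpu test in fpsimd_thread_switch() above is the heart of the lazy save/restore scheme. An illustrative helper (regs_hold_current_user_state() is invented here, not part of the patch) that states the invariant positively:

/*
 * Sketch only: the CPU's FP/SIMD registers hold current's user state
 * iff this CPU's "last state" pointer refers to current's in-memory
 * state and current last loaded its FP state on this CPU.
 * TIF_FOREIGN_FPSTATE caches the negation of this predicate.
 */
static bool regs_hold_current_user_state(void)
{
	return __this_cpu_read(fpsimd_last_state.st) ==
			&current->thread.uw.fpsimd_state &&
	       current->thread.fpsimd_cpu == smp_processor_id();
}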
- * By the time the first call to this function is made, all - * early hardware probing is complete, so sve_default_vl - * should be valid. - * If a bug causes this to go wrong, we make some noise and - * try to fudge thread.sve_vl to a safe value here. - */ - vl = current->thread.sve_vl_onexec ? - current->thread.sve_vl_onexec : sve_default_vl; + /* Defer kfree() while in atomic context */ + sve_state = current->thread.sve_state; + current->thread.sve_state = NULL; - if (WARN_ON(!sve_vl_valid(vl))) - vl = SVE_VL_MIN; + fpsimd_flush_thread_vl(ARM64_VEC_SVE); + } - supported_vl = find_supported_vector_length(vl); - if (WARN_ON(supported_vl != vl)) - vl = supported_vl; + if (system_supports_sme()) { + clear_thread_flag(TIF_SME); - current->thread.sve_vl = vl; + /* Defer kfree() while in atomic context */ + sme_state = current->thread.sme_state; + current->thread.sme_state = NULL; - /* - * If the task is not set to inherit, ensure that the vector - * length will be reset by a subsequent exec: - */ - if (!test_thread_flag(TIF_SVE_VL_INHERIT)) - current->thread.sve_vl_onexec = 0; + fpsimd_flush_thread_vl(ARM64_VEC_SME); + current->thread.svcr = 0; } - set_thread_flag(TIF_FOREIGN_FPSTATE); + if (system_supports_fpmr()) + current->thread.uw.fpmr = 0; + + current->thread.fp_type = FP_STATE_FPSIMD; - local_bh_enable(); + put_cpu_fpsimd_context(); + kfree(sve_state); + kfree(sme_state); } /* @@ -947,118 +1668,170 @@ void fpsimd_preserve_current_state(void) if (!system_supports_fpsimd()) return; - local_bh_disable(); - fpsimd_save(); - local_bh_enable(); -} - -/* - * Like fpsimd_preserve_current_state(), but ensure that - * current->thread.uw.fpsimd_state is updated so that it can be copied to - * the signal frame. - */ -void fpsimd_signal_preserve_current_state(void) -{ - fpsimd_preserve_current_state(); - if (system_supports_sve() && test_thread_flag(TIF_SVE)) - sve_to_fpsimd(current); + get_cpu_fpsimd_context(); + fpsimd_save_user_state(); + put_cpu_fpsimd_context(); } /* * Associate current's FPSIMD context with this cpu - * Preemption must be disabled when calling this function. + * The caller must have ownership of the cpu FPSIMD context before calling + * this function. */ -void fpsimd_bind_task_to_cpu(void) +static void fpsimd_bind_task_to_cpu(void) { - struct fpsimd_last_state_struct *last = - this_cpu_ptr(&fpsimd_last_state); + struct cpu_fp_state *last = this_cpu_ptr(&fpsimd_last_state); + WARN_ON(!system_supports_fpsimd()); last->st = ¤t->thread.uw.fpsimd_state; + last->sve_state = current->thread.sve_state; + last->sme_state = current->thread.sme_state; + last->sve_vl = task_get_sve_vl(current); + last->sme_vl = task_get_sme_vl(current); + last->svcr = ¤t->thread.svcr; + last->fpmr = ¤t->thread.uw.fpmr; + last->fp_type = ¤t->thread.fp_type; + last->to_save = FP_STATE_CURRENT; current->thread.fpsimd_cpu = smp_processor_id(); + /* + * Toggle SVE and SME trapping for userspace if needed, these + * are serialsied by ret_to_user(). 
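Editor's note: the assignments in fpsimd_bind_task_to_cpu() above show everything the per-CPU fpsimd_last_state record now tracks. Reconstructed from those assignments as a reading aid (the authoritative definition lives in asm/fpsimd.h; field order and exact types here are approximate):

/* Approximate sketch, inferred from the assignments above. */
struct cpu_fp_state {
	struct user_fpsimd_state *st;	/* user FPSIMD regs (shared with SVE low bits) */
	void *sve_state;		/* SVE Z/P/FFR storage, may be NULL */
	void *sme_state;		/* SME ZA/ZT storage, may be NULL */
	u64 *svcr;			/* streaming mode / ZA enable bits */
	u64 *fpmr;
	unsigned int sve_vl;
	unsigned int sme_vl;
	enum fp_type *fp_type;		/* what the registers currently hold */
	enum fp_type to_save;		/* what fpsimd_save_user_state() should write back */
};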
+ */ + if (system_supports_sme()) { + if (test_thread_flag(TIF_SME)) + sme_user_enable(); + else + sme_user_disable(); + } + if (system_supports_sve()) { - /* Toggle SVE trapping for userspace if needed */ if (test_thread_flag(TIF_SVE)) sve_user_enable(); else sve_user_disable(); - - /* Serialised by exception return to user */ } } -void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st) +void fpsimd_bind_state_to_cpu(struct cpu_fp_state *state) { - struct fpsimd_last_state_struct *last = - this_cpu_ptr(&fpsimd_last_state); + struct cpu_fp_state *last = this_cpu_ptr(&fpsimd_last_state); + WARN_ON(!system_supports_fpsimd()); WARN_ON(!in_softirq() && !irqs_disabled()); - last->st = st; + *last = *state; } /* * Load the userland FPSIMD state of 'current' from memory, but only if the * FPSIMD state already held in the registers is /not/ the most recent FPSIMD - * state of 'current' + * state of 'current'. This is called when we are preparing to return to + * userspace to ensure that userspace sees a good register state. */ void fpsimd_restore_current_state(void) { - if (!system_supports_fpsimd()) + /* + * TIF_FOREIGN_FPSTATE is set on the init task and copied by + * arch_dup_task_struct() regardless of whether FP/SIMD is detected. + * Thus user threads can have this set even when FP/SIMD hasn't been + * detected. + * + * When FP/SIMD is detected, begin_new_exec() will set + * TIF_FOREIGN_FPSTATE via flush_thread() -> fpsimd_flush_thread(), + * and fpsimd_thread_switch() will set TIF_FOREIGN_FPSTATE when + * switching tasks. We detect FP/SIMD before we exec the first user + * process, ensuring this has TIF_FOREIGN_FPSTATE set and + * do_notify_resume() will call fpsimd_restore_current_state() to + * install the user FP/SIMD context. + * + * When FP/SIMD is not detected, nothing else will clear or set + * TIF_FOREIGN_FPSTATE prior to the first return to userspace, and + * we must clear TIF_FOREIGN_FPSTATE to avoid do_notify_resume() + * looping forever calling fpsimd_restore_current_state(). + */ + if (!system_supports_fpsimd()) { + clear_thread_flag(TIF_FOREIGN_FPSTATE); return; + } - local_bh_disable(); + get_cpu_fpsimd_context(); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { task_fpsimd_load(); fpsimd_bind_task_to_cpu(); } - local_bh_enable(); + put_cpu_fpsimd_context(); } -/* - * Load an updated userland FPSIMD state for 'current' from memory and set the - * flag that indicates that the FPSIMD register contents are the most recent - * FPSIMD state of 'current' - */ void fpsimd_update_current_state(struct user_fpsimd_state const *state) { - if (!system_supports_fpsimd()) + if (WARN_ON(!system_supports_fpsimd())) return; - local_bh_disable(); - current->thread.uw.fpsimd_state = *state; - if (system_supports_sve() && test_thread_flag(TIF_SVE)) + if (current->thread.fp_type == FP_STATE_SVE) fpsimd_to_sve(current); - - task_fpsimd_load(); - fpsimd_bind_task_to_cpu(); - - clear_thread_flag(TIF_FOREIGN_FPSTATE); - - local_bh_enable(); } /* * Invalidate live CPU copies of task t's FPSIMD state + * + * This function may be called with preemption enabled. The barrier() + * ensures that the assignment to fpsimd_cpu is visible to any + * preemption/softirq that could race with set_tsk_thread_flag(), so + * that TIF_FOREIGN_FPSTATE cannot be spuriously re-cleared. + * + * The final barrier ensures that TIF_FOREIGN_FPSTATE is seen set by any + * subsequent code. 
*/ void fpsimd_flush_task_state(struct task_struct *t) { t->thread.fpsimd_cpu = NR_CPUS; + t->thread.kernel_fpsimd_state = NULL; + /* + * If we don't support fpsimd, bail out after we have + * reset the fpsimd_cpu for this task and clear the + * FPSTATE. + */ + if (!system_supports_fpsimd()) + return; + barrier(); + set_tsk_thread_flag(t, TIF_FOREIGN_FPSTATE); + + barrier(); } -void fpsimd_flush_cpu_state(void) +void fpsimd_save_and_flush_current_state(void) { - __this_cpu_write(fpsimd_last_state.st, NULL); - set_thread_flag(TIF_FOREIGN_FPSTATE); + if (!system_supports_fpsimd()) + return; + + get_cpu_fpsimd_context(); + fpsimd_save_user_state(); + fpsimd_flush_task_state(current); + put_cpu_fpsimd_context(); } -#ifdef CONFIG_KERNEL_MODE_NEON +/* + * Save the FPSIMD state to memory and invalidate cpu view. + * This function must be called with preemption disabled. + */ +void fpsimd_save_and_flush_cpu_state(void) +{ + unsigned long flags; -DEFINE_PER_CPU(bool, kernel_neon_busy); -EXPORT_PER_CPU_SYMBOL(kernel_neon_busy); + if (!system_supports_fpsimd()) + return; + WARN_ON(preemptible()); + local_irq_save(flags); + fpsimd_save_user_state(); + fpsimd_flush_cpu_state(); + local_irq_restore(flags); +} + +#ifdef CONFIG_KERNEL_MODE_NEON /* * Kernel-side NEON support functions @@ -1076,29 +1849,65 @@ EXPORT_PER_CPU_SYMBOL(kernel_neon_busy); * * The caller may freely use the FPSIMD registers until kernel_neon_end() is * called. + * + * Unless called from non-preemptible task context, @state must point to a + * caller provided buffer that will be used to preserve the task's kernel mode + * FPSIMD context when it is scheduled out, or if it is interrupted by kernel + * mode FPSIMD occurring in softirq context. May be %NULL otherwise. */ -void kernel_neon_begin(void) +void kernel_neon_begin(struct user_fpsimd_state *state) { if (WARN_ON(!system_supports_fpsimd())) return; - BUG_ON(!may_use_simd()); + WARN_ON((preemptible() || in_serving_softirq()) && !state); - local_bh_disable(); + BUG_ON(!may_use_simd()); - __this_cpu_write(kernel_neon_busy, true); + get_cpu_fpsimd_context(); /* Save unsaved fpsimd state, if any: */ - fpsimd_save(); + if (test_thread_flag(TIF_KERNEL_FPSTATE)) { + BUG_ON(IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()); + fpsimd_save_state(state); + } else { + fpsimd_save_user_state(); + + /* + * Set the thread flag so that the kernel mode FPSIMD state + * will be context switched along with the rest of the task + * state. + * + * On non-PREEMPT_RT, softirqs may interrupt task level kernel + * mode FPSIMD, but the task will not be preemptible so setting + * TIF_KERNEL_FPSTATE for those would be both wrong (as it + * would mark the task context FPSIMD state as requiring a + * context switch) and unnecessary. + * + * On PREEMPT_RT, softirqs are serviced from a separate thread, + * which is scheduled as usual, and this guarantees that these + * softirqs are not interrupting use of the FPSIMD in kernel + * mode in task context. So in this case, setting the flag here + * is always appropriate. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()) { + /* + * Record the caller provided buffer as the kernel mode + * FP/SIMD buffer for this task, so that the state can + * be preserved and restored on a context switch. 
+ */ + WARN_ON(current->thread.kernel_fpsimd_state != NULL); + current->thread.kernel_fpsimd_state = state; + set_thread_flag(TIF_KERNEL_FPSTATE); + } + } /* Invalidate any task state remaining in the fpsimd regs: */ fpsimd_flush_cpu_state(); - preempt_disable(); - - local_bh_enable(); + put_cpu_fpsimd_context(); } -EXPORT_SYMBOL(kernel_neon_begin); +EXPORT_SYMBOL_GPL(kernel_neon_begin); /* * kernel_neon_end(): give the CPU FPSIMD registers back to the current task @@ -1108,26 +1917,39 @@ EXPORT_SYMBOL(kernel_neon_begin); * * The caller must not use the FPSIMD registers after this function is called, * unless kernel_neon_begin() is called again in the meantime. + * + * The value of @state must match the value passed to the preceding call to + * kernel_neon_begin(). */ -void kernel_neon_end(void) +void kernel_neon_end(struct user_fpsimd_state *state) { - bool busy; - if (!system_supports_fpsimd()) return; - busy = __this_cpu_xchg(kernel_neon_busy, false); - WARN_ON(!busy); /* No matching kernel_neon_begin()? */ + if (!test_thread_flag(TIF_KERNEL_FPSTATE)) + return; - preempt_enable(); + /* + * If we are returning from a nested use of kernel mode FPSIMD, restore + * the task context kernel mode FPSIMD state. This can only happen when + * running in softirq context on non-PREEMPT_RT. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && in_serving_softirq()) { + fpsimd_load_state(state); + } else { + clear_thread_flag(TIF_KERNEL_FPSTATE); + WARN_ON(current->thread.kernel_fpsimd_state != state); + current->thread.kernel_fpsimd_state = NULL; + } } -EXPORT_SYMBOL(kernel_neon_end); +EXPORT_SYMBOL_GPL(kernel_neon_end); #ifdef CONFIG_EFI -static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); -static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); -static DEFINE_PER_CPU(bool, efi_sve_state_used); +static struct user_fpsimd_state efi_fpsimd_state; +static bool efi_fpsimd_state_used; +static bool efi_sve_state_used; +static bool efi_sm_state; /* * EFI runtime services support functions @@ -1151,27 +1973,46 @@ void __efi_fpsimd_begin(void) if (!system_supports_fpsimd()) return; - WARN_ON(preemptible()); - if (may_use_simd()) { - kernel_neon_begin(); + kernel_neon_begin(&efi_fpsimd_state); } else { + WARN_ON(preemptible()); + /* * If !efi_sve_state, SVE can't be in use yet and doesn't need * preserving: */ - if (system_supports_sve() && likely(efi_sve_state)) { - char *sve_state = this_cpu_ptr(efi_sve_state); + if (system_supports_sve() && efi_sve_state != NULL) { + bool ffr = true; + u64 svcr; + + efi_sve_state_used = true; + + if (system_supports_sme()) { + svcr = read_sysreg_s(SYS_SVCR); - __this_cpu_write(efi_sve_state_used, true); + efi_sm_state = svcr & SVCR_SM_MASK; + + /* + * Unless we have FA64 FFR does not + * exist in streaming mode. 
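Editor's note: with the interface as changed above, kernel-mode NEON users that may run in preemptible (or softirq) context must supply their own save buffer. A hypothetical caller (do_simd_work() is invented for illustration) would look roughly like:

static void do_simd_work(void)
{
	/* ~0.5KiB; a long-lived user might prefer per-CPU or heap storage */
	struct user_fpsimd_state state;

	if (!may_use_simd())
		return;			/* fall back to a scalar code path */

	kernel_neon_begin(&state);
	/* ... FP/SIMD/NEON instructions may be issued here ... */
	kernel_neon_end(&state);
}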
+ */ + if (!system_supports_fa64()) + ffr = !(svcr & SVCR_SM_MASK); + } + + sve_save_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), + &efi_fpsimd_state.fpsr, ffr); + + if (system_supports_sme()) + sysreg_clear_set_s(SYS_SVCR, + SVCR_SM_MASK, 0); - sve_save_state(sve_state + sve_ffr_offset(sve_max_vl), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr); } else { - fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); + fpsimd_save_state(&efi_fpsimd_state); } - __this_cpu_write(efi_fpsimd_state_used, true); + efi_fpsimd_state_used = true; } } @@ -1183,21 +2024,41 @@ void __efi_fpsimd_end(void) if (!system_supports_fpsimd()) return; - if (!__this_cpu_xchg(efi_fpsimd_state_used, false)) { - kernel_neon_end(); + if (!efi_fpsimd_state_used) { + kernel_neon_end(&efi_fpsimd_state); } else { - if (system_supports_sve() && - likely(__this_cpu_read(efi_sve_state_used))) { - char const *sve_state = this_cpu_ptr(efi_sve_state); + if (system_supports_sve() && efi_sve_state_used) { + bool ffr = true; + + /* + * Restore streaming mode; EFI calls are + * normal function calls so should not return in + * streaming mode. + */ + if (system_supports_sme()) { + if (efi_sm_state) { + sysreg_clear_set_s(SYS_SVCR, + 0, + SVCR_SM_MASK); + + /* + * Unless we have FA64 FFR does not + * exist in streaming mode. + */ + if (!system_supports_fa64()) + ffr = false; + } + } - sve_load_state(sve_state + sve_ffr_offset(sve_max_vl), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - sve_vq_from_vl(sve_get_vl()) - 1); + sve_load_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), + &efi_fpsimd_state.fpsr, ffr); - __this_cpu_write(efi_sve_state_used, false); + efi_sve_state_used = false; } else { - fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); + fpsimd_load_state(&efi_fpsimd_state); } + + efi_fpsimd_state_used = false; } } @@ -1211,8 +2072,7 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self, { switch (cmd) { case CPU_PM_ENTER: - fpsimd_save(); - fpsimd_flush_cpu_state(); + fpsimd_save_and_flush_cpu_state(); break; case CPU_PM_EXIT: break; @@ -1253,21 +2113,32 @@ static inline void fpsimd_hotplug_init(void) static inline void fpsimd_hotplug_init(void) { } #endif +void cpu_enable_fpsimd(const struct arm64_cpu_capabilities *__always_unused p) +{ + unsigned long enable = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN; + write_sysreg(read_sysreg(CPACR_EL1) | enable, CPACR_EL1); + isb(); +} + /* * FP/SIMD support code initialisation. */ static int __init fpsimd_init(void) { - if (elf_hwcap & HWCAP_FP) { + if (cpu_have_named_feature(FP)) { fpsimd_pm_init(); fpsimd_hotplug_init(); } else { pr_notice("Floating-point is not implemented\n"); } - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) pr_notice("Advanced SIMD is not implemented\n"); - return sve_sysctl_init(); + + sve_sysctl_init(); + sme_sysctl_init(); + + return 0; } core_initcall(fpsimd_init); diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 8e4431a8821f..5a1554a44162 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/ftrace.c * * Copyright (C) 2013 Linaro Limited * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/ftrace.h> @@ -18,8 +15,197 @@ #include <asm/debug-monitors.h> #include <asm/ftrace.h> #include <asm/insn.h> +#include <asm/text-patching.h> + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS +struct fregs_offset { + const char *name; + int offset; +}; + +#define FREGS_OFFSET(n, field) \ +{ \ + .name = n, \ + .offset = offsetof(struct __arch_ftrace_regs, field), \ +} + +static const struct fregs_offset fregs_offsets[] = { + FREGS_OFFSET("x0", regs[0]), + FREGS_OFFSET("x1", regs[1]), + FREGS_OFFSET("x2", regs[2]), + FREGS_OFFSET("x3", regs[3]), + FREGS_OFFSET("x4", regs[4]), + FREGS_OFFSET("x5", regs[5]), + FREGS_OFFSET("x6", regs[6]), + FREGS_OFFSET("x7", regs[7]), + FREGS_OFFSET("x8", regs[8]), + + FREGS_OFFSET("x29", fp), + FREGS_OFFSET("x30", lr), + FREGS_OFFSET("lr", lr), + + FREGS_OFFSET("sp", sp), + FREGS_OFFSET("pc", pc), +}; + +int ftrace_regs_query_register_offset(const char *name) +{ + for (int i = 0; i < ARRAY_SIZE(fregs_offsets); i++) { + const struct fregs_offset *roff = &fregs_offsets[i]; + if (!strcmp(roff->name, name)) + return roff->offset; + } + + return -EINVAL; +} +#endif + +unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* + * When using mcount, addr is the address of the mcount call + * instruction, and no adjustment is necessary. + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS)) + return addr; + + /* + * When using patchable-function-entry without pre-function NOPS, addr + * is the address of the first NOP after the function entry point. + * + * The compiler has either generated: + * + * addr+00: func: NOP // To be patched to MOV X9, LR + * addr+04: NOP // To be patched to BL <caller> + * + * Or: + * + * addr-04: BTI C + * addr+00: func: NOP // To be patched to MOV X9, LR + * addr+04: NOP // To be patched to BL <caller> + * + * We must adjust addr to the address of the NOP which will be patched + * to `BL <caller>`, which is at `addr + 4` bytes in either case. + * + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return addr + AARCH64_INSN_SIZE; + + /* + * When using patchable-function-entry with pre-function NOPs, addr is + * the address of the first pre-function NOP. + * + * Starting from an 8-byte aligned base, the compiler has either + * generated: + * + * addr+00: NOP // Literal (first 32 bits) + * addr+04: NOP // Literal (last 32 bits) + * addr+08: func: NOP // To be patched to MOV X9, LR + * addr+12: NOP // To be patched to BL <caller> + * + * Or: + * + * addr+00: NOP // Literal (first 32 bits) + * addr+04: NOP // Literal (last 32 bits) + * addr+08: func: BTI C + * addr+12: NOP // To be patched to MOV X9, LR + * addr+16: NOP // To be patched to BL <caller> + * + * We must adjust addr to the address of the NOP which will be patched + * to `BL <caller>`, which is at either addr+12 or addr+16 depending on + * whether there is a BTI. 
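Editor's note: a worked example may make the CALL_OPS layouts described above more concrete (all addresses are illustrative; func is 8-byte aligned at 0x1008, so the two pre-function NOPs start at 0x1000):

/*
 *   Without BTI:                      With BTI:
 *   0x1000: NOP  (ops literal)        0x1000: NOP  (ops literal)
 *   0x1004: NOP  (ops literal)        0x1004: NOP  (ops literal)
 *   0x1008: NOP  (MOV X9, LR slot)    0x1008: BTI C
 *   0x100c: NOP  (BL slot)            0x100c: NOP  (MOV X9, LR slot)
 *                                     0x1010: NOP  (BL slot)
 *
 * ftrace_call_adjust(0x1000) skips the two literal NOPs, then any BTI,
 * then the MOV slot, returning 0x100c in the first layout and 0x1010
 * in the second: the instruction that will be patched to BL <caller>,
 * which is what rec->ip ends up pointing at.
 */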
+ */ + + if (!IS_ALIGNED(addr, sizeof(unsigned long))) { + WARN_RATELIMIT(1, "Misaligned patch-site %pS\n", + (void *)(addr + 8)); + return 0; + } + + /* Skip the NOPs placed before the function entry point */ + addr += 2 * AARCH64_INSN_SIZE; + + /* Skip any BTI */ + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) { + u32 insn = le32_to_cpu(*(__le32 *)addr); + + if (aarch64_insn_is_bti(insn)) { + addr += AARCH64_INSN_SIZE; + } else if (insn != aarch64_insn_gen_nop()) { + WARN_RATELIMIT(1, "unexpected insn in patch-site %pS: 0x%08x\n", + (void *)addr, insn); + } + } + + /* Skip the first NOP after function entry */ + addr += AARCH64_INSN_SIZE; + + return addr; +} + +/* Convert fentry_ip to the symbol address without kallsyms */ +unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) +{ + u32 insn; + + /* + * When using patchable-function-entry without pre-function NOPS, ftrace + * entry is the address of the first NOP after the function entry point. + * + * The compiler has either generated: + * + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * Or: + * + * func-04: BTI C + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * The fentry_ip is the address of `BL <caller>` which is at `func + 4` + * bytes in either case. + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return fentry_ip - AARCH64_INSN_SIZE; + + /* + * When using patchable-function-entry with pre-function NOPs, BTI is + * a bit different. + * + * func+00: func: NOP // To be patched to MOV X9, LR + * func+04: NOP // To be patched to BL <caller> + * + * Or: + * + * func+00: func: BTI C + * func+04: NOP // To be patched to MOV X9, LR + * func+08: NOP // To be patched to BL <caller> + * + * The fentry_ip is the address of `BL <caller>` which is at either + * `func + 4` or `func + 8` depends on whether there is a BTI. + */ + + /* If there is no BTI, the func address should be one instruction before. */ + if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + return fentry_ip - AARCH64_INSN_SIZE; + + /* We want to be extra safe in case entry ip is on the page edge, + * but otherwise we need to avoid get_kernel_nofault()'s overhead. + */ + if ((fentry_ip & ~PAGE_MASK) < AARCH64_INSN_SIZE * 2) { + if (get_kernel_nofault(insn, (u32 *)(fentry_ip - AARCH64_INSN_SIZE * 2))) + return 0; + } else { + insn = *(u32 *)(fentry_ip - AARCH64_INSN_SIZE * 2); + } + + if (aarch64_insn_is_bti(le32_to_cpu((__le32)insn))) + return fentry_ip - AARCH64_INSN_SIZE * 2; + + return fentry_ip - AARCH64_INSN_SIZE; +} -#ifdef CONFIG_DYNAMIC_FTRACE /* * Replace a single instruction, which may be a branch or NOP. * If @validate == true, a replaced instruction is checked against 'old'. @@ -58,13 +244,149 @@ int ftrace_update_ftrace_func(ftrace_func_t func) unsigned long pc; u32 new; - pc = (unsigned long)&ftrace_call; + /* + * When using CALL_OPS, the function to call is associated with the + * call site, and we don't have a global function pointer to update. 
+ */ + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return 0; + + pc = (unsigned long)ftrace_call; new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, AARCH64_INSN_BRANCH_LINK); return ftrace_modify_code(pc, 0, new, false); } +static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr) +{ +#ifdef CONFIG_MODULES + struct plt_entry *plt = NULL; + + if (within_module_mem_type(addr, mod, MOD_INIT_TEXT)) + plt = mod->arch.init_ftrace_trampolines; + else if (within_module_mem_type(addr, mod, MOD_TEXT)) + plt = mod->arch.ftrace_trampolines; + else + return NULL; + + return &plt[FTRACE_PLT_IDX]; +#else + return NULL; +#endif +} + +static bool reachable_by_bl(unsigned long addr, unsigned long pc) +{ + long offset = (long)addr - (long)pc; + + return offset >= -SZ_128M && offset < SZ_128M; +} + +/* + * Find the address the callsite must branch to in order to reach '*addr'. + * + * Due to the limited range of 'BL' instructions, modules may be placed too far + * away to branch directly and must use a PLT. + * + * Returns true when '*addr' contains a reachable target address, or has been + * modified to contain a PLT address. Returns false otherwise. + */ +static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, + struct module *mod, + unsigned long *addr) +{ + unsigned long pc = rec->ip; + struct plt_entry *plt; + + /* + * If a custom trampoline is unreachable, rely on the ftrace_caller + * trampoline which knows how to indirectly reach that trampoline + * through ops->direct_call. + */ + if (*addr != FTRACE_ADDR && !reachable_by_bl(*addr, pc)) + *addr = FTRACE_ADDR; + + /* + * When the target is within range of the 'BL' instruction, use 'addr' + * as-is and branch to that directly. + */ + if (reachable_by_bl(*addr, pc)) + return true; + + /* + * When the target is outside of the range of a 'BL' instruction, we + * must use a PLT to reach it. We can only place PLTs for modules, and + * only when module PLT support is built-in. + */ + if (!IS_ENABLED(CONFIG_MODULES)) + return false; + + /* + * 'mod' is only set at module load time, but if we end up + * dealing with an out-of-range condition, we can assume it + * is due to a module being loaded far away from the kernel. + * + * NOTE: __module_text_address() must be called within a RCU read + * section, but we can rely on ftrace_lock to ensure that 'mod' + * retains its validity throughout the remainder of this code. 
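Editor's note: since reachable_by_bl() above decides when the PLT fallback is needed, a concrete illustration (the addresses and the example function are hypothetical): a BL instruction encodes a signed 26-bit word offset, so the directly reachable window is [-128MiB, +128MiB):

/* Illustration only: a target exactly 128MiB away is already out of range. */
static void reachable_by_bl_example(void)
{
	unsigned long pc   = 0xffff800080010000UL;	/* callsite */
	unsigned long addr = pc + SZ_128M;		/* target 128MiB later */

	WARN_ON(reachable_by_bl(addr, pc));		/* false: needs a PLT */
	WARN_ON(!reachable_by_bl(addr - 4, pc));	/* last reachable slot */
}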
+ */ + if (!mod) { + guard(rcu)(); + mod = __module_text_address(pc); + } + + if (WARN_ON(!mod)) + return false; + + plt = get_ftrace_plt(mod, pc); + if (!plt) { + pr_err("ftrace: no module PLT for %ps\n", (void *)*addr); + return false; + } + + *addr = (unsigned long)plt; + return true; +} + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS +static const struct ftrace_ops *arm64_rec_get_ops(struct dyn_ftrace *rec) +{ + const struct ftrace_ops *ops = NULL; + + if (rec->flags & FTRACE_FL_CALL_OPS_EN) { + ops = ftrace_find_unique_ops(rec); + WARN_ON_ONCE(!ops); + } + + if (!ops) + ops = &ftrace_list_ops; + + return ops; +} + +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec, + const struct ftrace_ops *ops) +{ + unsigned long literal = ALIGN_DOWN(rec->ip - 12, 8); + return aarch64_insn_write_literal_u64((void *)literal, + (unsigned long)ops); +} + +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, &ftrace_nop_ops); +} + +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, arm64_rec_get_ops(rec)); +} +#else +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; } +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; } +#endif + /* * Turn on the call to ftrace_caller() in instrumented function */ @@ -72,60 +394,14 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned long pc = rec->ip; u32 old, new; - long offset = (long)pc - (long)addr; - - if (offset < -SZ_128M || offset >= SZ_128M) { -#ifdef CONFIG_ARM64_MODULE_PLTS - struct plt_entry trampoline; - struct module *mod; - - /* - * On kernels that support module PLTs, the offset between the - * branch instruction and its target may legally exceed the - * range of an ordinary relative 'bl' opcode. In this case, we - * need to branch via a trampoline in the module. - * - * NOTE: __module_text_address() must be called with preemption - * disabled, but we can rely on ftrace_lock to ensure that 'mod' - * retains its validity throughout the remainder of this code. - */ - preempt_disable(); - mod = __module_text_address(pc); - preempt_enable(); + int ret; - if (WARN_ON(!mod)) - return -EINVAL; + ret = ftrace_rec_update_ops(rec); + if (ret) + return ret; - /* - * There is only one ftrace trampoline per module. For now, - * this is not a problem since on arm64, all dynamic ftrace - * invocations are routed via ftrace_caller(). This will need - * to be revisited if support for multiple ftrace entry points - * is added in the future, but for now, the pr_err() below - * deals with a theoretical issue only. 
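Editor's note: continuing the illustrative addresses used for ftrace_call_adjust() earlier, the ALIGN_DOWN(rec->ip - 12, 8) expression in ftrace_rec_set_ops() above recovers the same literal slot in both layouts:

/*
 * Without BTI: rec->ip = 0x100c, 0x100c - 12 = 0x1000, which is
 *              already 8-byte aligned, so the literal is at 0x1000.
 * With BTI:    rec->ip = 0x1010, 0x1010 - 12 = 0x1004, and
 *              ALIGN_DOWN(0x1004, 8) = 0x1000 again.
 * Either way the write lands on the two pre-function NOPs that hold
 * the ftrace_ops pointer for this callsite.
 */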
- */ - trampoline = get_plt_entry(addr, mod->arch.ftrace_trampoline); - if (!plt_entries_equal(mod->arch.ftrace_trampoline, - &trampoline)) { - if (!plt_entries_equal(mod->arch.ftrace_trampoline, - &(struct plt_entry){})) { - pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n"); - return -EINVAL; - } - - /* point the trampoline to our ftrace entry point */ - module_disable_ro(mod); - *mod->arch.ftrace_trampoline = trampoline; - module_enable_ro(mod, true); - - /* update trampoline before patching in the branch */ - smp_wmb(); - } - addr = (unsigned long)(void *)mod->arch.ftrace_trampoline; -#else /* CONFIG_ARM64_MODULE_PLTS */ + if (!ftrace_find_callable_addr(rec, NULL, &addr)) return -EINVAL; -#endif /* CONFIG_ARM64_MODULE_PLTS */ - } old = aarch64_insn_gen_nop(); new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK); @@ -133,6 +409,72 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ftrace_modify_code(pc, old, new, true); } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + unsigned long pc = rec->ip; + u32 old, new; + int ret; + + ret = ftrace_rec_set_ops(rec, arm64_rec_get_ops(rec)); + if (ret) + return ret; + + if (!ftrace_find_callable_addr(rec, NULL, &old_addr)) + return -EINVAL; + if (!ftrace_find_callable_addr(rec, NULL, &addr)) + return -EINVAL; + + old = aarch64_insn_gen_branch_imm(pc, old_addr, + AARCH64_INSN_BRANCH_LINK); + new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK); + + return ftrace_modify_code(pc, old, new, true); +} +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS +/* + * The compiler has inserted two NOPs before the regular function prologue. + * All instrumented functions follow the AAPCS, so x0-x8 and x19-x30 are live, + * and x9-x18 are free for our use. + * + * At runtime we want to be able to swing a single NOP <-> BL to enable or + * disable the ftrace call. The BL requires us to save the original LR value, + * so here we insert a <MOV X9, LR> over the first NOP so the instructions + * before the regular prologue are: + * + * | Compiled | Disabled | Enabled | + * +----------+------------+------------+ + * | NOP | MOV X9, LR | MOV X9, LR | + * | NOP | NOP | BL <entry> | + * + * The LR value will be recovered by ftrace_caller, and restored into LR + * before returning to the regular function prologue. When a function is not + * being traced, the MOV is not harmful given x9 is not live per the AAPCS. + * + * Note: ftrace_process_locs() has pre-adjusted rec->ip to be the address of + * the BL. 
+ */ +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) +{ + unsigned long pc = rec->ip - AARCH64_INSN_SIZE; + u32 old, new; + int ret; + + ret = ftrace_rec_set_nop_ops(rec); + if (ret) + return ret; + + old = aarch64_insn_gen_nop(); + new = aarch64_insn_gen_move_reg(AARCH64_INSN_REG_9, + AARCH64_INSN_REG_LR, + AARCH64_INSN_VARIANT_64BIT); + return ftrace_modify_code(pc, old, new, true); +} +#endif + /* * Turn off the call to ftrace_caller() in instrumented function */ @@ -140,55 +482,33 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { unsigned long pc = rec->ip; - bool validate = true; u32 old = 0, new; - long offset = (long)pc - (long)addr; - - if (offset < -SZ_128M || offset >= SZ_128M) { -#ifdef CONFIG_ARM64_MODULE_PLTS - u32 replaced; - - /* - * 'mod' is only set at module load time, but if we end up - * dealing with an out-of-range condition, we can assume it - * is due to a module being loaded far away from the kernel. - */ - if (!mod) { - preempt_disable(); - mod = __module_text_address(pc); - preempt_enable(); - - if (WARN_ON(!mod)) - return -EINVAL; - } + int ret; - /* - * The instruction we are about to patch may be a branch and - * link instruction that was redirected via a PLT entry. In - * this case, the normal validation will fail, but we can at - * least check that we are dealing with a branch and link - * instruction that points into the right module. - */ - if (aarch64_insn_read((void *)pc, &replaced)) - return -EFAULT; + new = aarch64_insn_gen_nop(); - if (!aarch64_insn_is_bl(replaced) || - !within_module(pc + aarch64_get_branch_offset(replaced), - mod)) - return -EINVAL; + ret = ftrace_rec_set_nop_ops(rec); + if (ret) + return ret; - validate = false; -#else /* CONFIG_ARM64_MODULE_PLTS */ + /* + * When using mcount, callsites in modules may have been initialized to + * call an arbitrary module PLT (which redirects to the _mcount stub) + * rather than the ftrace PLT we'll use at runtime (which redirects to + * the ftrace trampoline). We can ignore the old PLT when initializing + * the callsite. + * + * Note: 'mod' is only set at module load time. + */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) && mod) + return aarch64_insn_patch_text_nosync((void *)pc, new); + + if (!ftrace_find_callable_addr(rec, mod, &addr)) return -EINVAL; -#endif /* CONFIG_ARM64_MODULE_PLTS */ - } else { - old = aarch64_insn_gen_branch_imm(pc, addr, - AARCH64_INSN_BRANCH_LINK); - } - new = aarch64_insn_gen_nop(); + old = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK); - return ftrace_modify_code(pc, old, new, validate); + return ftrace_modify_code(pc, old, new, true); } void arch_ftrace_update_code(int command) @@ -197,20 +517,12 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} -#endif /* CONFIG_DYNAMIC_FTRACE */ - #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* * function_graph tracer expects ftrace_return_to_handler() to be called * on the way back to parent. For this purpose, this function is called * in _mcount() or ftrace_caller() to replace return address (*parent) on * the call stack to return_to_handler. - * - * Note that @frame_pointer is used only for sanity check later. 
*/ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long frame_pointer) @@ -228,11 +540,32 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, */ old = *parent; - if (!function_graph_enter(old, self_addr, frame_pointer, NULL)) + if (!function_graph_enter(old, self_addr, frame_pointer, + (void *)frame_pointer)) { *parent = return_hooker; + } } -#ifdef CONFIG_DYNAMIC_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + unsigned long return_hooker = (unsigned long)&return_to_handler; + unsigned long frame_pointer = arch_ftrace_regs(fregs)->fp; + unsigned long *parent = &arch_ftrace_regs(fregs)->lr; + unsigned long old; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + old = *parent; + + if (!function_graph_enter_regs(old, ip, frame_pointer, + (void *)frame_pointer, fregs)) { + *parent = return_hooker; + } +} +#else /* * Turn on/off the call to ftrace_graph_caller() in ftrace_caller() * depending on @enable. @@ -262,5 +595,5 @@ int ftrace_disable_ftrace_graph_caller(void) { return ftrace_modify_graph_caller(false); } -#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 15d79a8e5e5e..ca04b338cb0d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Low-level CPU initialisation * Based on arch/arm/kernel/head.S @@ -6,53 +7,39 @@ * Copyright (C) 2003-2012 ARM Ltd. * Authors: Catalin Marinas <catalin.marinas@arm.com> * Will Deacon <will.deacon@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/linkage.h> #include <linux/init.h> -#include <linux/irqchip/arm-gic-v3.h> +#include <linux/pgtable.h> +#include <asm/asm_pointer_auth.h> #include <asm/assembler.h> #include <asm/boot.h> +#include <asm/bug.h> #include <asm/ptrace.h> #include <asm/asm-offsets.h> #include <asm/cache.h> #include <asm/cputype.h> +#include <asm/el2_setup.h> #include <asm/elf.h> #include <asm/image.h> #include <asm/kernel-pgtable.h> #include <asm/kvm_arm.h> #include <asm/memory.h> #include <asm/pgtable-hwdef.h> -#include <asm/pgtable.h> #include <asm/page.h> +#include <asm/scs.h> #include <asm/smp.h> #include <asm/sysreg.h> +#include <asm/stacktrace/frame.h> #include <asm/thread_info.h> #include <asm/virt.h> #include "efi-header.S" -#define __PHYS_OFFSET (KERNEL_START - TEXT_OFFSET) - -#if (TEXT_OFFSET & 0xfff) != 0 -#error TEXT_OFFSET must be at least 4KB aligned -#elif (PAGE_OFFSET & 0x1fffff) != 0 +#if (PAGE_OFFSET & 0x1fffff) != 0 #error PAGE_OFFSET must be at least 2MB aligned -#elif TEXT_OFFSET > 0x1fffff -#error TEXT_OFFSET must be less than 2MB #endif /* @@ -63,64 +50,76 @@ * MMU = off, D-cache = off, I-cache = on or off, * x0 = physical address to the FDT blob. * - * This code is mostly position independent so you call this at - * __pa(PAGE_OFFSET + TEXT_OFFSET). - * * Note that the callee-saved registers are used for storing variables * that are useful before the MMU is enabled. The allocations are described * in the entry routines. */ __HEAD -_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ -#ifdef CONFIG_EFI - /* - * This add instruction has no meaningful effect except that - * its opcode forms the magic "MZ" signature required by UEFI. - */ - add x13, x18, #0x16 - b stext -#else - b stext // branch to kernel start, magic - .long 0 // reserved -#endif - le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian + efi_signature_nop // special NOP to identity as PE/COFF executable + b primary_entry // branch to kernel start, magic + .quad 0 // Image load offset from start of RAM, little-endian le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved .quad 0 // reserved .quad 0 // reserved .ascii ARM64_IMAGE_MAGIC // Magic number -#ifdef CONFIG_EFI - .long pe_header - _head // Offset to the PE header. + .long .Lpe_header_offset // Offset to the PE header. -pe_header: __EFI_PE_HEADER -#else - .long 0 // reserved -#endif - __INIT + .section ".idmap.text","a" /* * The following callee saved general purpose registers are used on the * primary lowlevel boot path: * * Register Scope Purpose - * x21 stext() .. start_kernel() FDT pointer passed at boot in x0 - * x23 stext() .. start_kernel() physical misalignment/KASLR offset - * x28 __create_page_tables() callee preserved temp register - * x19/x20 __primary_switch() callee preserved temp registers + * x19 primary_entry() .. start_kernel() whether we entered with the MMU on + * x20 primary_entry() .. __primary_switch() CPU boot mode + * x21 primary_entry() .. 
start_kernel() FDT pointer passed at boot in x0 */ -ENTRY(stext) +SYM_CODE_START(primary_entry) + bl record_mmu_state bl preserve_boot_args - bl el2_setup // Drop to EL1, w0=cpu_boot_mode - adrp x23, __PHYS_OFFSET - and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 - bl set_cpu_boot_mode_flag - bl __create_page_tables + + adrp x1, early_init_stack + mov sp, x1 + mov x29, xzr + adrp x0, __pi_init_idmap_pg_dir + mov x1, xzr + bl __pi_create_init_idmap + + /* + * If the page tables have been populated with non-cacheable + * accesses (MMU disabled), invalidate those tables again to + * remove any speculatively loaded cache lines. + */ + cbnz x19, 0f + dmb sy + mov x1, x0 // end of used region + adrp x0, __pi_init_idmap_pg_dir + adr_l x2, dcache_inval_poc + blr x2 + b 1f + + /* + * If we entered with the MMU and caches on, clean the ID mapped part + * of the primary boot code to the PoC so we can safely execute it with + * the MMU off. + */ +0: adrp x0, __idmap_text_start + adr_l x1, __idmap_text_end + adr_l x2, dcache_clean_poc + blr x2 + +1: mov x0, x19 + bl init_kernel_el // w0=cpu_boot_mode + mov x20, x0 + /* * The following calls CPU setup code, see arch/arm64/mm/proc.S for * details. @@ -129,620 +128,301 @@ ENTRY(stext) */ bl __cpu_setup // initialise processor b __primary_switch -ENDPROC(stext) +SYM_CODE_END(primary_entry) + + __INIT +SYM_CODE_START_LOCAL(record_mmu_state) + mrs x19, CurrentEL + cmp x19, #CurrentEL_EL2 + mrs x19, sctlr_el1 + b.ne 0f + mrs x19, sctlr_el2 +0: +CPU_LE( tbnz x19, #SCTLR_ELx_EE_SHIFT, 1f ) +CPU_BE( tbz x19, #SCTLR_ELx_EE_SHIFT, 1f ) + tst x19, #SCTLR_ELx_C // Z := (C == 0) + and x19, x19, #SCTLR_ELx_M // isolate M bit + csel x19, xzr, x19, eq // clear x19 if Z + ret + + /* + * Set the correct endianness early so all memory accesses issued + * before init_kernel_el() occur in the correct byte order. Note that + * this means the MMU must be disabled, or the active ID map will end + * up getting interpreted with the wrong byte order. + */ +1: eor x19, x19, #SCTLR_ELx_EE + bic x19, x19, #SCTLR_ELx_M + b.ne 2f + pre_disable_mmu_workaround + msr sctlr_el2, x19 + b 3f +2: pre_disable_mmu_workaround + msr sctlr_el1, x19 +3: isb + mov x19, xzr + ret +SYM_CODE_END(record_mmu_state) /* * Preserve the arguments passed by the bootloader in x0 .. x3 */ -preserve_boot_args: +SYM_CODE_START_LOCAL(preserve_boot_args) mov x21, x0 // x21=FDT adr_l x0, boot_args // record the contents of stp x21, x1, [x0] // x0 .. x3 at kernel entry stp x2, x3, [x0, #16] + cbnz x19, 0f // skip cache invalidation if MMU is on dmb sy // needed before dc ivac with // MMU off - mov x1, #0x20 // 4 x 8 bytes - b __inval_dcache_area // tail call -ENDPROC(preserve_boot_args) - -/* - * Macro to create a table entry to the next page. 
- * - * tbl: page table address - * virt: virtual address - * shift: #imm page table shift - * ptrs: #imm pointers per table page - * - * Preserves: virt - * Corrupts: ptrs, tmp1, tmp2 - * Returns: tbl -> next level table page address - */ - .macro create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2 - add \tmp1, \tbl, #PAGE_SIZE - phys_to_pte \tmp2, \tmp1 - orr \tmp2, \tmp2, #PMD_TYPE_TABLE // address of next table and entry type - lsr \tmp1, \virt, #\shift - sub \ptrs, \ptrs, #1 - and \tmp1, \tmp1, \ptrs // table index - str \tmp2, [\tbl, \tmp1, lsl #3] - add \tbl, \tbl, #PAGE_SIZE // next level table page - .endm - -/* - * Macro to populate page table entries, these entries can be pointers to the next level - * or last level entries pointing to physical memory. - * - * tbl: page table address - * rtbl: pointer to page table or physical memory - * index: start index to write - * eindex: end index to write - [index, eindex] written to - * flags: flags for pagetable entry to or in - * inc: increment to rtbl between each entry - * tmp1: temporary variable - * - * Preserves: tbl, eindex, flags, inc - * Corrupts: index, tmp1 - * Returns: rtbl - */ - .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1 -.Lpe\@: phys_to_pte \tmp1, \rtbl - orr \tmp1, \tmp1, \flags // tmp1 = table entry - str \tmp1, [\tbl, \index, lsl #3] - add \rtbl, \rtbl, \inc // rtbl = pa next level - add \index, \index, #1 - cmp \index, \eindex - b.ls .Lpe\@ - .endm - -/* - * Compute indices of table entries from virtual address range. If multiple entries - * were needed in the previous page table level then the next page table level is assumed - * to be composed of multiple pages. (This effectively scales the end index). - * - * vstart: virtual address of start of range - * vend: virtual address of end of range - * shift: shift used to transform virtual address into index - * ptrs: number of entries in page table - * istart: index in table corresponding to vstart - * iend: index in table corresponding to vend - * count: On entry: how many extra entries were required in previous level, scales - * our end index. - * On exit: returns how many extra entries required for next page table level - * - * Preserves: vstart, vend, shift, ptrs - * Returns: istart, iend, count - */ - .macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count - lsr \iend, \vend, \shift - mov \istart, \ptrs - sub \istart, \istart, #1 - and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1) - mov \istart, \ptrs - mul \istart, \istart, \count - add \iend, \iend, \istart // iend += (count - 1) * ptrs - // our entries span multiple tables - - lsr \istart, \vstart, \shift - mov \count, \ptrs - sub \count, \count, #1 - and \istart, \istart, \count - - sub \count, \iend, \istart - .endm - -/* - * Map memory for specified virtual address range. Each level of page table needed supports - * multiple entries. If a level requires n entries the next page table level is assumed to be - * formed from n pages. 
- * - * tbl: location of page table - * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) - * vstart: start address to map - * vend: end address to map - we map [vstart, vend] - * flags: flags to use to map last level entries - * phys: physical address corresponding to vstart - physical memory is contiguous - * pgds: the number of pgd entries - * - * Temporaries: istart, iend, tmp, count, sv - these need to be different registers - * Preserves: vstart, vend, flags - * Corrupts: tbl, rtbl, istart, iend, tmp, count, sv - */ - .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv - add \rtbl, \tbl, #PAGE_SIZE - mov \sv, \rtbl - mov \count, #0 - compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv - mov \sv, \rtbl - -#if SWAPPER_PGTABLE_LEVELS > 3 - compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv - mov \sv, \rtbl -#endif - -#if SWAPPER_PGTABLE_LEVELS > 2 - compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv -#endif - - compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count - bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1 - populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp - .endm - -/* - * Setup the initial page tables. We only setup the barest amount which is - * required to get the kernel running. The following sections are required: - * - identity mapping to enable the MMU (low address, TTBR0) - * - first few MB of the kernel linear mapping to jump to once the MMU has - * been enabled - */ -__create_page_tables: - mov x28, lr - - /* - * Invalidate the init page tables to avoid potential dirty cache lines - * being evicted. Other page tables are allocated in rodata as part of - * the kernel image, and thus are clean to the PoC per the boot - * protocol. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - sub x1, x1, x0 - bl __inval_dcache_area - - /* - * Clear the init page tables. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - sub x1, x1, x0 -1: stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - stp xzr, xzr, [x0], #16 - subs x1, x1, #64 - b.ne 1b - - mov x7, SWAPPER_MM_MMUFLAGS - - /* - * Create the identity mapping. - */ - adrp x0, idmap_pg_dir - adrp x3, __idmap_text_start // __pa(__idmap_text_start) - -#ifdef CONFIG_ARM64_USER_VA_BITS_52 - mrs_s x6, SYS_ID_AA64MMFR2_EL1 - and x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT) - mov x5, #52 - cbnz x6, 1f -#endif - mov x5, #VA_BITS -1: - adr_l x6, vabits_user - str x5, [x6] - dmb sy - dc ivac, x6 // Invalidate potentially stale cache line + add x1, x0, #0x20 // 4 x 8 bytes + b dcache_inval_poc // tail call +0: str_l x19, mmu_enabled_at_boot, x0 + ret +SYM_CODE_END(preserve_boot_args) /* - * VA_BITS may be too small to allow for an ID mapping to be created - * that covers system RAM if that is located sufficiently high in the - * physical address space. So for the ID map, use an extended virtual - * range in that case, and configure an additional translation level - * if needed. + * Initialize CPU registers with task-specific and cpu-specific context. 
* - * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the - * entire ID map region can be mapped. As T0SZ == (64 - #bits used), - * this number conveniently equals the number of leading zeroes in - * the physical address of __idmap_text_end. - */ - adrp x5, __idmap_text_end - clz x5, x5 - cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough? - b.ge 1f // .. then skip VA range extension - - adr_l x6, idmap_t0sz - str x5, [x6] - dmb sy - dc ivac, x6 // Invalidate potentially stale cache line - -#if (VA_BITS < 48) -#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) -#define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT)) - - /* - * If VA_BITS < 48, we have to configure an additional table level. - * First, we have to verify our assumption that the current value of - * VA_BITS was chosen such that all translation levels are fully - * utilised, and that lowering T0SZ will always result in an additional - * translation level to be configured. - */ -#if VA_BITS != EXTRA_SHIFT -#error "Mismatch between VA_BITS and page size/number of translation levels" -#endif - - mov x4, EXTRA_PTRS - create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6 -#else - /* - * If VA_BITS == 48, we don't have to configure an additional - * translation level, but the top-level table has more entries. + * Create a final frame record at task_pt_regs(current)->stackframe, so + * that the unwinder can identify the final frame record of any task by + * its location in the task stack. We reserve the entire pt_regs space + * for consistency with user tasks and kthreads. */ - mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT) - str_l x4, idmap_ptrs_per_pgd, x5 -#endif -1: - ldr_l x4, idmap_ptrs_per_pgd - mov x5, x3 // __pa(__idmap_text_start) - adr_l x6, __idmap_text_end // __pa(__idmap_text_end) + .macro init_cpu_task tsk, tmp1, tmp2 + msr sp_el0, \tsk - map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14 + ldr \tmp1, [\tsk, #TSK_STACK] + add sp, \tmp1, #THREAD_SIZE + sub sp, sp, #PT_REGS_SIZE - /* - * Map the kernel image (starting with PHYS_OFFSET). - */ - adrp x0, init_pg_dir - mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text) - add x5, x5, x23 // add KASLR displacement - mov x4, PTRS_PER_PGD - adrp x6, _end // runtime __pa(_end) - adrp x3, _text // runtime __pa(_text) - sub x6, x6, x3 // _end - _text - add x6, x6, x5 // runtime __va(_end) + stp xzr, xzr, [sp, #S_STACKFRAME] + mov \tmp1, #FRAME_META_TYPE_FINAL + str \tmp1, [sp, #S_STACKFRAME_TYPE] + add x29, sp, #S_STACKFRAME - map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14 + scs_load_current - /* - * Since the page tables have been populated with non-cacheable - * accesses (MMU disabled), invalidate the idmap and swapper page - * tables again to remove any speculatively loaded cache lines. - */ - adrp x0, idmap_pg_dir - adrp x1, init_pg_end - sub x1, x1, x0 - dmb sy - bl __inval_dcache_area - - ret x28 -ENDPROC(__create_page_tables) - .ltorg + adr_l \tmp1, __per_cpu_offset + ldr w\tmp2, [\tsk, #TSK_TI_CPU] + ldr \tmp1, [\tmp1, \tmp2, lsl #3] + set_this_cpu_offset \tmp1 + .endm /* * The following fragment of code is executed with the MMU enabled. 
* - * x0 = __PHYS_OFFSET + * x0 = __pa(KERNEL_START) */ -__primary_switched: - adrp x4, init_thread_union - add sp, x4, #THREAD_SIZE - adr_l x5, init_task - msr sp_el0, x5 // Save thread_info +SYM_FUNC_START_LOCAL(__primary_switched) + adr_l x4, init_task + init_cpu_task x4, x5, x6 adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb - stp xzr, x30, [sp, #-16]! + stp x29, x30, [sp, #-16]! mov x29, sp str_l x21, __fdt_pointer, x5 // Save FDT pointer - ldr_l x4, kimage_vaddr // Save the offset between + adrp x4, _text // Save the offset between sub x4, x4, x0 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings - // Clear BSS - adr_l x0, __bss_start - mov x1, xzr - adr_l x2, __bss_stop - sub x2, x2, x0 - bl __pi_memset - dsb ishst // Make zero page visible to PTW + mov x0, x20 + bl set_cpu_boot_mode_flag -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif -#ifdef CONFIG_RANDOMIZE_BASE - tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? - b.ne 0f - mov x0, x21 // pass FDT address in x0 - bl kaslr_early_init // parse FDT for KASLR options - cbz x0, 0f // KASLR disabled? just proceed - orr x23, x23, x0 // record KASLR offset - ldp x29, x30, [sp], #16 // we must enable KASLR, return - ret // to __primary_switch() -0: -#endif - add sp, sp, #16 - mov x29, #0 - mov x30, #0 - b start_kernel -ENDPROC(__primary_switched) + mov x0, x20 + bl finalise_el2 // Prefer VHE if possible + ldp x29, x30, [sp], #16 + bl start_kernel + ASM_BUG() +SYM_FUNC_END(__primary_switched) /* * end early head section, begin head code that is also used for * hotplug and needs to have the same protections as the text region */ - .section ".idmap.text","awx" - -ENTRY(kimage_vaddr) - .quad _text - TEXT_OFFSET -EXPORT_SYMBOL(kimage_vaddr) + .section ".idmap.text","a" /* - * If we're fortunate enough to boot at EL2, ensure that the world is - * sane before dropping to EL1. + * Starting from EL2 or EL1, configure the CPU to execute at the highest + * reachable EL supported by the kernel in a chosen default state. If dropping + * from EL2 to EL1, configure EL2 before configuring EL1. + * + * Since we cannot always rely on ERET synchronizing writes to sysregs (e.g. if + * SCTLR_ELx.EOS is clear), we place an ISB prior to ERET. * - * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if - * booted in EL1 or EL2 respectively. + * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in x0 if + * booted in EL1 or EL2 respectively, with the top 32 bits containing + * potential context flags. These flags are *not* stored in __boot_cpu_mode. + * + * x0: whether we are being called from the primary boot path with the MMU on */ -ENTRY(el2_setup) - msr SPsel, #1 // We want to use SP_EL{1,2} - mrs x0, CurrentEL - cmp x0, #CurrentEL_EL2 - b.eq 1f - mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) +SYM_FUNC_START(init_kernel_el) + mrs x1, CurrentEL + cmp x1, #CurrentEL_EL2 + b.eq init_el2 + +SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) + mov_q x0, INIT_SCTLR_EL1_MMU_OFF + pre_disable_mmu_workaround msr sctlr_el1, x0 - mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1 isb - ret + mov_q x0, INIT_PSTATE_EL1 + msr spsr_el1, x0 + msr elr_el1, lr + mov w0, #BOOT_CPU_MODE_EL1 + eret -1: mov_q x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) - msr sctlr_el2, x0 +SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) + msr elr_el2, lr -#ifdef CONFIG_ARM64_VHE - /* - * Check for VHE being present. 
For the rest of the EL2 setup, - * x2 being non-zero indicates that we do have VHE, and that the - * kernel is intended to run at EL2. - */ - mrs x2, id_aa64mmfr1_el1 - ubfx x2, x2, #8, #4 -#else - mov x2, xzr -#endif + // clean all HYP code to the PoC if we booted at EL2 with the MMU on + cbz x0, 0f + adrp x0, __hyp_idmap_text_start + adr_l x1, __hyp_text_end + adr_l x2, dcache_clean_poc + blr x2 - /* Hyp configuration. */ - mov_q x0, HCR_HOST_NVHE_FLAGS - cbz x2, set_hcr - mov_q x0, HCR_HOST_VHE_FLAGS -set_hcr: - msr hcr_el2, x0 + mov_q x0, INIT_SCTLR_EL2_MMU_OFF + pre_disable_mmu_workaround + msr sctlr_el2, x0 isb +0: - /* - * Allow Non-secure EL1 and EL0 to access physical timer and counter. - * This is not necessary for VHE, since the host kernel runs in EL2, - * and EL0 accesses are configured in the later stage of boot process. - * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout - * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined - * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1 - * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in - * EL2. - */ - cbnz x2, 1f - mrs x0, cnthctl_el2 - orr x0, x0, #3 // Enable EL1 physical timers - msr cnthctl_el2, x0 -1: - msr cntvoff_el2, xzr // Clear virtual offset - -#ifdef CONFIG_ARM_GIC_V3 - /* GICv3 system register access */ - mrs x0, id_aa64pfr0_el1 - ubfx x0, x0, #24, #4 - cmp x0, #1 - b.ne 3f - - mrs_s x0, SYS_ICC_SRE_EL2 - orr x0, x0, #ICC_SRE_EL2_SRE // Set ICC_SRE_EL2.SRE==1 - orr x0, x0, #ICC_SRE_EL2_ENABLE // Set ICC_SRE_EL2.Enable==1 - msr_s SYS_ICC_SRE_EL2, x0 - isb // Make sure SRE is now set - mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back, - tbz x0, #0, 3f // and check that it sticks - msr_s SYS_ICH_HCR_EL2, xzr // Reset ICC_HCR_EL2 to defaults - -3: -#endif - - /* Populate ID registers. */ - mrs x0, midr_el1 - mrs x1, mpidr_el1 - msr vpidr_el2, x0 - msr vmpidr_el2, x1 - -#ifdef CONFIG_COMPAT - msr hstr_el2, xzr // Disable CP15 traps to EL2 -#endif - - /* EL2 debug */ - mrs x1, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx x0, x1, #8, #4 - cmp x0, #1 - b.lt 4f // Skip if no PMU present - mrs x0, pmcr_el0 // Disable debug access traps - ubfx x0, x0, #11, #5 // to EL2 and allow access to -4: - csel x3, xzr, x0, lt // all PMU counters from EL1 - - /* Statistical profiling */ - ubfx x0, x1, #32, #4 // Check ID_AA64DFR0_EL1 PMSVer - cbz x0, 7f // Skip if SPE not present - cbnz x2, 6f // VHE? - mrs_s x4, SYS_PMBIDR_EL1 // If SPE available at EL2, - and x4, x4, #(1 << SYS_PMBIDR_EL1_P_SHIFT) - cbnz x4, 5f // then permit sampling of physical - mov x4, #(1 << SYS_PMSCR_EL2_PCT_SHIFT | \ - 1 << SYS_PMSCR_EL2_PA_SHIFT) - msr_s SYS_PMSCR_EL2, x4 // addresses and physical counter -5: - mov x1, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) - orr x3, x3, x1 // If we don't have VHE, then - b 7f // use EL1&0 translation. -6: // For VHE, use EL2 translation - orr x3, x3, #MDCR_EL2_TPMS // and disable access from EL1 -7: - msr mdcr_el2, x3 // Configure debug traps - - /* LORegions */ - mrs x1, id_aa64mmfr1_el1 - ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4 - cbz x0, 1f - msr_s SYS_LORC_EL1, xzr -1: - - /* Stage-2 translation */ - msr vttbr_el2, xzr - - cbz x2, install_el2_stub + init_el2_hcr HCR_HOST_NVHE_FLAGS + init_el2_state - mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 + /* Hypervisor stub */ + adr_l x0, __hyp_stub_vectors + msr vbar_el2, x0 isb - ret - -install_el2_stub: - /* - * When VHE is not in use, early init of EL2 and EL1 needs to be - * done here. 
- * When VHE _is_ in use, EL1 will not be used in the host and - * requires no configuration, and all non-hyp-specific EL2 setup - * will be done via the _EL1 system register aliases in __cpu_setup. - */ - mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) - msr sctlr_el1, x0 - /* Coprocessor traps. */ - mov x0, #0x33ff - msr cptr_el2, x0 // Disable copro. traps to EL2 + mov_q x1, INIT_SCTLR_EL1_MMU_OFF - /* SVE register access */ - mrs x1, id_aa64pfr0_el1 - ubfx x1, x1, #ID_AA64PFR0_SVE_SHIFT, #4 - cbz x1, 7f + mrs x0, hcr_el2 + and x0, x0, #HCR_E2H + cbz x0, 2f - bic x0, x0, #CPTR_EL2_TZ // Also disable SVE traps - msr cptr_el2, x0 // Disable copro. traps to EL2 - isb - mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector - msr_s SYS_ZCR_EL2, x1 // length for EL1. - - /* Hypervisor stub */ -7: adr_l x0, __hyp_stub_vectors - msr vbar_el2, x0 + /* Set a sane SCTLR_EL1, the VHE way */ + msr_s SYS_SCTLR_EL12, x1 + mov x2, #BOOT_CPU_FLAG_E2H + b 3f - /* spsr */ - mov x0, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ - PSR_MODE_EL1h) +2: + msr sctlr_el1, x1 + mov x2, xzr +3: + mov x0, #INIT_PSTATE_EL1 msr spsr_el2, x0 - msr elr_el2, lr - mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 - eret -ENDPROC(el2_setup) - -/* - * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed - * in w0. See arch/arm64/include/asm/virt.h for more info. - */ -set_cpu_boot_mode_flag: - adr_l x1, __boot_cpu_mode - cmp w0, #BOOT_CPU_MODE_EL2 - b.ne 1f - add x1, x1, #4 -1: str w0, [x1] // This CPU has booted in EL1 - dmb sy - dc ivac, x1 // Invalidate potentially stale cache line - ret -ENDPROC(set_cpu_boot_mode_flag) -/* - * These values are written with the MMU off, but read with the MMU on. - * Writers will invalidate the corresponding address, discarding up to a - * 'Cache Writeback Granule' (CWG) worth of data. The linker script ensures - * sufficient alignment that the CWG doesn't overlap another section. - */ - .pushsection ".mmuoff.data.write", "aw" -/* - * We need to find out the CPU boot mode long after boot, so we need to - * store it in a writable variable. - * - * This is not in .bss, because we set it sufficiently early that the boot-time - * zeroing of .bss would clobber it. - */ -ENTRY(__boot_cpu_mode) - .long BOOT_CPU_MODE_EL2 - .long BOOT_CPU_MODE_EL1 -/* - * The booting CPU updates the failed status @__early_cpu_boot_status, - * with MMU turned off. - */ -ENTRY(__early_cpu_boot_status) - .long 0 - - .popsection + mov w0, #BOOT_CPU_MODE_EL2 + orr x0, x0, x2 + eret +SYM_FUNC_END(init_kernel_el) /* * This provides a "holding pen" for platforms to hold all secondary * cores are held until we're ready for them to initialise. */ -ENTRY(secondary_holding_pen) - bl el2_setup // Drop to EL1, w0=cpu_boot_mode - bl set_cpu_boot_mode_flag - mrs x0, mpidr_el1 +SYM_FUNC_START(secondary_holding_pen) + mov x0, xzr + bl init_kernel_el // w0=cpu_boot_mode + mrs x2, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK - and x0, x0, x1 + and x2, x2, x1 adr_l x3, secondary_holding_pen_release pen: ldr x4, [x3] - cmp x4, x0 + cmp x4, x2 b.eq secondary_startup wfe b pen -ENDPROC(secondary_holding_pen) +SYM_FUNC_END(secondary_holding_pen) /* * Secondary entry point that jumps straight into the kernel. Only to * be used where CPUs are brought online dynamically by the kernel. 
*/ -ENTRY(secondary_entry) - bl el2_setup // Drop to EL1 - bl set_cpu_boot_mode_flag +SYM_FUNC_START(secondary_entry) + mov x0, xzr + bl init_kernel_el // w0=cpu_boot_mode b secondary_startup -ENDPROC(secondary_entry) +SYM_FUNC_END(secondary_entry) -secondary_startup: +SYM_FUNC_START_LOCAL(secondary_startup) /* * Common entry point for secondary CPUs. */ + mov x20, x0 // preserve boot mode + +#ifdef CONFIG_ARM64_VA_BITS_52 +alternative_if ARM64_HAS_VA52 bl __cpu_secondary_check52bitva +alternative_else_nop_endif +#endif + bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir + adrp x2, idmap_pg_dir bl __enable_mmu ldr x8, =__secondary_switched br x8 -ENDPROC(secondary_startup) +SYM_FUNC_END(secondary_startup) + + .text +SYM_FUNC_START_LOCAL(__secondary_switched) + mov x0, x20 + bl set_cpu_boot_mode_flag + + mov x0, x20 + bl finalise_el2 -__secondary_switched: + str_l xzr, __early_cpu_boot_status, x3 adr_l x5, vectors msr vbar_el1, x5 isb adr_l x0, secondary_data - ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack - mov sp, x1 ldr x2, [x0, #CPU_BOOT_TASK] - msr sp_el0, x2 - mov x29, #0 - mov x30, #0 - b secondary_start_kernel -ENDPROC(__secondary_switched) + cbz x2, __secondary_too_slow + + init_cpu_task x2, x1, x3 + +#ifdef CONFIG_ARM64_PTR_AUTH + ptrauth_keys_init_cpu x2, x3, x4, x5 +#endif + + bl secondary_start_kernel + ASM_BUG() +SYM_FUNC_END(__secondary_switched) + +SYM_FUNC_START_LOCAL(__secondary_too_slow) + wfe + wfi + b __secondary_too_slow +SYM_FUNC_END(__secondary_too_slow) + +/* + * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed + * in w0. See arch/arm64/include/asm/virt.h for more info. + */ +SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag) + adr_l x1, __boot_cpu_mode + cmp w0, #BOOT_CPU_MODE_EL2 + b.ne 1f + add x1, x1, #4 +1: str w0, [x1] // Save CPU boot mode + ret +SYM_FUNC_END(set_cpu_boot_mode_flag) /* * The booting CPU updates the failed status @__early_cpu_boot_status, @@ -767,6 +447,7 @@ ENDPROC(__secondary_switched) * * x0 = SCTLR_EL1 value for turning on the MMU. * x1 = TTBR1_EL1 value + * x2 = ID map root table address * * Returns to the caller via x30/lr. This requires the caller to be covered * by the .idmap.text section. @@ -774,41 +455,35 @@ ENDPROC(__secondary_switched) * Checks if the selected granule size is supported by the CPU. * If it isn't, park the CPU */ -ENTRY(__enable_mmu) - mrs x2, ID_AA64MMFR0_EL1 - ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 - cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED - b.ne __no_granule_support - update_early_cpu_boot_status 0, x2, x3 - adrp x2, idmap_pg_dir - phys_to_ttbr x1, x1 + .section ".idmap.text","a" +SYM_FUNC_START(__enable_mmu) + mrs x3, ID_AA64MMFR0_EL1 + ubfx x3, x3, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 + cmp x3, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN + b.lt __no_granule_support + cmp x3, #ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX + b.gt __no_granule_support phys_to_ttbr x2, x2 msr ttbr0_el1, x2 // load TTBR0 - offset_ttbr1 x1 - msr ttbr1_el1, x1 // load TTBR1 - isb - msr sctlr_el1, x0 - isb - /* - * Invalidate the local I-cache so that any instructions fetched - * speculatively from the PoC are discarded, since they may have - * been dynamically patched at the PoU. 
- */ - ic iallu - dsb nsh - isb - ret -ENDPROC(__enable_mmu) + load_ttbr1 x1, x1, x3 -ENTRY(__cpu_secondary_check52bitva) -#ifdef CONFIG_ARM64_USER_VA_BITS_52 - ldr_l x0, vabits_user - cmp x0, #52 - b.ne 2f + set_sctlr_el1 x0 + ret +SYM_FUNC_END(__enable_mmu) + +#ifdef CONFIG_ARM64_VA_BITS_52 +SYM_FUNC_START(__cpu_secondary_check52bitva) +#ifndef CONFIG_ARM64_LPA2 mrs_s x0, SYS_ID_AA64MMFR2_EL1 - and x0, x0, #(0xf << ID_AA64MMFR2_LVA_SHIFT) + and x0, x0, ID_AA64MMFR2_EL1_VARange_MASK cbnz x0, 2f +#else + mrs x0, id_aa64mmfr0_el1 + sbfx x0, x0, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 + cmp x0, #ID_AA64MMFR0_EL1_TGRAN_LPA2 + b.ge 2f +#endif update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_52_BIT_VA, x0, x1 @@ -816,11 +491,11 @@ ENTRY(__cpu_secondary_check52bitva) wfi b 1b -#endif 2: ret -ENDPROC(__cpu_secondary_check52bitva) +SYM_FUNC_END(__cpu_secondary_check52bitva) +#endif -__no_granule_support: +SYM_FUNC_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_NO_GRAN, x1, x2 @@ -828,73 +503,21 @@ __no_granule_support: wfe wfi b 1b -ENDPROC(__no_granule_support) - -#ifdef CONFIG_RELOCATABLE -__relocate_kernel: - /* - * Iterate over each entry in the relocation table, and apply the - * relocations in place. - */ - ldr w9, =__rela_offset // offset to reloc table - ldr w10, =__rela_size // size of reloc table - - mov_q x11, KIMAGE_VADDR // default virtual offset - add x11, x11, x23 // actual virtual offset - add x9, x9, x11 // __va(.rela) - add x10, x9, x10 // __va(.rela) + sizeof(.rela) - -0: cmp x9, x10 - b.hs 1f - ldp x11, x12, [x9], #24 - ldr x13, [x9, #-8] - cmp w12, #R_AARCH64_RELATIVE - b.ne 0b - add x13, x13, x23 // relocate - str x13, [x11, x23] - b 0b -1: ret -ENDPROC(__relocate_kernel) -#endif +SYM_FUNC_END(__no_granule_support) -__primary_switch: -#ifdef CONFIG_RANDOMIZE_BASE - mov x19, x0 // preserve new SCTLR_EL1 value - mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value -#endif - - adrp x1, init_pg_dir +SYM_FUNC_START_LOCAL(__primary_switch) + adrp x1, reserved_pg_dir + adrp x2, __pi_init_idmap_pg_dir bl __enable_mmu -#ifdef CONFIG_RELOCATABLE - bl __relocate_kernel -#ifdef CONFIG_RANDOMIZE_BASE - ldr x8, =__primary_switched - adrp x0, __PHYS_OFFSET - blr x8 - - /* - * If we return here, we have a KASLR displacement in x23 which we need - * to take into account by discarding the current kernel mapping and - * creating a new one. - */ - pre_disable_mmu_workaround - msr sctlr_el1, x20 // disable the MMU - isb - bl __create_page_tables // recreate kernel mapping - - tlbi vmalle1 // Remove any stale TLB entries - dsb nsh - msr sctlr_el1, x19 // re-enable the MMU - isb - ic iallu // flush instructions fetched - dsb nsh // via old mapping - isb + adrp x1, early_init_stack + mov sp, x1 + mov x29, xzr + mov x0, x20 // pass the full boot status + mov x1, x21 // pass the FDT + bl __pi_early_map_kernel // Map and relocate the kernel - bl __relocate_kernel -#endif -#endif ldr x8, =__primary_switched - adrp x0, __PHYS_OFFSET + adrp x0, KERNEL_START // __pa(KERNEL_START) br x8 -ENDPROC(__primary_switch) +SYM_FUNC_END(__primary_switch) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index fe36d85c60bd..0e1d9c3c6a93 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -1,20 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Hibernate low-level support * * Copyright (C) 2016 ARM Ltd. 
* Author: James Morse <james.morse@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/linkage.h> #include <linux/errno.h> @@ -27,26 +16,6 @@ #include <asm/virt.h> /* - * To prevent the possibility of old and new partial table walks being visible - * in the tlb, switch the ttbr to a zero page when we invalidate the old - * records. D4.7.1 'General TLB maintenance requirements' in ARM DDI 0487A.i - * Even switching to our copied tables will cause a changed output address at - * each stage of the walk. - */ -.macro break_before_make_ttbr_switch zero_page, page_table, tmp - phys_to_ttbr \tmp, \zero_page - msr ttbr1_el1, \tmp - isb - tlbi vmalle1 - dsb nsh - phys_to_ttbr \tmp, \page_table - offset_ttbr1 \tmp - msr ttbr1_el1, \tmp - isb -.endm - - -/* * Resume from hibernate * * Loads temporary page tables then restores the memory image. @@ -56,7 +25,7 @@ * Because this code has to be copied to a 'safe' page, it can't call out to * other functions by PC-relative address. Also remember that it may be * mid-way through over-writing other functions. For this reason it contains - * code from flush_icache_range() and uses the copy_page() macro. + * code from caches_clean_inval_pou() and uses the copy_page() macro. * * This 'safe' page is mapped via ttbr0, and executed from there. This function * switches to a copy of the linear map in ttbr1, performs the restore, then @@ -76,12 +45,12 @@ * x5: physical address of a zero page that remains zero after resume */ .pushsection ".hibernate_exit.text", "ax" -ENTRY(swsusp_arch_suspend_exit) +SYM_CODE_START(swsusp_arch_suspend_exit) /* * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page */ - break_before_make_ttbr_switch x5, x0, x6 + break_before_make_ttbr_switch x5, x0, x6, x8 mov x21, x1 mov x30, x2 @@ -98,11 +67,12 @@ ENTRY(swsusp_arch_suspend_exit) copy_page x0, x1, x2, x3, x4, x5, x6, x7, x8, x9 add x1, x10, #PAGE_SIZE - /* Clean the copied page to PoU - based on flush_icache_range() */ + /* Clean the copied page to PoU - based on caches_clean_inval_pou() */ raw_dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x10, x3 -2: dc cvau, x4 /* clean D line / unified line */ +2: /* clean D line / unified line */ +alternative_insn "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE add x4, x4, x2 cmp x4, x1 b.lo 2b @@ -112,7 +82,7 @@ ENTRY(swsusp_arch_suspend_exit) dsb ish /* wait for PoU cleaning to finish */ /* switch to the restored kernels page tables */ - break_before_make_ttbr_switch x25, x21, x6 + break_before_make_ttbr_switch x25, x21, x6, x8 ic ialluis dsb ish @@ -121,59 +91,5 @@ ENTRY(swsusp_arch_suspend_exit) cbz x24, 3f /* Do we need to re-initialise EL2? */ hvc #0 3: ret - - .ltorg -ENDPROC(swsusp_arch_suspend_exit) - -/* - * Restore the hyp stub. - * This must be done before the hibernate page is unmapped by _cpu_resume(), - * but happens before any of the hyp-stub's code is cleaned to PoC. 
- * - * x24: The physical address of __hyp_stub_vectors - */ -el1_sync: - msr vbar_el2, x24 - eret -ENDPROC(el1_sync) - -.macro invalid_vector label -\label: - b \label -ENDPROC(\label) -.endm - - invalid_vector el2_sync_invalid - invalid_vector el2_irq_invalid - invalid_vector el2_fiq_invalid - invalid_vector el2_error_invalid - invalid_vector el1_sync_invalid - invalid_vector el1_irq_invalid - invalid_vector el1_fiq_invalid - invalid_vector el1_error_invalid - -/* el2 vectors - switch el2 here while we restore the memory image. */ - .align 11 -ENTRY(hibernate_el2_vectors) - ventry el2_sync_invalid // Synchronous EL2t - ventry el2_irq_invalid // IRQ EL2t - ventry el2_fiq_invalid // FIQ EL2t - ventry el2_error_invalid // Error EL2t - - ventry el2_sync_invalid // Synchronous EL2h - ventry el2_irq_invalid // IRQ EL2h - ventry el2_fiq_invalid // FIQ EL2h - ventry el2_error_invalid // Error EL2h - - ventry el1_sync // Synchronous 64-bit EL1 - ventry el1_irq_invalid // IRQ 64-bit EL1 - ventry el1_fiq_invalid // FIQ 64-bit EL1 - ventry el1_error_invalid // Error 64-bit EL1 - - ventry el1_sync_invalid // Synchronous 32-bit EL1 - ventry el1_irq_invalid // IRQ 32-bit EL1 - ventry el1_fiq_invalid // FIQ 32-bit EL1 - ventry el1_error_invalid // Error 32-bit EL1 -END(hibernate_el2_vectors) - +SYM_CODE_END(swsusp_arch_suspend_exit) .popsection diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 29cdc99688f3..18749e9a6c2d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /*: * Hibernate support specific for ARM64 * @@ -6,23 +7,15 @@ * Ubuntu project, hibernation support for mach-dove * Copyright (C) 2010 Nokia Corporation (Hiroshi Doyu) * Copyright (C) 2010 Texas Instruments, Inc. (Teerth Reddy et al.) - * https://lkml.org/lkml/2010/6/18/4 - * https://lists.linux-foundation.org/pipermail/linux-pm/2010-June/027422.html - * https://patchwork.kernel.org/patch/96442/ - * * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * - * License terms: GNU General Public License (GPL) version 2 */ #define pr_fmt(x) "hibernate: " x #include <linux/cpu.h> #include <linux/kvm_host.h> -#include <linux/mm.h> #include <linux/pm.h> #include <linux/sched.h> #include <linux/suspend.h> #include <linux/utsname.h> -#include <linux/version.h> #include <asm/barrier.h> #include <asm/cacheflush.h> @@ -32,14 +25,13 @@ #include <asm/kexec.h> #include <asm/memory.h> #include <asm/mmu_context.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/pgtable-hwdef.h> +#include <asm/mte.h> #include <asm/sections.h> #include <asm/smp.h> #include <asm/smp_plat.h> #include <asm/suspend.h> #include <asm/sysreg.h> +#include <asm/trans_pgd.h> #include <asm/virt.h> /* @@ -53,10 +45,7 @@ extern int in_suspend; /* Do we need to reset el2? */ -#define el2_reset_needed() (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) - -/* temporary el2 vectors in the __hibernate_exit_text section. */ -extern char hibernate_el2_vectors[]; +#define el2_reset_needed() (is_hyp_nvhe()) /* hyp-stub vectors, used to restore el2 during resume from hibernate. 
*/ extern char __hyp_stub_vectors[]; @@ -110,7 +99,6 @@ int pfn_is_nosave(unsigned long pfn) void notrace save_processor_state(void) { - WARN_ON(num_online_cpus() != 1); } void notrace restore_processor_state(void) @@ -167,14 +155,11 @@ int arch_hibernation_header_restore(void *addr) sleep_cpu = -EINVAL; return -EINVAL; } - if (!cpu_online(sleep_cpu)) { - pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n"); - ret = cpu_up(sleep_cpu); - if (ret) { - pr_err("Failed to bring hibernate-CPU up!\n"); - sleep_cpu = -EINVAL; - return ret; - } + + ret = bringup_hibernate_cpu(sleep_cpu); + if (ret) { + sleep_cpu = -EINVAL; + return ret; } resume_hdr = *hdr; @@ -183,9 +168,14 @@ int arch_hibernation_header_restore(void *addr) } EXPORT_SYMBOL(arch_hibernation_header_restore); +static void *hibernate_page_alloc(void *arg) +{ + return (void *)get_safe_page((__force gfp_t)(unsigned long)arg); +} + /* * Copies length bytes, starting at src_start into an new page, - * perform cache maintentance, then maps it at the specified address low + * perform cache maintenance, then maps it at the specified address low * address as executable. * * This is used by hibernate to copy the code it needs to execute when @@ -196,83 +186,149 @@ EXPORT_SYMBOL(arch_hibernation_header_restore); * page system. */ static int create_safe_exec_page(void *src_start, size_t length, - unsigned long dst_addr, - phys_addr_t *phys_dst_addr, - void *(*allocator)(gfp_t mask), - gfp_t mask) + phys_addr_t *phys_dst_addr) { - int rc = 0; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - unsigned long dst = (unsigned long)allocator(mask); - - if (!dst) { - rc = -ENOMEM; - goto out; + struct trans_pgd_info trans_info = { + .trans_alloc_page = hibernate_page_alloc, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, + }; + + void *page = (void *)get_safe_page(GFP_ATOMIC); + phys_addr_t trans_ttbr0; + unsigned long t0sz; + int rc; + + if (!page) + return -ENOMEM; + + memcpy(page, src_start, length); + caches_clean_inval_pou((unsigned long)page, (unsigned long)page + length); + rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page); + if (rc) + return rc; + + cpu_install_ttbr0(trans_ttbr0, t0sz); + *phys_dst_addr = virt_to_phys(page); + + return 0; +} + +#ifdef CONFIG_ARM64_MTE + +static DEFINE_XARRAY(mte_pages); + +static int save_tags(struct page *page, unsigned long pfn) +{ + void *tag_storage, *ret; + + tag_storage = mte_allocate_tag_storage(); + if (!tag_storage) + return -ENOMEM; + + mte_save_page_tags(page_address(page), tag_storage); + + ret = xa_store(&mte_pages, pfn, tag_storage, GFP_KERNEL); + if (WARN(xa_is_err(ret), "Failed to store MTE tags")) { + mte_free_tag_storage(tag_storage); + return xa_err(ret); + } else if (WARN(ret, "swsusp: %s: Duplicate entry", __func__)) { + mte_free_tag_storage(ret); } - memcpy((void *)dst, src_start, length); - __flush_icache_range(dst, dst + length); + return 0; +} - pgdp = pgd_offset_raw(allocator(mask), dst_addr); - if (pgd_none(READ_ONCE(*pgdp))) { - pudp = allocator(mask); - if (!pudp) { - rc = -ENOMEM; - goto out; - } - pgd_populate(&init_mm, pgdp, pudp); +static void swsusp_mte_free_storage(void) +{ + XA_STATE(xa_state, &mte_pages, 0); + void *tags; + + xa_lock(&mte_pages); + xas_for_each(&xa_state, tags, ULONG_MAX) { + mte_free_tag_storage(tags); } + xa_unlock(&mte_pages); + + xa_destroy(&mte_pages); +} + +static int swsusp_mte_save_tags(void) +{ + struct zone *zone; + unsigned long pfn, max_zone_pfn; + int ret = 0; + int n = 0; + + if (!system_supports_mte()) 
+ return 0; + + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { + struct page *page = pfn_to_online_page(pfn); + struct folio *folio; - pudp = pud_offset(pgdp, dst_addr); - if (pud_none(READ_ONCE(*pudp))) { - pmdp = allocator(mask); - if (!pmdp) { - rc = -ENOMEM; - goto out; + if (!page) + continue; + folio = page_folio(page); + + if (folio_test_hugetlb(folio) && + !folio_test_hugetlb_mte_tagged(folio)) + continue; + + if (!page_mte_tagged(page)) + continue; + + ret = save_tags(page, pfn); + if (ret) { + swsusp_mte_free_storage(); + goto out; + } + + n++; } - pud_populate(&init_mm, pudp, pmdp); } + pr_info("Saved %d MTE pages\n", n); - pmdp = pmd_offset(pudp, dst_addr); - if (pmd_none(READ_ONCE(*pmdp))) { - ptep = allocator(mask); - if (!ptep) { - rc = -ENOMEM; - goto out; - } - pmd_populate_kernel(&init_mm, pmdp, ptep); +out: + return ret; +} + +static void swsusp_mte_restore_tags(void) +{ + XA_STATE(xa_state, &mte_pages, 0); + int n = 0; + void *tags; + + xa_lock(&mte_pages); + xas_for_each(&xa_state, tags, ULONG_MAX) { + unsigned long pfn = xa_state.xa_index; + struct page *page = pfn_to_online_page(pfn); + + mte_restore_page_tags(page_address(page), tags); + + mte_free_tag_storage(tags); + n++; } + xa_unlock(&mte_pages); - ptep = pte_offset_kernel(pmdp, dst_addr); - set_pte(ptep, pfn_pte(virt_to_pfn(dst), PAGE_KERNEL_EXEC)); + pr_info("Restored %d MTE pages\n", n); - /* - * Load our new page tables. A strict BBM approach requires that we - * ensure that TLBs are free of any entries that may overlap with the - * global mappings we are about to install. - * - * For a real hibernate/resume cycle TTBR0 currently points to a zero - * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI - * runtime services), while for a userspace-driven test_resume cycle it - * points to userspace page tables (and we must point it at a zero page - * ourselves). Elsewhere we only (un)install the idmap with preemption - * disabled, so T0SZ should be as required regardless. - */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - write_sysreg(phys_to_ttbr(virt_to_phys(pgdp)), ttbr0_el1); - isb(); + xa_destroy(&mte_pages); +} - *phys_dst_addr = virt_to_phys((void *)dst); +#else /* CONFIG_ARM64_MTE */ -out: - return rc; +static int swsusp_mte_save_tags(void) +{ + return 0; } -#define dcache_clean_range(start, end) __flush_dcache_area(start, (end - start)) +static void swsusp_mte_restore_tags(void) +{ +} + +#endif /* CONFIG_ARM64_MTE */ int swsusp_arch_suspend(void) { @@ -291,16 +347,29 @@ int swsusp_arch_suspend(void) /* make the crash dump kernel image visible/saveable */ crash_prepare_suspend(); + ret = swsusp_mte_save_tags(); + if (ret) + return ret; + sleep_cpu = smp_processor_id(); ret = swsusp_save(); } else { /* Clean kernel core startup/idle code to PoC*/ - dcache_clean_range(__mmuoff_data_start, __mmuoff_data_end); - dcache_clean_range(__idmap_text_start, __idmap_text_end); + dcache_clean_inval_poc((unsigned long)__mmuoff_data_start, + (unsigned long)__mmuoff_data_end); + dcache_clean_inval_poc((unsigned long)__idmap_text_start, + (unsigned long)__idmap_text_end); /* Clean kvm setup code to PoC? 
*/ - if (el2_reset_needed()) - dcache_clean_range(__hyp_idmap_text_start, __hyp_idmap_text_end); + if (el2_reset_needed()) { + dcache_clean_inval_poc( + (unsigned long)__hyp_idmap_text_start, + (unsigned long)__hyp_idmap_text_end); + dcache_clean_inval_poc((unsigned long)__hyp_text_start, + (unsigned long)__hyp_text_end); + } + + swsusp_mte_restore_tags(); /* make the crash dump kernel image protected again */ crash_post_resume(); @@ -319,11 +388,7 @@ int swsusp_arch_suspend(void) * mitigation off behind our back, let's set the state * to what we expect it to be. */ - switch (arm64_get_ssbd_state()) { - case ARM64_SSBD_FORCE_ENABLE: - case ARM64_SSBD_KERNEL: - arm64_set_ssbd_mitigation(true); - } + spectre_v4_enable_mitigation(NULL); } local_daif_restore(flags); @@ -331,143 +396,6 @@ int swsusp_arch_suspend(void) return ret; } -static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) -{ - pte_t pte = READ_ONCE(*src_ptep); - - if (pte_valid(pte)) { - /* - * Resume will overwrite areas that may be marked - * read only (code, rodata). Clear the RDONLY bit from - * the temporary mappings we use during restore. - */ - set_pte(dst_ptep, pte_mkwrite(pte)); - } else if (debug_pagealloc_enabled() && !pte_none(pte)) { - /* - * debug_pagealloc will removed the PTE_VALID bit if - * the page isn't in use by the resume kernel. It may have - * been in use by the original kernel, in which case we need - * to put it back in our copy to do the restore. - * - * Before marking this entry valid, check the pfn should - * be mapped. - */ - BUG_ON(!pfn_valid(pte_pfn(pte))); - - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte))); - } -} - -static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, - unsigned long end) -{ - pte_t *src_ptep; - pte_t *dst_ptep; - unsigned long addr = start; - - dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC); - if (!dst_ptep) - return -ENOMEM; - pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); - dst_ptep = pte_offset_kernel(dst_pmdp, start); - - src_ptep = pte_offset_kernel(src_pmdp, start); - do { - _copy_pte(dst_ptep, src_ptep, addr); - } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); - - return 0; -} - -static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, - unsigned long end) -{ - pmd_t *src_pmdp; - pmd_t *dst_pmdp; - unsigned long next; - unsigned long addr = start; - - if (pud_none(READ_ONCE(*dst_pudp))) { - dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pmdp) - return -ENOMEM; - pud_populate(&init_mm, dst_pudp, dst_pmdp); - } - dst_pmdp = pmd_offset(dst_pudp, start); - - src_pmdp = pmd_offset(src_pudp, start); - do { - pmd_t pmd = READ_ONCE(*src_pmdp); - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - continue; - if (pmd_table(pmd)) { - if (copy_pte(dst_pmdp, src_pmdp, addr, next)) - return -ENOMEM; - } else { - set_pmd(dst_pmdp, - __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); - } - } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); - - return 0; -} - -static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, - unsigned long end) -{ - pud_t *dst_pudp; - pud_t *src_pudp; - unsigned long next; - unsigned long addr = start; - - if (pgd_none(READ_ONCE(*dst_pgdp))) { - dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pudp) - return -ENOMEM; - pgd_populate(&init_mm, dst_pgdp, dst_pudp); - } - dst_pudp = pud_offset(dst_pgdp, start); - - src_pudp = pud_offset(src_pgdp, start); - do { - pud_t pud = READ_ONCE(*src_pudp); - - next = pud_addr_end(addr, end); - if 
(pud_none(pud)) - continue; - if (pud_table(pud)) { - if (copy_pmd(dst_pudp, src_pudp, addr, next)) - return -ENOMEM; - } else { - set_pud(dst_pudp, - __pud(pud_val(pud) & ~PMD_SECT_RDONLY)); - } - } while (dst_pudp++, src_pudp++, addr = next, addr != end); - - return 0; -} - -static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, - unsigned long end) -{ - unsigned long next; - unsigned long addr = start; - pgd_t *src_pgdp = pgd_offset_k(start); - - dst_pgdp = pgd_offset_raw(dst_pgdp, start); - do { - next = pgd_addr_end(addr, end); - if (pgd_none(READ_ONCE(*src_pgdp))) - continue; - if (copy_pud(dst_pgdp, src_pgdp, addr, next)) - return -ENOMEM; - } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); - - return 0; -} - /* * Setup then Resume from the hibernate image using swsusp_arch_suspend_exit(). * @@ -476,85 +404,72 @@ static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, */ int swsusp_arch_resume(void) { - int rc = 0; + int rc; void *zero_page; size_t exit_size; pgd_t *tmp_pg_dir; - phys_addr_t phys_hibernate_exit; + phys_addr_t el2_vectors; void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, void *, phys_addr_t, phys_addr_t); + struct trans_pgd_info trans_info = { + .trans_alloc_page = hibernate_page_alloc, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, + }; /* * Restoring the memory image will overwrite the ttbr1 page tables. * Create a second copy of just the linear map, and use this when * restoring. */ - tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!tmp_pg_dir) { - pr_err("Failed to allocate memory for temporary page tables.\n"); - rc = -ENOMEM; - goto out; - } - rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0); + rc = trans_pgd_create_copy(&trans_info, &tmp_pg_dir, PAGE_OFFSET, + PAGE_END); if (rc) - goto out; + return rc; /* - * We need a zero page that is zero before & after resume in order to + * We need a zero page that is zero before & after resume in order * to break before make on the ttbr1 page tables. */ zero_page = (void *)get_safe_page(GFP_ATOMIC); if (!zero_page) { pr_err("Failed to allocate zero page.\n"); - rc = -ENOMEM; - goto out; + return -ENOMEM; + } + + if (el2_reset_needed()) { + rc = trans_pgd_copy_el2_vectors(&trans_info, &el2_vectors); + if (rc) { + pr_err("Failed to setup el2 vectors\n"); + return rc; + } } - /* - * Locate the exit code in the bottom-but-one page, so that *NULL - * still has disastrous affects. - */ - hibernate_exit = (void *)PAGE_SIZE; exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start; /* * Copy swsusp_arch_suspend_exit() to a safe page. This will generate * a new set of ttbr0 page tables and load them. */ rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size, - (unsigned long)hibernate_exit, - &phys_hibernate_exit, - (void *)get_safe_page, GFP_ATOMIC); + (phys_addr_t *)&hibernate_exit); if (rc) { pr_err("Failed to create safe executable page for hibernate_exit code.\n"); - goto out; + return rc; } /* - * The hibernate exit text contains a set of el2 vectors, that will - * be executed at el2 with the mmu off in order to reload hyp-stub. - */ - __flush_dcache_area(hibernate_exit, exit_size); - - /* * KASLR will cause the el2 vectors to be in a different location in * the resumed kernel. Load hibernate's temporary copy into el2. * * We can skip this step if we booted at EL1, or are running with VHE. 
*/ - if (el2_reset_needed()) { - phys_addr_t el2_vectors = phys_hibernate_exit; /* base */ - el2_vectors += hibernate_el2_vectors - - __hibernate_exit_text_start; /* offset */ - + if (el2_reset_needed()) __hyp_set_vectors(el2_vectors); - } hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1, resume_hdr.reenter_kernel, restore_pblist, resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page)); -out: - return rc; + return 0; } int hibernate_resume_nonboot_cpu_disable(void) diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 8c9644376326..ab76b36dce82 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -1,21 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, * using the CPU's debug registers. * * Copyright (C) 2012 ARM Limited * Author: Will Deacon <will.deacon@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) "hw-breakpoint: " fmt @@ -32,6 +21,8 @@ #include <asm/current.h> #include <asm/debug-monitors.h> +#include <asm/esr.h> +#include <asm/exception.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/cputype.h> @@ -62,7 +53,7 @@ int hw_breakpoint_slots(int type) case TYPE_DATA: return get_num_wrps(); default: - pr_warning("unknown slot type: %d\n", type); + pr_warn("unknown slot type: %d\n", type); return 0; } } @@ -123,7 +114,7 @@ static u64 read_wb_reg(int reg, int n) GEN_READ_WB_REG_CASES(AARCH64_DBG_REG_WVR, AARCH64_DBG_REG_NAME_WVR, val); GEN_READ_WB_REG_CASES(AARCH64_DBG_REG_WCR, AARCH64_DBG_REG_NAME_WCR, val); default: - pr_warning("attempt to read from unknown breakpoint register %d\n", n); + pr_warn("attempt to read from unknown breakpoint register %d\n", n); } return val; @@ -138,7 +129,7 @@ static void write_wb_reg(int reg, int n, u64 val) GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_WVR, AARCH64_DBG_REG_NAME_WVR, val); GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_WCR, AARCH64_DBG_REG_NAME_WCR, val); default: - pr_warning("attempt to write to unknown breakpoint register %d\n", n); + pr_warn("attempt to write to unknown breakpoint register %d\n", n); } isb(); } @@ -156,7 +147,7 @@ static enum dbg_active_el debug_exception_level(int privilege) case AARCH64_BREAKPOINT_EL1: return DBG_ACTIVE_EL1; default: - pr_warning("invalid breakpoint privilege level %d\n", privilege); + pr_warn("invalid breakpoint privilege level %d\n", privilege); return -EINVAL; } } @@ -268,7 +259,7 @@ static int hw_breakpoint_control(struct perf_event *bp, * level. */ enable_debug_monitors(dbg_el); - /* Fall through */ + fallthrough; case HW_BREAKPOINT_RESTORE: /* Setup the address register. */ write_wb_reg(val_reg, i, info->address); @@ -547,13 +538,18 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, /* Aligned */ break; case 1: - /* Allow single byte watchpoint. */ - if (hw->ctrl.len == ARM_BREAKPOINT_LEN_1) - break; case 2: /* Allow halfword watchpoints and breakpoints. 
*/ if (hw->ctrl.len == ARM_BREAKPOINT_LEN_2) break; + + fallthrough; + case 3: + /* Allow single byte watchpoint. */ + if (hw->ctrl.len == ARM_BREAKPOINT_LEN_1) + break; + + fallthrough; default: return -EINVAL; } @@ -623,8 +619,7 @@ NOKPROBE_SYMBOL(toggle_bp_registers); /* * Debug exception handlers. */ -static int breakpoint_handler(unsigned long unused, unsigned int esr, - struct pt_regs *regs) +void do_breakpoint(unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step; u32 ctrl_reg; @@ -667,7 +662,7 @@ unlock: } if (!step) - return 0; + return; if (user_mode(regs)) { debug_info->bps_disabled = 1; @@ -675,7 +670,7 @@ unlock: /* If we're already stepping a watchpoint, just return. */ if (debug_info->wps_disabled) - return 0; + return; if (test_thread_flag(TIF_SINGLESTEP)) debug_info->suspended_step = 1; @@ -686,7 +681,7 @@ unlock: kernel_step = this_cpu_ptr(&stepping_kernel_bp); if (*kernel_step != ARM_KERNEL_STEP_NONE) - return 0; + return; if (kernel_active_single_step()) { *kernel_step = ARM_KERNEL_STEP_SUSPEND; @@ -695,10 +690,8 @@ unlock: kernel_enable_single_step(regs); } } - - return 0; } -NOKPROBE_SYMBOL(breakpoint_handler); +NOKPROBE_SYMBOL(do_breakpoint); /* * Arm64 hardware does not always report a watchpoint hit address that matches @@ -707,7 +700,7 @@ NOKPROBE_SYMBOL(breakpoint_handler); * addresses. There is no straight-forward way, short of disassembling the * offending instruction, to map that address back to the watchpoint. This * function computes the distance of the memory access from the watchpoint as a - * heuristic for the likelyhood that a given access triggered the watchpoint. + * heuristic for the likelihood that a given access triggered the watchpoint. * * See Section D2.10.5 "Determining the memory location that caused a Watchpoint * exception" of ARMv8 Architecture Reference Manual for details. @@ -736,8 +729,28 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, return 0; } -static int watchpoint_handler(unsigned long addr, unsigned int esr, - struct pt_regs *regs) +static int watchpoint_report(struct perf_event *wp, unsigned long addr, + struct pt_regs *regs) +{ + int step = is_default_overflow_handler(wp); + struct arch_hw_breakpoint *info = counter_arch_bp(wp); + + info->trigger = addr; + + /* + * If we triggered a user watchpoint from a uaccess routine, then + * handle the stepping ourselves since userspace really can't help + * us with this. + */ + if (!user_mode(regs) && info->ctrl.privilege == AARCH64_BREAKPOINT_EL0) + step = 1; + else + perf_bp_event(wp, regs); + + return step; +} + +void do_watchpoint(unsigned long addr, unsigned long esr, struct pt_regs *regs) { int i, step = 0, *kernel_step, access, closest_match = 0; u64 min_dist = -1, dist; @@ -745,7 +758,6 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; - struct arch_hw_breakpoint *info; struct arch_hw_breakpoint_ctrl ctrl; slots = this_cpu_ptr(wp_on_reg); @@ -765,7 +777,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, * Check that the access type matches. * 0 => load, otherwise => store */ - access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : + access = (esr & ESR_ELx_WNR) ? 
HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) continue; @@ -783,29 +795,17 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, if (dist != 0) continue; - info = counter_arch_bp(wp); - info->trigger = addr; - perf_bp_event(wp, regs); - - /* Do we need to handle the stepping? */ - if (is_default_overflow_handler(wp)) - step = 1; + step = watchpoint_report(wp, addr, regs); } - if (min_dist > 0 && min_dist != -1) { - /* No exact match found. */ - wp = slots[closest_match]; - info = counter_arch_bp(wp); - info->trigger = addr; - perf_bp_event(wp, regs); - /* Do we need to handle the stepping? */ - if (is_default_overflow_handler(wp)) - step = 1; - } + /* No exact match found? */ + if (min_dist > 0 && min_dist != -1) + step = watchpoint_report(slots[closest_match], addr, regs); + rcu_read_unlock(); if (!step) - return 0; + return; /* * We always disable EL0 watchpoints because the kernel can @@ -818,7 +818,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, /* If we're already stepping a breakpoint, just return. */ if (debug_info->bps_disabled) - return 0; + return; if (test_thread_flag(TIF_SINGLESTEP)) debug_info->suspended_step = 1; @@ -829,7 +829,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, kernel_step = this_cpu_ptr(&stepping_kernel_bp); if (*kernel_step != ARM_KERNEL_STEP_NONE) - return 0; + return; if (kernel_active_single_step()) { *kernel_step = ARM_KERNEL_STEP_SUSPEND; @@ -838,44 +838,41 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, kernel_enable_single_step(regs); } } - - return 0; } -NOKPROBE_SYMBOL(watchpoint_handler); +NOKPROBE_SYMBOL(do_watchpoint); /* * Handle single-step exception. */ -int reinstall_suspended_bps(struct pt_regs *regs) +bool try_step_suspended_breakpoints(struct pt_regs *regs) { struct debug_info *debug_info = ¤t->thread.debug; - int handled_exception = 0, *kernel_step; - - kernel_step = this_cpu_ptr(&stepping_kernel_bp); + int *kernel_step = this_cpu_ptr(&stepping_kernel_bp); + bool handled_exception = false; /* - * Called from single-step exception handler. - * Return 0 if execution can resume, 1 if a SIGTRAP should be - * reported. + * Called from single-step exception entry. + * Return true if we stepped a breakpoint and can resume execution, + * false if we need to handle a single-step. */ if (user_mode(regs)) { if (debug_info->bps_disabled) { debug_info->bps_disabled = 0; toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL0, 1); - handled_exception = 1; + handled_exception = true; } if (debug_info->wps_disabled) { debug_info->wps_disabled = 0; toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL0, 1); - handled_exception = 1; + handled_exception = true; } if (handled_exception) { if (debug_info->suspended_step) { debug_info->suspended_step = 0; /* Allow exception handling to fall-through. */ - handled_exception = 0; + handled_exception = false; } else { user_disable_single_step(current); } @@ -889,17 +886,17 @@ int reinstall_suspended_bps(struct pt_regs *regs) if (*kernel_step != ARM_KERNEL_STEP_SUSPEND) { kernel_disable_single_step(); - handled_exception = 1; + handled_exception = true; } else { - handled_exception = 0; + handled_exception = false; } *kernel_step = ARM_KERNEL_STEP_NONE; } - return !handled_exception; + return handled_exception; } -NOKPROBE_SYMBOL(reinstall_suspended_bps); +NOKPROBE_SYMBOL(try_step_suspended_breakpoints); /* * Context-switcher for restoring suspended breakpoints. 
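A rough, stand-alone sketch of the distance heuristic that get_distance_from_watchpoint() implements in the hunks above, for illustration only: the struct, field names and the simplified byte-address-select model used here are assumptions of the sketch, not the kernel's arch_hw_breakpoint layout.

	/* Build with: cc -o wp_dist wp_dist.c (hypothetical file name) */
	#include <stdint.h>
	#include <stdio.h>

	/* Simplified model of one watchpoint: base address + byte-select bitmap. */
	struct wp_model {
		uint64_t base;  /* watchpoint value register (start address) */
		uint8_t  bas;   /* byte address select: which bytes are watched */
	};

	/* Distance of an access from the watched byte range; 0 means exact hit. */
	static uint64_t wp_distance(uint64_t access, const struct wp_model *wp)
	{
		uint64_t lo, hi;

		if (!wp->bas)
			return UINT64_MAX;      /* nothing watched */

		lo = wp->base + __builtin_ctz(wp->bas);
		hi = wp->base + 63 - __builtin_clzll((uint64_t)wp->bas);

		if (access < lo)
			return lo - access;
		if (access > hi)
			return access - hi;
		return 0;
	}

	int main(void)
	{
		struct wp_model wp = { .base = 0x1000, .bas = 0x0f }; /* watch 0x1000..0x1003 */

		printf("%llu\n", (unsigned long long)wp_distance(0xffe, &wp));  /* 2: reported below the range */
		printf("%llu\n", (unsigned long long)wp_distance(0x1002, &wp)); /* 0: within the watched bytes */
		return 0;
	}

When no watchpoint reports distance 0, the handler falls back to the one with the smallest distance, which is what the min_dist/closest_match bookkeeping fed into watchpoint_report() does in the hunks above.
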
@@ -971,14 +968,6 @@ static int hw_breakpoint_reset(unsigned int cpu) return 0; } -#ifdef CONFIG_CPU_PM -extern void cpu_suspend_set_dbg_restorer(int (*hw_bp_restore)(unsigned int)); -#else -static inline void cpu_suspend_set_dbg_restorer(int (*hw_bp_restore)(unsigned int)) -{ -} -#endif - /* * One-time initialisation. */ @@ -992,12 +981,6 @@ static int __init arch_hw_breakpoint_init(void) pr_info("found %d breakpoint and %d watchpoint registers.\n", core_num_brps, core_num_wrps); - /* Register debug fault handlers. */ - hook_debug_fault_code(DBG_ESR_EVT_HWBP, breakpoint_handler, SIGTRAP, - TRAP_HWBKPT, "hw-breakpoint handler"); - hook_debug_fault_code(DBG_ESR_EVT_HWWP, watchpoint_handler, SIGTRAP, - TRAP_HWBKPT, "hw-watchpoint handler"); - /* * Reset the breakpoint resources. We assume that a halting * debugger will leave the world in a nice state for us. diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index e1261fbaa374..085bc9972f6b 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -1,47 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Hypervisor stub * * Copyright (C) 2012 ARM Ltd. * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/init.h> #include <linux/linkage.h> -#include <linux/irqchip/arm-gic-v3.h> #include <asm/assembler.h> +#include <asm/el2_setup.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/ptrace.h> #include <asm/virt.h> .text + .pushsection .hyp.text, "ax" + .align 11 -ENTRY(__hyp_stub_vectors) +SYM_CODE_START(__hyp_stub_vectors) ventry el2_sync_invalid // Synchronous EL2t ventry el2_irq_invalid // IRQ EL2t ventry el2_fiq_invalid // FIQ EL2t ventry el2_error_invalid // Error EL2t - ventry el2_sync_invalid // Synchronous EL2h + ventry elx_sync // Synchronous EL2h ventry el2_irq_invalid // IRQ EL2h ventry el2_fiq_invalid // FIQ EL2h ventry el2_error_invalid // Error EL2h - ventry el1_sync // Synchronous 64-bit EL1 + ventry elx_sync // Synchronous 64-bit EL1 ventry el1_irq_invalid // IRQ 64-bit EL1 ventry el1_fiq_invalid // FIQ 64-bit EL1 ventry el1_error_invalid // Error 64-bit EL1 @@ -50,16 +41,24 @@ ENTRY(__hyp_stub_vectors) ventry el1_irq_invalid // IRQ 32-bit EL1 ventry el1_fiq_invalid // FIQ 32-bit EL1 ventry el1_error_invalid // Error 32-bit EL1 -ENDPROC(__hyp_stub_vectors) +SYM_CODE_END(__hyp_stub_vectors) .align 11 -el1_sync: +SYM_CODE_START_LOCAL(elx_sync) cmp x0, #HVC_SET_VECTORS - b.ne 2f + b.ne 1f msr vbar_el2, x1 b 9f +1: cmp x0, #HVC_FINALISE_EL2 + b.eq __finalise_el2 + + cmp x0, #HVC_GET_ICH_VTR_EL2 + b.ne 2f + mrs_s x1, SYS_ICH_VTR_EL2 + b 9f + 2: cmp x0, #HVC_SOFT_RESTART b.ne 3f mov x0, x2 @@ -72,17 +71,129 @@ el1_sync: beq 9f // Nothing to reset! /* Someone called kvm_call_hyp() against the hyp-stub... 
*/ - ldr x0, =HVC_STUB_ERR + mov_q x0, HVC_STUB_ERR eret 9: mov x0, xzr eret -ENDPROC(el1_sync) +SYM_CODE_END(elx_sync) + +SYM_CODE_START_LOCAL(__finalise_el2) + finalise_el2_state + + // nVHE? No way! Give me the real thing! + // Sanity check: MMU *must* be off + mrs x1, sctlr_el2 + tbnz x1, #0, 1f + + // Needs to be VHE capable, obviously + check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 0f 1f x1 x2 + +0: // Check whether we only want the hypervisor to run VHE, not the kernel + adr_l x1, arm64_sw_feature_override + ldr x2, [x1, FTR_OVR_VAL_OFFSET] + ldr x1, [x1, FTR_OVR_MASK_OFFSET] + and x2, x2, x1 + ubfx x2, x2, #ARM64_SW_FEATURE_OVERRIDE_HVHE, #4 + cbz x2, 2f + +1: mov_q x0, HVC_STUB_ERR + eret +2: + // Engage the VHE magic! + mov_q x0, HCR_HOST_VHE_FLAGS + msr_hcr_el2 x0 + isb + + // Use the EL1 allocated stack, per-cpu offset + mrs x0, sp_el1 + mov sp, x0 + mrs x0, tpidr_el1 + msr tpidr_el2, x0 + + // FP configuration, vectors + mrs_s x0, SYS_CPACR_EL12 + msr cpacr_el1, x0 + mrs_s x0, SYS_VBAR_EL12 + msr vbar_el1, x0 + + // Use EL2 translations for SPE & TRBE and disable access from EL1 + mrs x0, mdcr_el2 + bic x0, x0, #MDCR_EL2_E2PB_MASK + bic x0, x0, #MDCR_EL2_E2TB_MASK + msr mdcr_el2, x0 + + // Transfer the MM state from EL1 to EL2 + mrs_s x0, SYS_TCR_EL12 + msr tcr_el1, x0 + mrs_s x0, SYS_TTBR0_EL12 + msr ttbr0_el1, x0 + mrs_s x0, SYS_TTBR1_EL12 + msr ttbr1_el1, x0 + mrs_s x0, SYS_MAIR_EL12 + msr mair_el1, x0 + mrs x1, REG_ID_AA64MMFR3_EL1 + ubfx x1, x1, #ID_AA64MMFR3_EL1_TCRX_SHIFT, #4 + cbz x1, .Lskip_tcr2 + mrs x0, REG_TCR2_EL12 + msr REG_TCR2_EL1, x0 + + // Transfer permission indirection state + mrs x1, REG_ID_AA64MMFR3_EL1 + ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 + cbz x1, .Lskip_indirection + mrs x0, REG_PIRE0_EL12 + msr REG_PIRE0_EL1, x0 + mrs x0, REG_PIR_EL12 + msr REG_PIR_EL1, x0 + +.Lskip_indirection: +.Lskip_tcr2: + + isb + + // Hack the exception return to stay at EL2 + mrs x0, spsr_el1 + and x0, x0, #~PSR_MODE_MASK + mov x1, #PSR_MODE_EL2h + orr x0, x0, x1 + msr spsr_el1, x0 + + b enter_vhe +SYM_CODE_END(__finalise_el2) + + // At the point where we reach enter_vhe(), we run with + // the MMU off (which is enforced by __finalise_el2()). + // We thus need to be in the idmap, or everything will + // explode when enabling the MMU. + + .pushsection .idmap.text, "ax" + +SYM_CODE_START_LOCAL(enter_vhe) + // Invalidate TLBs before enabling the MMU + tlbi vmalle1 + dsb nsh + isb + + // Enable the EL2 S1 MMU, as set up from EL1 + mrs_s x0, SYS_SCTLR_EL12 + set_sctlr_el1 x0 + + // Disable the EL1 S1 MMU for a good measure + mov_q x0, INIT_SCTLR_EL1_MMU_OFF + msr_s SYS_SCTLR_EL12, x0 + + mov x0, xzr + + eret +SYM_CODE_END(enter_vhe) + + .popsection .macro invalid_vector label -\label: +SYM_CODE_START_LOCAL(\label) b \label -ENDPROC(\label) +SYM_CODE_END(\label) .endm invalid_vector el2_sync_invalid @@ -94,6 +205,8 @@ ENDPROC(\label) invalid_vector el1_fiq_invalid invalid_vector el1_error_invalid + .popsection + /* * __hyp_set_vectors: Call this after boot to set the initial hypervisor * vectors as part of hypervisor installation. On an SMP system, this should @@ -115,15 +228,36 @@ ENDPROC(\label) * initialisation entry point. 
*/ -ENTRY(__hyp_set_vectors) +SYM_FUNC_START(__hyp_set_vectors) mov x1, x0 mov x0, #HVC_SET_VECTORS hvc #0 ret -ENDPROC(__hyp_set_vectors) +SYM_FUNC_END(__hyp_set_vectors) -ENTRY(__hyp_reset_vectors) +SYM_FUNC_START(__hyp_reset_vectors) mov x0, #HVC_RESET_VECTORS hvc #0 ret -ENDPROC(__hyp_reset_vectors) +SYM_FUNC_END(__hyp_reset_vectors) + +/* + * Entry point to finalise EL2 and switch to VHE if deemed capable + * + * w0: boot mode, as returned by init_kernel_el() + */ +SYM_FUNC_START(finalise_el2) + // Need to have booted at EL2 + cmp w0, #BOOT_CPU_MODE_EL2 + b.ne 1f + + // and still be at EL1 + mrs x0, CurrentEL + cmp x0, #CurrentEL_EL1 + b.ne 1f + + mov x0, #HVC_FINALISE_EL2 + hvc #0 +1: + ret +SYM_FUNC_END(finalise_el2) diff --git a/arch/arm64/kernel/idle.c b/arch/arm64/kernel/idle.c new file mode 100644 index 000000000000..05cfb347ec26 --- /dev/null +++ b/arch/arm64/kernel/idle.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Low-level idle sequences + */ + +#include <linux/cpu.h> +#include <linux/irqflags.h> + +#include <asm/barrier.h> +#include <asm/cpuidle.h> +#include <asm/cpufeature.h> +#include <asm/sysreg.h> + +/* + * cpu_do_idle() + * + * Idle the processor (wait for interrupt). + * + * If the CPU supports priority masking we must do additional work to + * ensure that interrupts are not masked at the PMR (because the core will + * not wake up if we block the wake up signal in the interrupt controller). + */ +void __cpuidle cpu_do_idle(void) +{ + struct arm_cpuidle_irq_context context; + + arm_cpuidle_save_irq_context(&context); + + dsb(sy); + wfi(); + + arm_cpuidle_restore_irq_context(&context); +} + +/* + * This is our default idle handler. + */ +void __cpuidle arch_cpu_idle(void) +{ + /* + * This should do all the clock switching and wait for interrupt + * tricks + */ + cpu_do_idle(); +} diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h new file mode 100644 index 000000000000..85bc629270bd --- /dev/null +++ b/arch/arm64/kernel/image-vars.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Linker script variables to be set after section resolution, as + * ld.lld does not like variables assigned before SECTIONS is processed. + */ +#ifndef __ARM64_KERNEL_IMAGE_VARS_H +#define __ARM64_KERNEL_IMAGE_VARS_H + +#ifndef LINKER_SCRIPT +#error This file should only be included in vmlinux.lds.S +#endif + +#if defined(CONFIG_LD_IS_LLD) && CONFIG_LLD_VERSION < 210000 +#define ASSERT(...) +#endif + +#define PI_EXPORT_SYM(sym) \ + __PI_EXPORT_SYM(sym, __pi_ ## sym, Cannot export BSS symbol sym to startup code) +#define __PI_EXPORT_SYM(sym, pisym, msg)\ + PROVIDE(pisym = sym); \ + ASSERT((sym - KIMAGE_VADDR) < (__bss_start - KIMAGE_VADDR), #msg) + +PROVIDE(__efistub_primary_entry = primary_entry); + +/* + * The EFI stub has its own symbol namespace prefixed by __efistub_, to + * isolate it from the kernel proper. The following symbols are legally + * accessed by the stub, so provide some aliases to make them accessible. + * Only include data symbols here, or text symbols of functions that are + * guaranteed to be safe when executed at another offset than they were + * linked at. 
The routines below are all implemented in assembler in a + * position independent manner + */ +PROVIDE(__efistub_caches_clean_inval_pou = __pi_caches_clean_inval_pou); + +PROVIDE(__efistub__text = _text); +PROVIDE(__efistub__end = _end); +PROVIDE(__efistub___inittext_end = __inittext_end); +PROVIDE(__efistub__edata = _edata); +#if defined(CONFIG_EFI_EARLYCON) || defined(CONFIG_SYSFB) +PROVIDE(__efistub_screen_info = screen_info); +#endif +PROVIDE(__efistub__ctype = _ctype); + +PROVIDE(__pi___memcpy = __pi_memcpy); +PROVIDE(__pi___memmove = __pi_memmove); +PROVIDE(__pi___memset = __pi_memset); + +PI_EXPORT_SYM(id_aa64isar1_override); +PI_EXPORT_SYM(id_aa64isar2_override); +PI_EXPORT_SYM(id_aa64mmfr0_override); +PI_EXPORT_SYM(id_aa64mmfr1_override); +PI_EXPORT_SYM(id_aa64mmfr2_override); +PI_EXPORT_SYM(id_aa64pfr0_override); +PI_EXPORT_SYM(id_aa64pfr1_override); +PI_EXPORT_SYM(id_aa64smfr0_override); +PI_EXPORT_SYM(id_aa64zfr0_override); +PI_EXPORT_SYM(arm64_sw_feature_override); +PI_EXPORT_SYM(arm64_use_ng_mappings); +PI_EXPORT_SYM(_ctype); + +PI_EXPORT_SYM(swapper_pg_dir); + +PI_EXPORT_SYM(_text); +PI_EXPORT_SYM(_stext); +PI_EXPORT_SYM(_etext); +PI_EXPORT_SYM(__start_rodata); +PI_EXPORT_SYM(__inittext_begin); +PI_EXPORT_SYM(__inittext_end); +PI_EXPORT_SYM(__initdata_begin); +PI_EXPORT_SYM(__initdata_end); +PI_EXPORT_SYM(_data); + +#ifdef CONFIG_KVM + +/* + * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_, to + * separate it from the kernel proper. The following symbols are legally + * accessed by it, therefore provide aliases to make them linkable. + * Do not include symbols which may not be safely accessed under hypervisor + * memory mappings. + */ + +/* Alternative callbacks for init-time patching of nVHE hyp code. */ +KVM_NVHE_ALIAS(kvm_patch_vector_branch); +KVM_NVHE_ALIAS(kvm_update_va_mask); +KVM_NVHE_ALIAS(kvm_get_kimage_voffset); +KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0); +KVM_NVHE_ALIAS(spectre_bhb_patch_loop_iter); +KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); +KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); +KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); +KVM_NVHE_ALIAS(alt_cb_patch_nops); +KVM_NVHE_ALIAS(kvm_compute_ich_hcr_trap_bits); + +/* Global kernel state accessed by nVHE hyp code. */ +KVM_NVHE_ALIAS(kvm_vgic_global_state); + +/* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */ +KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); + +/* Vectors installed by hyp-init on reset HVC. */ +KVM_NVHE_ALIAS(__hyp_stub_vectors); + +/* Static keys which are set if a vGIC trap should be handled in hyp. 
*/ +KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); +KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); + +/* Static key indicating whether GICv3 has GICv2 compatibility */ +KVM_NVHE_ALIAS(vgic_v3_has_v2_compat); + +/* Static key which is set if CNTVOFF_EL2 is unusable */ +KVM_NVHE_ALIAS(broken_cntvoff_key); + +/* EL2 exception handling */ +KVM_NVHE_ALIAS(__start___kvm_ex_table); +KVM_NVHE_ALIAS(__stop___kvm_ex_table); + +/* Position-independent library routines */ +KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); +KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page); +KVM_NVHE_ALIAS_HYP(memcpy, __pi_memcpy); +KVM_NVHE_ALIAS_HYP(memset, __pi_memset); + +#ifdef CONFIG_KASAN +KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy); +KVM_NVHE_ALIAS_HYP(__memset, __pi_memset); +#endif + +/* Hyp memory sections */ +KVM_NVHE_ALIAS(__hyp_idmap_text_start); +KVM_NVHE_ALIAS(__hyp_idmap_text_end); +KVM_NVHE_ALIAS(__hyp_text_start); +KVM_NVHE_ALIAS(__hyp_text_end); +KVM_NVHE_ALIAS(__hyp_bss_start); +KVM_NVHE_ALIAS(__hyp_bss_end); +KVM_NVHE_ALIAS(__hyp_data_start); +KVM_NVHE_ALIAS(__hyp_data_end); +KVM_NVHE_ALIAS(__hyp_rodata_start); +KVM_NVHE_ALIAS(__hyp_rodata_end); + +/* pKVM static key */ +KVM_NVHE_ALIAS(kvm_protected_mode_initialized); + +#endif /* CONFIG_KVM */ + +#ifdef CONFIG_EFI_ZBOOT +_kernel_codesize = ABSOLUTE(__inittext_end - _text); +#endif + +/* + * LLD will occasionally error out with a '__init_end does not converge' error + * if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this results in a + * circular dependency. Counter this by dimensioning the initial IDMAP page + * tables based on kimage_limit, which is defined such that its value should + * not change as a result of the initdata segment being pushed over a 64k + * segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided that its + * value doesn't change by more than 2M between linker passes. + */ +kimage_limit = ALIGN(ABSOLUTE(_end + SZ_64K), SZ_2M); + +#undef ASSERT + +#endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 33f14e484040..7bc3ba897901 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -1,19 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Linker script macros to generate Image header fields. * * Copyright (C) 2014 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef __ARM64_KERNEL_IMAGE_H #define __ARM64_KERNEL_IMAGE_H @@ -73,45 +62,6 @@ */ #define HEAD_SYMBOLS \ DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \ - DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \ DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS); -#ifdef CONFIG_EFI - -__efistub_stext_offset = stext - _text; - -/* - * The EFI stub has its own symbol namespace prefixed by __efistub_, to - * isolate it from the kernel proper. The following symbols are legally - * accessed by the stub, so provide some aliases to make them accessible. 
- * Only include data symbols here, or text symbols of functions that are - * guaranteed to be safe when executed at another offset than they were - * linked at. The routines below are all implemented in assembler in a - * position independent manner - */ -__efistub_memcmp = __pi_memcmp; -__efistub_memchr = __pi_memchr; -__efistub_memcpy = __pi_memcpy; -__efistub_memmove = __pi_memmove; -__efistub_memset = __pi_memset; -__efistub_strlen = __pi_strlen; -__efistub_strnlen = __pi_strnlen; -__efistub_strcmp = __pi_strcmp; -__efistub_strncmp = __pi_strncmp; -__efistub_strrchr = __pi_strrchr; -__efistub___flush_dcache_area = __pi___flush_dcache_area; - -#ifdef CONFIG_KASAN -__efistub___memcpy = __pi_memcpy; -__efistub___memmove = __pi_memmove; -__efistub___memset = __pi_memset; -#endif - -__efistub__text = _text; -__efistub__end = _end; -__efistub__edata = _edata; -__efistub_screen_info = screen_info; - -#endif - #endif /* __ARM64_KERNEL_IMAGE_H */ diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c deleted file mode 100644 index 7820a4a688fa..000000000000 --- a/arch/arm64/kernel/insn.c +++ /dev/null @@ -1,1634 +0,0 @@ -/* - * Copyright (C) 2013 Huawei Ltd. - * Author: Jiang Liu <liuj97@gmail.com> - * - * Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ -#include <linux/bitops.h> -#include <linux/bug.h> -#include <linux/compiler.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/spinlock.h> -#include <linux/stop_machine.h> -#include <linux/types.h> -#include <linux/uaccess.h> - -#include <asm/cacheflush.h> -#include <asm/debug-monitors.h> -#include <asm/fixmap.h> -#include <asm/insn.h> -#include <asm/kprobes.h> - -#define AARCH64_INSN_SF_BIT BIT(31) -#define AARCH64_INSN_N_BIT BIT(22) -#define AARCH64_INSN_LSL_12 BIT(22) - -static int aarch64_insn_encoding_class[] = { - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_UNKNOWN, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_REG, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_FPSIMD, - AARCH64_INSN_CLS_DP_IMM, - AARCH64_INSN_CLS_DP_IMM, - AARCH64_INSN_CLS_BR_SYS, - AARCH64_INSN_CLS_BR_SYS, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_REG, - AARCH64_INSN_CLS_LDST, - AARCH64_INSN_CLS_DP_FPSIMD, -}; - -enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn) -{ - return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; -} - -/* NOP is an alias of HINT */ -bool __kprobes aarch64_insn_is_nop(u32 insn) -{ - if (!aarch64_insn_is_hint(insn)) - return false; - - switch (insn & 0xFE0) { - case AARCH64_INSN_HINT_YIELD: - case AARCH64_INSN_HINT_WFE: - case AARCH64_INSN_HINT_WFI: - case AARCH64_INSN_HINT_SEV: - case AARCH64_INSN_HINT_SEVL: - return false; - default: - return true; - } -} - -bool aarch64_insn_is_branch_imm(u32 insn) -{ - return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) || - aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) || - aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)); -} - -static DEFINE_RAW_SPINLOCK(patch_lock); - -static void __kprobes *patch_map(void *addr, int fixmap) -{ - unsigned long uintaddr = (uintptr_t) addr; - bool module = !core_kernel_text(uintaddr); - struct page *page; - - if (module && IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - page = vmalloc_to_page(addr); - else if (!module) - page = phys_to_page(__pa_symbol(addr)); - else - return addr; - - BUG_ON(!page); - return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + - (uintaddr & ~PAGE_MASK)); -} - -static void __kprobes patch_unmap(int fixmap) -{ - clear_fixmap(fixmap); -} -/* - * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always - * little-endian. 
- */ -int __kprobes aarch64_insn_read(void *addr, u32 *insnp) -{ - int ret; - __le32 val; - - ret = probe_kernel_read(&val, addr, AARCH64_INSN_SIZE); - if (!ret) - *insnp = le32_to_cpu(val); - - return ret; -} - -static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) -{ - void *waddr = addr; - unsigned long flags = 0; - int ret; - - raw_spin_lock_irqsave(&patch_lock, flags); - waddr = patch_map(addr, FIX_TEXT_POKE0); - - ret = probe_kernel_write(waddr, &insn, AARCH64_INSN_SIZE); - - patch_unmap(FIX_TEXT_POKE0); - raw_spin_unlock_irqrestore(&patch_lock, flags); - - return ret; -} - -int __kprobes aarch64_insn_write(void *addr, u32 insn) -{ - return __aarch64_insn_write(addr, cpu_to_le32(insn)); -} - -bool __kprobes aarch64_insn_uses_literal(u32 insn) -{ - /* ldr/ldrsw (literal), prfm */ - - return aarch64_insn_is_ldr_lit(insn) || - aarch64_insn_is_ldrsw_lit(insn) || - aarch64_insn_is_adr_adrp(insn) || - aarch64_insn_is_prfm_lit(insn); -} - -bool __kprobes aarch64_insn_is_branch(u32 insn) -{ - /* b, bl, cb*, tb*, b.cond, br, blr */ - - return aarch64_insn_is_b(insn) || - aarch64_insn_is_bl(insn) || - aarch64_insn_is_cbz(insn) || - aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_tbz(insn) || - aarch64_insn_is_tbnz(insn) || - aarch64_insn_is_ret(insn) || - aarch64_insn_is_br(insn) || - aarch64_insn_is_blr(insn) || - aarch64_insn_is_bcond(insn); -} - -int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) -{ - u32 *tp = addr; - int ret; - - /* A64 instructions must be word aligned */ - if ((uintptr_t)tp & 0x3) - return -EINVAL; - - ret = aarch64_insn_write(tp, insn); - if (ret == 0) - __flush_icache_range((uintptr_t)tp, - (uintptr_t)tp + AARCH64_INSN_SIZE); - - return ret; -} - -struct aarch64_insn_patch { - void **text_addrs; - u32 *new_insns; - int insn_cnt; - atomic_t cpu_count; -}; - -static int __kprobes aarch64_insn_patch_text_cb(void *arg) -{ - int i, ret = 0; - struct aarch64_insn_patch *pp = arg; - - /* The first CPU becomes master */ - if (atomic_inc_return(&pp->cpu_count) == 1) { - for (i = 0; ret == 0 && i < pp->insn_cnt; i++) - ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], - pp->new_insns[i]); - /* Notify other processors with an additional increment. 
*/ - atomic_inc(&pp->cpu_count); - } else { - while (atomic_read(&pp->cpu_count) <= num_online_cpus()) - cpu_relax(); - isb(); - } - - return ret; -} - -int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) -{ - struct aarch64_insn_patch patch = { - .text_addrs = addrs, - .new_insns = insns, - .insn_cnt = cnt, - .cpu_count = ATOMIC_INIT(0), - }; - - if (cnt <= 0) - return -EINVAL; - - return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, - cpu_online_mask); -} - -static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type, - u32 *maskp, int *shiftp) -{ - u32 mask; - int shift; - - switch (type) { - case AARCH64_INSN_IMM_26: - mask = BIT(26) - 1; - shift = 0; - break; - case AARCH64_INSN_IMM_19: - mask = BIT(19) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_16: - mask = BIT(16) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_14: - mask = BIT(14) - 1; - shift = 5; - break; - case AARCH64_INSN_IMM_12: - mask = BIT(12) - 1; - shift = 10; - break; - case AARCH64_INSN_IMM_9: - mask = BIT(9) - 1; - shift = 12; - break; - case AARCH64_INSN_IMM_7: - mask = BIT(7) - 1; - shift = 15; - break; - case AARCH64_INSN_IMM_6: - case AARCH64_INSN_IMM_S: - mask = BIT(6) - 1; - shift = 10; - break; - case AARCH64_INSN_IMM_R: - mask = BIT(6) - 1; - shift = 16; - break; - case AARCH64_INSN_IMM_N: - mask = 1; - shift = 22; - break; - default: - return -EINVAL; - } - - *maskp = mask; - *shiftp = shift; - - return 0; -} - -#define ADR_IMM_HILOSPLIT 2 -#define ADR_IMM_SIZE SZ_2M -#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1) -#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1) -#define ADR_IMM_LOSHIFT 29 -#define ADR_IMM_HISHIFT 5 - -u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn) -{ - u32 immlo, immhi, mask; - int shift; - - switch (type) { - case AARCH64_INSN_IMM_ADR: - shift = 0; - immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK; - immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK; - insn = (immhi << ADR_IMM_HILOSPLIT) | immlo; - mask = ADR_IMM_SIZE - 1; - break; - default: - if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { - pr_err("aarch64_insn_decode_immediate: unknown immediate encoding %d\n", - type); - return 0; - } - } - - return (insn >> shift) & mask; -} - -u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, - u32 insn, u64 imm) -{ - u32 immlo, immhi, mask; - int shift; - - if (insn == AARCH64_BREAK_FAULT) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_IMM_ADR: - shift = 0; - immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT; - imm >>= ADR_IMM_HILOSPLIT; - immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT; - imm = immlo | immhi; - mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) | - (ADR_IMM_HIMASK << ADR_IMM_HISHIFT)); - break; - default: - if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) { - pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n", - type); - return AARCH64_BREAK_FAULT; - } - } - - /* Update the immediate field. 
*/ - insn &= ~(mask << shift); - insn |= (imm & mask) << shift; - - return insn; -} - -u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type, - u32 insn) -{ - int shift; - - switch (type) { - case AARCH64_INSN_REGTYPE_RT: - case AARCH64_INSN_REGTYPE_RD: - shift = 0; - break; - case AARCH64_INSN_REGTYPE_RN: - shift = 5; - break; - case AARCH64_INSN_REGTYPE_RT2: - case AARCH64_INSN_REGTYPE_RA: - shift = 10; - break; - case AARCH64_INSN_REGTYPE_RM: - shift = 16; - break; - default: - pr_err("%s: unknown register type encoding %d\n", __func__, - type); - return 0; - } - - return (insn >> shift) & GENMASK(4, 0); -} - -static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, - u32 insn, - enum aarch64_insn_register reg) -{ - int shift; - - if (insn == AARCH64_BREAK_FAULT) - return AARCH64_BREAK_FAULT; - - if (reg < AARCH64_INSN_REG_0 || reg > AARCH64_INSN_REG_SP) { - pr_err("%s: unknown register encoding %d\n", __func__, reg); - return AARCH64_BREAK_FAULT; - } - - switch (type) { - case AARCH64_INSN_REGTYPE_RT: - case AARCH64_INSN_REGTYPE_RD: - shift = 0; - break; - case AARCH64_INSN_REGTYPE_RN: - shift = 5; - break; - case AARCH64_INSN_REGTYPE_RT2: - case AARCH64_INSN_REGTYPE_RA: - shift = 10; - break; - case AARCH64_INSN_REGTYPE_RM: - case AARCH64_INSN_REGTYPE_RS: - shift = 16; - break; - default: - pr_err("%s: unknown register type encoding %d\n", __func__, - type); - return AARCH64_BREAK_FAULT; - } - - insn &= ~(GENMASK(4, 0) << shift); - insn |= reg << shift; - - return insn; -} - -static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type, - u32 insn) -{ - u32 size; - - switch (type) { - case AARCH64_INSN_SIZE_8: - size = 0; - break; - case AARCH64_INSN_SIZE_16: - size = 1; - break; - case AARCH64_INSN_SIZE_32: - size = 2; - break; - case AARCH64_INSN_SIZE_64: - size = 3; - break; - default: - pr_err("%s: unknown size encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn &= ~GENMASK(31, 30); - insn |= size << 30; - - return insn; -} - -static inline long branch_imm_common(unsigned long pc, unsigned long addr, - long range) -{ - long offset; - - if ((pc & 0x3) || (addr & 0x3)) { - pr_err("%s: A64 instructions must be word aligned\n", __func__); - return range; - } - - offset = ((long)addr - (long)pc); - - if (offset < -range || offset >= range) { - pr_err("%s: offset out of range\n", __func__); - return range; - } - - return offset; -} - -u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_branch_type type) -{ - u32 insn; - long offset; - - /* - * B/BL support [-128M, 128M) offset - * ARM64 virtual address arrangement guarantees all kernel and module - * texts are within +/-128M. 
- */ - offset = branch_imm_common(pc, addr, SZ_128M); - if (offset >= SZ_128M) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_BRANCH_LINK: - insn = aarch64_insn_get_bl_value(); - break; - case AARCH64_INSN_BRANCH_NOLINK: - insn = aarch64_insn_get_b_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, - offset >> 2); -} - -u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_register reg, - enum aarch64_insn_variant variant, - enum aarch64_insn_branch_type type) -{ - u32 insn; - long offset; - - offset = branch_imm_common(pc, addr, SZ_1M); - if (offset >= SZ_1M) - return AARCH64_BREAK_FAULT; - - switch (type) { - case AARCH64_INSN_BRANCH_COMP_ZERO: - insn = aarch64_insn_get_cbz_value(); - break; - case AARCH64_INSN_BRANCH_COMP_NONZERO: - insn = aarch64_insn_get_cbnz_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); -} - -u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, - enum aarch64_insn_condition cond) -{ - u32 insn; - long offset; - - offset = branch_imm_common(pc, addr, SZ_1M); - - insn = aarch64_insn_get_bcond_value(); - - if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) { - pr_err("%s: unknown condition encoding %d\n", __func__, cond); - return AARCH64_BREAK_FAULT; - } - insn |= cond; - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); -} - -u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_op op) -{ - return aarch64_insn_get_hint_value() | op; -} - -u32 __kprobes aarch64_insn_gen_nop(void) -{ - return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); -} - -u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, - enum aarch64_insn_branch_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_BRANCH_NOLINK: - insn = aarch64_insn_get_br_value(); - break; - case AARCH64_INSN_BRANCH_LINK: - insn = aarch64_insn_get_blr_value(); - break; - case AARCH64_INSN_BRANCH_RETURN: - insn = aarch64_insn_get_ret_value(); - break; - default: - pr_err("%s: unknown branch encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, reg); -} - -u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, - enum aarch64_insn_register base, - enum aarch64_insn_register offset, - enum aarch64_insn_size_type size, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_REG_OFFSET: - insn = aarch64_insn_get_ldr_reg_value(); - break; - case AARCH64_INSN_LDST_STORE_REG_OFFSET: - insn = aarch64_insn_get_str_reg_value(); - break; - default: - pr_err("%s: unknown load/store encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_ldst_size(size, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); - - insn = 
aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, - offset); -} - -u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, - enum aarch64_insn_register reg2, - enum aarch64_insn_register base, - int offset, - enum aarch64_insn_variant variant, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - int shift; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX: - insn = aarch64_insn_get_ldp_pre_value(); - break; - case AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX: - insn = aarch64_insn_get_stp_pre_value(); - break; - case AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX: - insn = aarch64_insn_get_ldp_post_value(); - break; - case AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX: - insn = aarch64_insn_get_stp_post_value(); - break; - default: - pr_err("%s: unknown load/store encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if ((offset & 0x3) || (offset < -256) || (offset > 252)) { - pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n", - __func__, offset); - return AARCH64_BREAK_FAULT; - } - shift = 2; - break; - case AARCH64_INSN_VARIANT_64BIT: - if ((offset & 0x7) || (offset < -512) || (offset > 504)) { - pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n", - __func__, offset); - return AARCH64_BREAK_FAULT; - } - shift = 3; - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, - reg1); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, - reg2); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, insn, - offset >> shift); -} - -u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, - enum aarch64_insn_register base, - enum aarch64_insn_register state, - enum aarch64_insn_size_type size, - enum aarch64_insn_ldst_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LDST_LOAD_EX: - insn = aarch64_insn_get_load_ex_value(); - break; - case AARCH64_INSN_LDST_STORE_EX: - insn = aarch64_insn_get_store_ex_value(); - break; - default: - pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_ldst_size(size, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, - reg); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, insn, - AARCH64_INSN_REG_ZR); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, - state); -} - -static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type, - enum aarch64_insn_prfm_target target, - enum aarch64_insn_prfm_policy policy, - u32 insn) -{ - u32 imm_type = 0, imm_target = 0, imm_policy = 0; - - switch (type) { - case AARCH64_INSN_PRFM_TYPE_PLD: - break; - case AARCH64_INSN_PRFM_TYPE_PLI: - imm_type = BIT(0); - break; - case AARCH64_INSN_PRFM_TYPE_PST: - imm_type = BIT(1); - break; - default: - pr_err("%s: unknown prfm type encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (target) { - case AARCH64_INSN_PRFM_TARGET_L1: - break; - case AARCH64_INSN_PRFM_TARGET_L2: - imm_target = BIT(0); - break; - case 
AARCH64_INSN_PRFM_TARGET_L3: - imm_target = BIT(1); - break; - default: - pr_err("%s: unknown prfm target encoding %d\n", __func__, target); - return AARCH64_BREAK_FAULT; - } - - switch (policy) { - case AARCH64_INSN_PRFM_POLICY_KEEP: - break; - case AARCH64_INSN_PRFM_POLICY_STRM: - imm_policy = BIT(0); - break; - default: - pr_err("%s: unknown prfm policy encoding %d\n", __func__, policy); - return AARCH64_BREAK_FAULT; - } - - /* In this case, imm5 is encoded into Rt field. */ - insn &= ~GENMASK(4, 0); - insn |= imm_policy | (imm_target << 1) | (imm_type << 3); - - return insn; -} - -u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, - enum aarch64_insn_prfm_type type, - enum aarch64_insn_prfm_target target, - enum aarch64_insn_prfm_policy policy) -{ - u32 insn = aarch64_insn_get_prfm_value(); - - insn = aarch64_insn_encode_ldst_size(AARCH64_INSN_SIZE_64, insn); - - insn = aarch64_insn_encode_prfm_imm(type, target, policy, insn); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - base); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, 0); -} - -u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - int imm, enum aarch64_insn_variant variant, - enum aarch64_insn_adsb_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_ADSB_ADD: - insn = aarch64_insn_get_add_imm_value(); - break; - case AARCH64_INSN_ADSB_SUB: - insn = aarch64_insn_get_sub_imm_value(); - break; - case AARCH64_INSN_ADSB_ADD_SETFLAGS: - insn = aarch64_insn_get_adds_imm_value(); - break; - case AARCH64_INSN_ADSB_SUB_SETFLAGS: - insn = aarch64_insn_get_subs_imm_value(); - break; - default: - pr_err("%s: unknown add/sub encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - /* We can't encode more than a 24bit value (12bit + 12bit shift) */ - if (imm & ~(BIT(24) - 1)) - goto out; - - /* If we have something in the top 12 bits... */ - if (imm & ~(SZ_4K - 1)) { - /* ... 
and in the low 12 bits -> error */ - if (imm & (SZ_4K - 1)) - goto out; - - imm >>= 12; - insn |= AARCH64_INSN_LSL_12; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); - -out: - pr_err("%s: invalid immediate encoding %d\n", __func__, imm); - return AARCH64_BREAK_FAULT; -} - -u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - int immr, int imms, - enum aarch64_insn_variant variant, - enum aarch64_insn_bitfield_type type) -{ - u32 insn; - u32 mask; - - switch (type) { - case AARCH64_INSN_BITFIELD_MOVE: - insn = aarch64_insn_get_bfm_value(); - break; - case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED: - insn = aarch64_insn_get_ubfm_value(); - break; - case AARCH64_INSN_BITFIELD_MOVE_SIGNED: - insn = aarch64_insn_get_sbfm_value(); - break; - default: - pr_err("%s: unknown bitfield encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - mask = GENMASK(4, 0); - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT; - mask = GENMASK(5, 0); - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - if (immr & ~mask) { - pr_err("%s: invalid immr encoding %d\n", __func__, immr); - return AARCH64_BREAK_FAULT; - } - if (imms & ~mask) { - pr_err("%s: invalid imms encoding %d\n", __func__, imms); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); -} - -u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst, - int imm, int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_movewide_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_MOVEWIDE_ZERO: - insn = aarch64_insn_get_movz_value(); - break; - case AARCH64_INSN_MOVEWIDE_KEEP: - insn = aarch64_insn_get_movk_value(); - break; - case AARCH64_INSN_MOVEWIDE_INVERSE: - insn = aarch64_insn_get_movn_value(); - break; - default: - pr_err("%s: unknown movewide encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - if (imm & ~(SZ_64K - 1)) { - pr_err("%s: invalid immediate encoding %d\n", __func__, imm); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift != 0 && shift != 16) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift != 0 && shift != 16 && shift != 32 && shift != 48) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn |= (shift >> 4) << 21; - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); -} - -u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - int shift, - enum aarch64_insn_variant 
variant, - enum aarch64_insn_adsb_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_ADSB_ADD: - insn = aarch64_insn_get_add_value(); - break; - case AARCH64_INSN_ADSB_SUB: - insn = aarch64_insn_get_sub_value(); - break; - case AARCH64_INSN_ADSB_ADD_SETFLAGS: - insn = aarch64_insn_get_adds_value(); - break; - case AARCH64_INSN_ADSB_SUB_SETFLAGS: - insn = aarch64_insn_get_subs_value(); - break; - default: - pr_err("%s: unknown add/sub encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift & ~(SZ_32 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift & ~(SZ_64 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); -} - -u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_variant variant, - enum aarch64_insn_data1_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA1_REVERSE_16: - insn = aarch64_insn_get_rev16_value(); - break; - case AARCH64_INSN_DATA1_REVERSE_32: - insn = aarch64_insn_get_rev32_value(); - break; - case AARCH64_INSN_DATA1_REVERSE_64: - if (variant != AARCH64_INSN_VARIANT_64BIT) { - pr_err("%s: invalid variant for reverse64 %d\n", - __func__, variant); - return AARCH64_BREAK_FAULT; - } - insn = aarch64_insn_get_rev64_value(); - break; - default: - pr_err("%s: unknown data1 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); -} - -u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - enum aarch64_insn_variant variant, - enum aarch64_insn_data2_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA2_UDIV: - insn = aarch64_insn_get_udiv_value(); - break; - case AARCH64_INSN_DATA2_SDIV: - insn = aarch64_insn_get_sdiv_value(); - break; - case AARCH64_INSN_DATA2_LSLV: - insn = aarch64_insn_get_lslv_value(); - break; - case AARCH64_INSN_DATA2_LSRV: - insn = aarch64_insn_get_lsrv_value(); - break; - case AARCH64_INSN_DATA2_ASRV: - insn = aarch64_insn_get_asrv_value(); - break; - case AARCH64_INSN_DATA2_RORV: - insn = aarch64_insn_get_rorv_value(); - break; - default: - pr_err("%s: unknown data2 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return 
AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); -} - -u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg1, - enum aarch64_insn_register reg2, - enum aarch64_insn_variant variant, - enum aarch64_insn_data3_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_DATA3_MADD: - insn = aarch64_insn_get_madd_value(); - break; - case AARCH64_INSN_DATA3_MSUB: - insn = aarch64_insn_get_msub_value(); - break; - default: - pr_err("%s: unknown data3 encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, - reg1); - - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, - reg2); -} - -u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst, - enum aarch64_insn_register src, - enum aarch64_insn_register reg, - int shift, - enum aarch64_insn_variant variant, - enum aarch64_insn_logic_type type) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LOGIC_AND: - insn = aarch64_insn_get_and_value(); - break; - case AARCH64_INSN_LOGIC_BIC: - insn = aarch64_insn_get_bic_value(); - break; - case AARCH64_INSN_LOGIC_ORR: - insn = aarch64_insn_get_orr_value(); - break; - case AARCH64_INSN_LOGIC_ORN: - insn = aarch64_insn_get_orn_value(); - break; - case AARCH64_INSN_LOGIC_EOR: - insn = aarch64_insn_get_eor_value(); - break; - case AARCH64_INSN_LOGIC_EON: - insn = aarch64_insn_get_eon_value(); - break; - case AARCH64_INSN_LOGIC_AND_SETFLAGS: - insn = aarch64_insn_get_ands_value(); - break; - case AARCH64_INSN_LOGIC_BIC_SETFLAGS: - insn = aarch64_insn_get_bics_value(); - break; - default: - pr_err("%s: unknown logical encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (shift & ~(SZ_32 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - if (shift & ~(SZ_64 - 1)) { - pr_err("%s: invalid shift encoding %d\n", __func__, - shift); - return AARCH64_BREAK_FAULT; - } - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift); -} - -u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr, - enum aarch64_insn_register reg, - enum aarch64_insn_adr_type type) -{ - u32 insn; - s32 offset; - - switch (type) { - case AARCH64_INSN_ADR_TYPE_ADR: - insn = aarch64_insn_get_adr_value(); - offset = addr - pc; - break; - case AARCH64_INSN_ADR_TYPE_ADRP: - insn = 
aarch64_insn_get_adrp_value(); - offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12; - break; - default: - pr_err("%s: unknown adr encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - if (offset < -SZ_1M || offset >= SZ_1M) - return AARCH64_BREAK_FAULT; - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg); - - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset); -} - -/* - * Decode the imm field of a branch, and return the byte offset as a - * signed value (so it can be used when computing a new branch - * target). - */ -s32 aarch64_get_branch_offset(u32 insn) -{ - s32 imm; - - if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn); - return (imm << 6) >> 4; - } - - if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn); - return (imm << 13) >> 11; - } - - if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { - imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn); - return (imm << 18) >> 16; - } - - /* Unhandled instruction */ - BUG(); -} - -/* - * Encode the displacement of a branch in the imm field and return the - * updated instruction. - */ -u32 aarch64_set_branch_offset(u32 insn, s32 offset) -{ - if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, - offset >> 2); - - if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) || - aarch64_insn_is_bcond(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn, - offset >> 2); - - if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn, - offset >> 2); - - /* Unhandled instruction */ - BUG(); -} - -s32 aarch64_insn_adrp_get_offset(u32 insn) -{ - BUG_ON(!aarch64_insn_is_adrp(insn)); - return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12; -} - -u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset) -{ - BUG_ON(!aarch64_insn_is_adrp(insn)); - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, - offset >> 12); -} - -/* - * Extract the Op/CR data from a msr/mrs instruction. - */ -u32 aarch64_insn_extract_system_reg(u32 insn) -{ - return (insn & 0x1FFFE0) >> 5; -} - -bool aarch32_insn_is_wide(u32 insn) -{ - return insn >= 0xe800; -} - -/* - * Macros/defines for extracting register numbers from instruction. 
- */ -u32 aarch32_insn_extract_reg_num(u32 insn, int offset) -{ - return (insn & (0xf << offset)) >> offset; -} - -#define OPC2_MASK 0x7 -#define OPC2_OFFSET 5 -u32 aarch32_insn_mcr_extract_opc2(u32 insn) -{ - return (insn & (OPC2_MASK << OPC2_OFFSET)) >> OPC2_OFFSET; -} - -#define CRM_MASK 0xf -u32 aarch32_insn_mcr_extract_crm(u32 insn) -{ - return insn & CRM_MASK; -} - -static bool __kprobes __check_eq(unsigned long pstate) -{ - return (pstate & PSR_Z_BIT) != 0; -} - -static bool __kprobes __check_ne(unsigned long pstate) -{ - return (pstate & PSR_Z_BIT) == 0; -} - -static bool __kprobes __check_cs(unsigned long pstate) -{ - return (pstate & PSR_C_BIT) != 0; -} - -static bool __kprobes __check_cc(unsigned long pstate) -{ - return (pstate & PSR_C_BIT) == 0; -} - -static bool __kprobes __check_mi(unsigned long pstate) -{ - return (pstate & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_pl(unsigned long pstate) -{ - return (pstate & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_vs(unsigned long pstate) -{ - return (pstate & PSR_V_BIT) != 0; -} - -static bool __kprobes __check_vc(unsigned long pstate) -{ - return (pstate & PSR_V_BIT) == 0; -} - -static bool __kprobes __check_hi(unsigned long pstate) -{ - pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ - return (pstate & PSR_C_BIT) != 0; -} - -static bool __kprobes __check_ls(unsigned long pstate) -{ - pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ - return (pstate & PSR_C_BIT) == 0; -} - -static bool __kprobes __check_ge(unsigned long pstate) -{ - pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ - return (pstate & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_lt(unsigned long pstate) -{ - pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ - return (pstate & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_gt(unsigned long pstate) -{ - /*PSR_N_BIT ^= PSR_V_BIT */ - unsigned long temp = pstate ^ (pstate << 3); - - temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ - return (temp & PSR_N_BIT) == 0; -} - -static bool __kprobes __check_le(unsigned long pstate) -{ - /*PSR_N_BIT ^= PSR_V_BIT */ - unsigned long temp = pstate ^ (pstate << 3); - - temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ - return (temp & PSR_N_BIT) != 0; -} - -static bool __kprobes __check_al(unsigned long pstate) -{ - return true; -} - -/* - * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that - * it behaves identically to 0b1110 ("al"). 
- */ -pstate_check_t * const aarch32_opcode_cond_checks[16] = { - __check_eq, __check_ne, __check_cs, __check_cc, - __check_mi, __check_pl, __check_vs, __check_vc, - __check_hi, __check_ls, __check_ge, __check_lt, - __check_gt, __check_le, __check_al, __check_al -}; - -static bool range_of_ones(u64 val) -{ - /* Doesn't handle full ones or full zeroes */ - u64 sval = val >> __ffs64(val); - - /* One of Sean Eron Anderson's bithack tricks */ - return ((sval + 1) & (sval)) == 0; -} - -static u32 aarch64_encode_immediate(u64 imm, - enum aarch64_insn_variant variant, - u32 insn) -{ - unsigned int immr, imms, n, ones, ror, esz, tmp; - u64 mask = ~0UL; - - /* Can't encode full zeroes or full ones */ - if (!imm || !~imm) - return AARCH64_BREAK_FAULT; - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (upper_32_bits(imm)) - return AARCH64_BREAK_FAULT; - esz = 32; - break; - case AARCH64_INSN_VARIANT_64BIT: - insn |= AARCH64_INSN_SF_BIT; - esz = 64; - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - /* - * Inverse of Replicate(). Try to spot a repeating pattern - * with a pow2 stride. - */ - for (tmp = esz / 2; tmp >= 2; tmp /= 2) { - u64 emask = BIT(tmp) - 1; - - if ((imm & emask) != ((imm >> tmp) & emask)) - break; - - esz = tmp; - mask = emask; - } - - /* N is only set if we're encoding a 64bit value */ - n = esz == 64; - - /* Trim imm to the element size */ - imm &= mask; - - /* That's how many ones we need to encode */ - ones = hweight64(imm); - - /* - * imms is set to (ones - 1), prefixed with a string of ones - * and a zero if they fit. Cap it to 6 bits. - */ - imms = ones - 1; - imms |= 0xf << ffs(esz); - imms &= BIT(6) - 1; - - /* Compute the rotation */ - if (range_of_ones(imm)) { - /* - * Pattern: 0..01..10..0 - * - * Compute how many rotate we need to align it right - */ - ror = __ffs64(imm); - } else { - /* - * Pattern: 0..01..10..01..1 - * - * Fill the unused top bits with ones, and check if - * the result is a valid immediate (all ones with a - * contiguous ranges of zeroes). - */ - imm |= ~mask; - if (!range_of_ones(~imm)) - return AARCH64_BREAK_FAULT; - - /* - * Compute the rotation to get a continuous set of - * ones, with the first bit set at position 0 - */ - ror = fls(~imm); - } - - /* - * immr is the number of bits we need to rotate back to the - * original set of ones. Note that this is relative to the - * element size... 
- */ - immr = (esz - ror) % esz; - - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n); - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr); - return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms); -} - -u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type, - enum aarch64_insn_variant variant, - enum aarch64_insn_register Rn, - enum aarch64_insn_register Rd, - u64 imm) -{ - u32 insn; - - switch (type) { - case AARCH64_INSN_LOGIC_AND: - insn = aarch64_insn_get_and_imm_value(); - break; - case AARCH64_INSN_LOGIC_ORR: - insn = aarch64_insn_get_orr_imm_value(); - break; - case AARCH64_INSN_LOGIC_EOR: - insn = aarch64_insn_get_eor_imm_value(); - break; - case AARCH64_INSN_LOGIC_AND_SETFLAGS: - insn = aarch64_insn_get_ands_imm_value(); - break; - default: - pr_err("%s: unknown logical encoding %d\n", __func__, type); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); - return aarch64_encode_immediate(imm, variant, insn); -} - -u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, - enum aarch64_insn_register Rm, - enum aarch64_insn_register Rn, - enum aarch64_insn_register Rd, - u8 lsb) -{ - u32 insn; - - insn = aarch64_insn_get_extr_value(); - - switch (variant) { - case AARCH64_INSN_VARIANT_32BIT: - if (lsb > 31) - return AARCH64_BREAK_FAULT; - break; - case AARCH64_INSN_VARIANT_64BIT: - if (lsb > 63) - return AARCH64_BREAK_FAULT; - insn |= AARCH64_INSN_SF_BIT; - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1); - break; - default: - pr_err("%s: unknown variant encoding %d\n", __func__, variant); - return AARCH64_BREAK_FAULT; - } - - insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb); - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd); - insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn); - return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm); -} diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index 79b17384effa..fe86ada23c7d 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/io.c * * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/export.h> @@ -21,88 +10,43 @@ #include <linux/io.h> /* - * Copy data from IO memory space to "real" memory space. 
- */ -void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) -{ - while (count && !IS_ALIGNED((unsigned long)from, 8)) { - *(u8 *)to = __raw_readb(from); - from++; - to++; - count--; - } - - while (count >= 8) { - *(u64 *)to = __raw_readq(from); - from += 8; - to += 8; - count -= 8; - } - - while (count) { - *(u8 *)to = __raw_readb(from); - from++; - to++; - count--; - } -} -EXPORT_SYMBOL(__memcpy_fromio); - -/* - * Copy data from "real" memory space to IO memory space. + * This generates a memcpy that works on a from/to address which is aligned to + * bits. Count is in terms of the number of bits sized quantities to copy. It + * optimizes to use the STR groupings when possible so that it is WC friendly. */ -void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) +#define memcpy_toio_aligned(to, from, count, bits) \ + ({ \ + volatile u##bits __iomem *_to = to; \ + const u##bits *_from = from; \ + size_t _count = count; \ + const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \ + \ + for (; _from < _end_from; _from += 8, _to += 8) \ + __const_memcpy_toio_aligned##bits(_to, _from, 8); \ + if ((_count % 8) >= 4) { \ + __const_memcpy_toio_aligned##bits(_to, _from, 4); \ + _from += 4; \ + _to += 4; \ + } \ + if ((_count % 4) >= 2) { \ + __const_memcpy_toio_aligned##bits(_to, _from, 2); \ + _from += 2; \ + _to += 2; \ + } \ + if (_count % 2) \ + __const_memcpy_toio_aligned##bits(_to, _from, 1); \ + }) + +void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count) { - while (count && !IS_ALIGNED((unsigned long)to, 8)) { - __raw_writeb(*(u8 *)from, to); - from++; - to++; - count--; - } - - while (count >= 8) { - __raw_writeq(*(u64 *)from, to); - from += 8; - to += 8; - count -= 8; - } - - while (count) { - __raw_writeb(*(u8 *)from, to); - from++; - to++; - count--; - } + memcpy_toio_aligned(to, from, count, 64); + dgh(); } -EXPORT_SYMBOL(__memcpy_toio); +EXPORT_SYMBOL(__iowrite64_copy_full); -/* - * "memset" on IO memory space. - */ -void __memset_io(volatile void __iomem *dst, int c, size_t count) +void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count) { - u64 qc = (u8)c; - - qc |= qc << 8; - qc |= qc << 16; - qc |= qc << 32; - - while (count && !IS_ALIGNED((unsigned long)dst, 8)) { - __raw_writeb(c, dst); - dst++; - count--; - } - - while (count >= 8) { - __raw_writeq(qc, dst); - dst += 8; - count -= 8; - } - - while (count) { - __raw_writeb(c, dst); - dst++; - count--; - } + memcpy_toio_aligned(to, from, count, 32); + dgh(); } -EXPORT_SYMBOL(__memset_io); +EXPORT_SYMBOL(__iowrite32_copy_full); diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 780a12f59a8f..15dedb385b9e 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/irq.c * @@ -7,69 +8,117 @@ * Dynamic Tick Timer written by Tony Lindgren <tony@atomide.com> and * Tuukka Tikkanen <tuukka.tikkanen@elektrobit.com>. * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#include <linux/kernel_stat.h> -#include <linux/irq.h> -#include <linux/memory.h> -#include <linux/smp.h> +#include <linux/hardirq.h> #include <linux/init.h> +#include <linux/irq.h> #include <linux/irqchip.h> +#include <linux/kprobes.h> +#include <linux/memory.h> +#include <linux/scs.h> #include <linux/seq_file.h> +#include <linux/smp.h> #include <linux/vmalloc.h> +#include <asm/daifflags.h> +#include <asm/exception.h> +#include <asm/numa.h> +#include <asm/softirq_stack.h> +#include <asm/stacktrace.h> #include <asm/vmap_stack.h> -unsigned long irq_err_count; +/* Only access this in an NMI enter/exit */ +DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts); DEFINE_PER_CPU(unsigned long *, irq_stack_ptr); -int arch_show_interrupts(struct seq_file *p, int prec) + +DECLARE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr); + +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr); +#endif + +static void init_irq_scs(void) { - show_ipi_list(p, prec); - seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count); - return 0; + int cpu; + + if (!scs_is_enabled()) + return; + + for_each_possible_cpu(cpu) + per_cpu(irq_shadow_call_stack_ptr, cpu) = + scs_alloc(early_cpu_to_node(cpu)); } -#ifdef CONFIG_VMAP_STACK -static void init_irq_stacks(void) +static void __init init_irq_stacks(void) { int cpu; unsigned long *p; for_each_possible_cpu(cpu) { - p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu)); + p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, early_cpu_to_node(cpu)); per_cpu(irq_stack_ptr, cpu) = p; } } -#else -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ -DEFINE_PER_CPU_ALIGNED(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); -static void init_irq_stacks(void) +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK +static void ____do_softirq(struct pt_regs *regs) { - int cpu; + __do_softirq(); +} - for_each_possible_cpu(cpu) - per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu); +void do_softirq_own_stack(void) +{ + call_on_irq_stack(NULL, ____do_softirq); } #endif +static void default_handle_irq(struct pt_regs *regs) +{ + panic("IRQ taken without a root IRQ handler\n"); +} + +static void default_handle_fiq(struct pt_regs *regs) +{ + panic("FIQ taken without a root FIQ handler\n"); +} + +void (*handle_arch_irq)(struct pt_regs *) __ro_after_init = default_handle_irq; +void (*handle_arch_fiq)(struct pt_regs *) __ro_after_init = default_handle_fiq; + +int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) +{ + if (handle_arch_irq != default_handle_irq) + return -EBUSY; + + handle_arch_irq = handle_irq; + pr_info("Root IRQ handler: %ps\n", handle_irq); + return 0; +} + +int __init set_handle_fiq(void (*handle_fiq)(struct pt_regs *)) +{ + if (handle_arch_fiq != default_handle_fiq) + return -EBUSY; + + handle_arch_fiq = handle_fiq; + pr_info("Root FIQ handler: %ps\n", handle_fiq); + return 0; +} + void __init init_IRQ(void) { init_irq_stacks(); + init_irq_scs(); irqchip_init(); - if (!handle_arch_irq) - panic("No interrupt controller found."); + + if (system_uses_irq_prio_masking()) { + /* + * Now that we have a stack for our IRQ handler, set + * the PMR/PSR pair to a consistent state. 
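For context on the root-handler registration added above: an interrupt controller driver claims handle_arch_irq through set_handle_irq() from its IRQCHIP_DECLARE() init path. A rough sketch only — the demo_* names and compatible string are invented, and a real root irqchip (e.g. the GIC) also creates its irqdomain and per-CPU state before registering:

#include <linux/irq.h>
#include <linux/irqchip.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static struct irq_domain *demo_domain;  /* created elsewhere in a real driver */

static void demo_handle_irq(struct pt_regs *regs)
{
        u32 hwirq = 0;  /* a real driver reads its claim/ack register here */

        generic_handle_domain_irq(demo_domain, hwirq);
}

static int __init demo_intc_init(struct device_node *node,
                                 struct device_node *parent)
{
        /* Fails with -EBUSY if another root irqchip registered first. */
        return set_handle_irq(demo_handle_irq);
}
IRQCHIP_DECLARE(demo_intc, "vendor,demo-intc", demo_intc_init);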
+ */ + WARN_ON(read_sysreg(daif) & PSR_A_BIT); + local_daif_restore(DAIF_PROCCTX_NOIRQ); + } } diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index 1eff270e8861..b345425193d2 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -1,27 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Huawei Ltd. * Author: Jiang Liu <liuj97@gmail.com> * * Based on arch/arm/kernel/jump_label.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/kernel.h> #include <linux/jump_label.h> +#include <linux/smp.h> #include <asm/insn.h> +#include <asm/text-patching.h> -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) { void *addr = (void *)jump_entry_code(entry); u32 insn; @@ -35,15 +26,10 @@ void arch_jump_label_transform(struct jump_entry *entry, } aarch64_insn_patch_text_nosync(addr, insn); + return true; } -void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) +void arch_jump_label_transform_apply(void) { - /* - * We use the architected A64 NOP in arch_static_branch, so there's no - * need to patch an identical A64 NOP over the top of it here. The core - * will call arch_jump_label_transform from a module notifier if the - * NOP needs to be replaced by a branch. - */ + kick_all_cpus_sync(); } diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index ba6b41790fcd..c9503ed45a6c 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -1,177 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
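The jump_label.c change above switches arm64 to the batched patching API: each enabled or disabled site is queued through arch_jump_label_transform_queue(), and a single kick_all_cpus_sync() in arch_jump_label_transform_apply() synchronizes the other CPUs. Users of static keys are unaffected; a small sketch with a hypothetical key:

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(demo_feature_key);      /* hypothetical key */

void demo_enable_feature(void)
{
        /* Patches every demo_fast_path() site: NOPs become branches. */
        static_branch_enable(&demo_feature_key);
}

bool demo_fast_path(void)
{
        if (static_branch_unlikely(&demo_feature_key))
                return true;    /* taken only after the key is enabled */

        return false;           /* default: a single NOP in the hot path */
}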
*/ #include <linux/cache.h> -#include <linux/crc32.h> #include <linux/init.h> -#include <linux/libfdt.h> -#include <linux/mm_types.h> -#include <linux/sched.h> -#include <linux/types.h> +#include <linux/printk.h> -#include <asm/cacheflush.h> -#include <asm/fixmap.h> -#include <asm/kernel-pgtable.h> +#include <asm/cpufeature.h> #include <asm/memory.h> -#include <asm/mmu.h> -#include <asm/pgtable.h> -#include <asm/sections.h> -u64 __ro_after_init module_alloc_base; -u16 __initdata memstart_offset_seed; +bool __ro_after_init __kaslr_is_enabled = false; -static __init u64 get_kaslr_seed(void *fdt) +void __init kaslr_init(void) { - int node, len; - fdt64_t *prop; - u64 ret; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - return 0; - - prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); - if (!prop || len != sizeof(u64)) - return 0; - - ret = fdt64_to_cpu(*prop); - *prop = 0; - return ret; -} - -static __init const u8 *kaslr_get_cmdline(void *fdt) -{ - static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE; - - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - int node; - const u8 *prop; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; - - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; - return prop; + if (kaslr_disabled_cmdline()) { + pr_info("KASLR disabled on command line\n"); + return; } -out: - return default_cmdline; -} - -extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, - pgprot_t prot); - -/* - * This routine will be executed with the kernel mapped at its default virtual - * address, and if it returns successfully, the kernel will be remapped, and - * start_kernel() will be executed from a randomized virtual offset. The - * relocation will result in all absolute references (e.g., static variables - * containing function pointers) to be reinitialized, and zero-initialized - * .bss variables will be reset to 0. - */ -u64 __init kaslr_early_init(u64 dt_phys) -{ - void *fdt; - u64 seed, offset, mask, module_range; - const u8 *cmdline, *str; - int size; /* - * Set a reasonable default for module_alloc_base in case - * we end up running with module randomization disabled. + * The KASLR offset modulo MIN_KIMG_ALIGN is taken from the physical + * placement of the image rather than from the seed, so a displacement + * of less than MIN_KIMG_ALIGN means that no seed was provided. */ - module_alloc_base = (u64)_etext - MODULES_VSIZE; - - /* - * Try to map the FDT early. If this fails, we simply bail, - * and proceed with KASLR disabled. We will make another - * attempt at mapping the FDT in setup_machine() - */ - early_fixmap_init(); - fdt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); - if (!fdt) - return 0; - - /* - * Retrieve (and wipe) the seed from the FDT - */ - seed = get_kaslr_seed(fdt); - if (!seed) - return 0; - - /* - * Check if 'nokaslr' appears on the command line, and - * return 0 if that is the case. - */ - cmdline = kaslr_get_cmdline(fdt); - str = strstr(cmdline, "nokaslr"); - if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) - return 0; - - /* - * OK, so we are proceeding with KASLR enabled. Calculate a suitable - * kernel image offset from the seed. Let's place the kernel in the - * middle half of the VMALLOC area (VA_BITS - 2), and stay clear of - * the lower and upper quarters to avoid colliding with other - * allocations. 
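With the rewritten kaslr.c above, the early seed handling no longer lives here and kaslr_init() only reports whether the image placement was actually randomized. Other code can query the result through the existing helpers; a small sketch, assuming the arm64 kaslr_enabled()/kaslr_offset() helpers from <asm/memory.h> and a hypothetical demo initcall:

#include <linux/init.h>
#include <linux/printk.h>
#include <asm/memory.h>

static int __init demo_report_kaslr(void)
{
        if (kaslr_enabled())
                pr_info("kernel image randomized by 0x%llx\n", kaslr_offset());
        else
                pr_info("kernel image at its link-time address\n");

        return 0;
}
late_initcall(demo_report_kaslr);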
- * Even if we could randomize at page granularity for 16k and 64k pages, - * let's always round to 2 MB so we don't interfere with the ability to - * map using contiguous PTEs - */ - mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1); - offset = BIT(VA_BITS - 3) + (seed & mask); - - /* use the top 16 bits to randomize the linear region */ - memstart_offset_seed = seed >> 48; - - if (IS_ENABLED(CONFIG_KASAN)) - /* - * KASAN does not expect the module region to intersect the - * vmalloc region, since shadow memory is allocated for each - * module at load time, whereas the vmalloc region is shadowed - * by KASAN zero pages. So keep modules out of the vmalloc - * region if KASAN is enabled, and put the kernel well within - * 4 GB of the module region. - */ - return offset % SZ_2G; - - if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { - /* - * Randomize the module region over a 4 GB window covering the - * kernel. This reduces the risk of modules leaking information - * about the address of the kernel itself, but results in - * branches between modules and the core kernel that are - * resolved via PLTs. (Branches between modules will be - * resolved normally.) - */ - module_range = SZ_4G - (u64)(_end - _stext); - module_alloc_base = max((u64)_end + offset - SZ_4G, - (u64)MODULES_VADDR); - } else { - /* - * Randomize the module region by setting module_alloc_base to - * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE, - * _stext) . This guarantees that the resulting region still - * covers [_stext, _etext], and that all relative branches can - * be resolved without veneers. - */ - module_range = MODULES_VSIZE - (u64)(_etext - _stext); - module_alloc_base = (u64)_etext + offset - MODULES_VSIZE; + if (kaslr_offset() < MIN_KIMG_ALIGN) { + pr_warn("KASLR disabled due to lack of seed\n"); + return; } - /* use the lower 21 bits to randomize the base of the module region */ - module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; - module_alloc_base &= PAGE_MASK; - - __flush_dcache_area(&module_alloc_base, sizeof(module_alloc_base)); - __flush_dcache_area(&memstart_offset_seed, sizeof(memstart_offset_seed)); + pr_info("KASLR enabled\n"); + __kaslr_is_enabled = true; +} - return offset; +static int __init parse_nokaslr(char *unused) +{ + /* nokaslr param handling is done by early cpufeature code */ + return 0; } +early_param("nokaslr", parse_nokaslr); diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c index 07bf740bea91..532d72ea42ee 100644 --- a/arch/arm64/kernel/kexec_image.c +++ b/arch/arm64/kernel/kexec_image.c @@ -14,7 +14,6 @@ #include <linux/kexec.h> #include <linux/pe.h> #include <linux/string.h> -#include <linux/verification.h> #include <asm/byteorder.h> #include <asm/cpufeature.h> #include <asm/image.h> @@ -43,17 +42,13 @@ static void *image_load(struct kimage *image, u64 flags, value; bool be_image, be_kernel; struct kexec_buf kbuf; - unsigned long text_offset; + unsigned long text_offset, kernel_segment_number; struct kexec_segment *kernel_segment; int ret; - /* We don't support crash kernels yet. */ - if (image->type == KEXEC_TYPE_CRASH) - return ERR_PTR(-EOPNOTSUPP); - /* * We require a kernel with an unambiguous Image header. Per - * Documentation/booting.txt, this is the case when image_size + * Documentation/arch/arm64/booting.rst, this is the case when image_size * is non-zero (practically speaking, since v3.17). 
*/ h = (struct arm64_image_header *)kernel; @@ -84,7 +79,7 @@ static void *image_load(struct kimage *image, kbuf.buffer = kernel; kbuf.bufsz = kernel_len; - kbuf.mem = 0; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; kbuf.memsz = le64_to_cpu(h->image_size); text_offset = le64_to_cpu(h->text_offset); kbuf.buf_align = MIN_KIMG_ALIGN; @@ -92,39 +87,52 @@ static void *image_load(struct kimage *image, /* Adjust kernel segment with TEXT_OFFSET */ kbuf.memsz += text_offset; - ret = kexec_add_buffer(&kbuf); - if (ret) + kernel_segment_number = image->nr_segments; + + /* + * The location of the kernel segment may make it impossible to satisfy + * the other segment requirements, so we try repeatedly to find a + * location that will work. + */ + while ((ret = kexec_add_buffer(&kbuf)) == 0) { + /* Try to load additional data */ + kernel_segment = &image->segment[kernel_segment_number]; + ret = load_other_segments(image, kernel_segment->mem, + kernel_segment->memsz, initrd, + initrd_len, cmdline); + if (!ret) + break; + + /* + * We couldn't find space for the other segments; erase the + * kernel segment and try the next available hole. + */ + image->nr_segments -= 1; + kbuf.buf_min = kernel_segment->mem + kernel_segment->memsz; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + } + + if (ret) { + pr_err("Could not find any suitable kernel location!"); return ERR_PTR(ret); + } - kernel_segment = &image->segment[image->nr_segments - 1]; + kernel_segment = &image->segment[kernel_segment_number]; kernel_segment->mem += text_offset; kernel_segment->memsz -= text_offset; image->start = kernel_segment->mem; - pr_debug("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kernel_segment->mem, kbuf.bufsz, - kernel_segment->memsz); - - /* Load additional data */ - ret = load_other_segments(image, - kernel_segment->mem, kernel_segment->memsz, - initrd, initrd_len, cmdline); + kexec_dprintk("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_segment->mem, kbuf.bufsz, + kernel_segment->memsz); - return ERR_PTR(ret); + return NULL; } -#ifdef CONFIG_KEXEC_IMAGE_VERIFY_SIG -static int image_verify_sig(const char *kernel, unsigned long kernel_len) -{ - return verify_pefile_signature(kernel, kernel_len, NULL, - VERIFYING_KEXEC_PE_SIGNATURE); -} -#endif - const struct kexec_file_ops kexec_image_ops = { .probe = image_probe, .load = image_load, #ifdef CONFIG_KEXEC_IMAGE_VERIFY_SIG - .verify_sig = image_verify_sig, + .verify_sig = kexec_kernel_verify_pe_sig, #endif }; diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index ce46c4cdf368..968324a79a89 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * AArch64 KGDB support * @@ -5,18 +6,6 @@ * * Copyright (C) 2013 Cavium Inc. * Author: Vijaya Kumar K <vijaya.kumar@caviumnetworks.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/bug.h> @@ -28,6 +17,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> +#include <asm/text-patching.h> #include <asm/traps.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -234,6 +224,8 @@ int kgdb_arch_handle_exception(int exception_vector, int signo, */ if (!kernel_active_single_step()) kernel_enable_single_step(linux_regs); + else + kernel_rewind_single_step(linux_regs); err = 0; break; default: @@ -242,47 +234,31 @@ int kgdb_arch_handle_exception(int exception_vector, int signo, return err; } -static int kgdb_brk_fn(struct pt_regs *regs, unsigned int esr) +int kgdb_brk_handler(struct pt_regs *regs, unsigned long esr) { kgdb_handle_exception(1, SIGTRAP, 0, regs); - return 0; + return DBG_HOOK_HANDLED; } -NOKPROBE_SYMBOL(kgdb_brk_fn) +NOKPROBE_SYMBOL(kgdb_brk_handler) -static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr) +int kgdb_compiled_brk_handler(struct pt_regs *regs, unsigned long esr) { compiled_break = 1; kgdb_handle_exception(1, SIGTRAP, 0, regs); - return 0; + return DBG_HOOK_HANDLED; } -NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); +NOKPROBE_SYMBOL(kgdb_compiled_brk_handler); -static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) +int kgdb_single_step_handler(struct pt_regs *regs, unsigned long esr) { if (!kgdb_single_step) return DBG_HOOK_ERROR; - kgdb_handle_exception(1, SIGTRAP, 0, regs); - return 0; + kgdb_handle_exception(0, SIGTRAP, 0, regs); + return DBG_HOOK_HANDLED; } -NOKPROBE_SYMBOL(kgdb_step_brk_fn); - -static struct break_hook kgdb_brkpt_hook = { - .esr_mask = 0xffffffff, - .esr_val = (u32)ESR_ELx_VAL_BRK64(KGDB_DYN_DBG_BRK_IMM), - .fn = kgdb_brk_fn -}; - -static struct break_hook kgdb_compiled_brkpt_hook = { - .esr_mask = 0xffffffff, - .esr_val = (u32)ESR_ELx_VAL_BRK64(KGDB_COMPILED_DBG_BRK_IMM), - .fn = kgdb_compiled_brk_fn -}; - -static struct step_hook kgdb_step_hook = { - .fn = kgdb_step_brk_fn -}; +NOKPROBE_SYMBOL(kgdb_single_step_handler); static int __kgdb_notify(struct die_args *args, unsigned long cmd) { @@ -321,15 +297,7 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - int ret = register_die_notifier(&kgdb_notifier); - - if (ret != 0) - return ret; - - register_break_hook(&kgdb_brkpt_hook); - register_break_hook(&kgdb_compiled_brkpt_hook); - register_step_hook(&kgdb_step_hook); - return 0; + return register_die_notifier(&kgdb_notifier); } /* @@ -339,9 +307,6 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { - unregister_break_hook(&kgdb_brkpt_hook); - unregister_break_hook(&kgdb_compiled_brkpt_hook); - unregister_step_hook(&kgdb_step_hook); unregister_die_notifier(&kgdb_notifier); } diff --git a/arch/arm64/kernel/kuser32.S b/arch/arm64/kernel/kuser32.S index 997e6b27ff6a..af046ceac22d 100644 --- a/arch/arm64/kernel/kuser32.S +++ b/arch/arm64/kernel/kuser32.S @@ -1,35 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* - * Low-level user helpers placed in the vectors page for AArch32. + * AArch32 user helpers. * Based on the kuser helpers in arch/arm/kernel/entry-armv.S. * * Copyright (C) 2005-2011 Nicolas Pitre <nico@fluxnic.net> - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. + * Copyright (C) 2012-2018 ARM Ltd. * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. + * The kuser helpers below are mapped at a fixed address by + * aarch32_setup_additional_pages() and are provided for compatibility + * reasons with 32 bit (aarch32) applications that need them. * - * - * AArch32 user helpers. - * - * Each segment is 32-byte aligned and will be moved to the top of the high - * vector page. New segments (if ever needed) must be added in front of - * existing ones. This mechanism should be used only for things that are - * really small and justified, and not be abused freely. - * - * See Documentation/arm/kernel_user_helpers.txt for formal definitions. + * See Documentation/arch/arm/kernel_user_helpers.rst for formal definitions. */ #include <asm/unistd.h> + .section .rodata .align 5 .globl __kuser_helper_start __kuser_helper_start: @@ -77,42 +63,3 @@ __kuser_helper_version: // 0xffff0ffc .word ((__kuser_helper_end - __kuser_helper_start) >> 5) .globl __kuser_helper_end __kuser_helper_end: - -/* - * AArch32 sigreturn code - * - * For ARM syscalls, the syscall number has to be loaded into r7. - * We do not support an OABI userspace. - * - * For Thumb syscalls, we also pass the syscall number via r7. We therefore - * need two 16-bit instructions. - */ - .globl __aarch32_sigret_code_start -__aarch32_sigret_code_start: - - /* - * ARM Code - */ - .byte __NR_compat_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_sigreturn - - /* - * Thumb code - */ - .byte __NR_compat_sigreturn, 0x27 // svc #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0xdf // mov r7, #__NR_compat_sigreturn - - /* - * ARM code - */ - .byte __NR_compat_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_rt_sigreturn - - /* - * Thumb code - */ - .byte __NR_compat_rt_sigreturn, 0x27 // svc #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0xdf // mov r7, #__NR_compat_rt_sigreturn - - .globl __aarch32_sigret_code_end -__aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index aa9c94113700..239c16e3d02f 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec for arm64 * * Copyright (C) Linaro. * Copyright (C) Huawei Futurewei Technologies. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/interrupt.h> @@ -14,6 +11,8 @@ #include <linux/kernel.h> #include <linux/kexec.h> #include <linux/page-flags.h> +#include <linux/reboot.h> +#include <linux/set_memory.h> #include <linux/smp.h> #include <asm/cacheflush.h> @@ -23,12 +22,8 @@ #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/page.h> - -#include "cpu-reset.h" - -/* Global variables for the arm64_relocate_new_kernel routine. */ -extern const unsigned char arm64_relocate_new_kernel[]; -extern const unsigned long arm64_relocate_new_kernel_size; +#include <asm/sections.h> +#include <asm/trans_pgd.h> /** * kexec_image_info - For debugging output. 
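The kuser32.S rewrite above keeps only the helpers documented in kernel_user_helpers.rst, now gated behind CONFIG_KUSER_HELPERS and mapped at fixed addresses for compat tasks. A hedged userspace sketch of the cmpxchg helper — demo_atomic_inc() is invented, while the address and ABI come from that document, and this is only meaningful in a 32-bit (AArch32) binary:

typedef int (__kuser_cmpxchg_t)(int oldval, int newval, volatile int *ptr);
#define __kuser_cmpxchg (*(__kuser_cmpxchg_t *)0xffff0fc0)

static int demo_atomic_inc(volatile int *counter)
{
        int old, new;

        do {
                old = *counter;
                new = old + 1;
        } while (__kuser_cmpxchg(old, new, counter));   /* 0 means success */

        return new;
}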
@@ -37,23 +32,12 @@ extern const unsigned long arm64_relocate_new_kernel_size; static void _kexec_image_info(const char *func, int line, const struct kimage *kimage) { - unsigned long i; - - pr_debug("%s:%d:\n", func, line); - pr_debug(" kexec kimage info:\n"); - pr_debug(" type: %d\n", kimage->type); - pr_debug(" start: %lx\n", kimage->start); - pr_debug(" head: %lx\n", kimage->head); - pr_debug(" nr_segments: %lu\n", kimage->nr_segments); - - for (i = 0; i < kimage->nr_segments; i++) { - pr_debug(" segment[%lu]: %016lx - %016lx, 0x%lx bytes, %lu pages\n", - i, - kimage->segment[i].mem, - kimage->segment[i].mem + kimage->segment[i].memsz, - kimage->segment[i].memsz, - kimage->segment[i].memsz / PAGE_SIZE); - } + kexec_dprintk("%s:%d:\n", func, line); + kexec_dprintk(" kexec kimage info:\n"); + kexec_dprintk(" type: %d\n", kimage->type); + kexec_dprintk(" head: %lx\n", kimage->head); + kexec_dprintk(" kern_reloc: %pa\n", &kimage->arch.kern_reloc); + kexec_dprintk(" el2_vectors: %pa\n", &kimage->arch.el2_vectors); } void machine_kexec_cleanup(struct kimage *kimage) @@ -70,8 +54,6 @@ void machine_kexec_cleanup(struct kimage *kimage) */ int machine_kexec_prepare(struct kimage *kimage) { - kexec_image_info(kimage); - if (kimage->type != KEXEC_TYPE_CRASH && cpus_are_stuck_in_kernel()) { pr_err("Can't kexec: CPUs are stuck in the kernel.\n"); return -EBUSY; @@ -81,43 +63,6 @@ int machine_kexec_prepare(struct kimage *kimage) } /** - * kexec_list_flush - Helper to flush the kimage list and source pages to PoC. - */ -static void kexec_list_flush(struct kimage *kimage) -{ - kimage_entry_t *entry; - - for (entry = &kimage->head; ; entry++) { - unsigned int flag; - void *addr; - - /* flush the list entries. */ - __flush_dcache_area(entry, sizeof(kimage_entry_t)); - - flag = *entry & IND_FLAGS; - if (flag == IND_DONE) - break; - - addr = phys_to_virt(*entry & PAGE_MASK); - - switch (flag) { - case IND_INDIRECTION: - /* Set entry point just before the new list page. */ - entry = (kimage_entry_t *)addr - 1; - break; - case IND_SOURCE: - /* flush the source pages. */ - __flush_dcache_area(addr, PAGE_SIZE); - break; - case IND_DESTINATION: - break; - default: - BUG(); - } - } -} - -/** * kexec_segment_flush - Helper to flush the kimage segments to PoC. 
*/ static void kexec_segment_flush(const struct kimage *kimage) @@ -134,11 +79,84 @@ static void kexec_segment_flush(const struct kimage *kimage) kimage->segment[i].memsz, kimage->segment[i].memsz / PAGE_SIZE); - __flush_dcache_area(phys_to_virt(kimage->segment[i].mem), - kimage->segment[i].memsz); + dcache_clean_inval_poc( + (unsigned long)phys_to_virt(kimage->segment[i].mem), + (unsigned long)phys_to_virt(kimage->segment[i].mem) + + kimage->segment[i].memsz); } } +/* Allocates pages for kexec page table */ +static void *kexec_page_alloc(void *arg) +{ + struct kimage *kimage = arg; + struct page *page = kimage_alloc_control_pages(kimage, 0); + void *vaddr = NULL; + + if (!page) + return NULL; + + vaddr = page_address(page); + memset(vaddr, 0, PAGE_SIZE); + + return vaddr; +} + +int machine_kexec_post_load(struct kimage *kimage) +{ + int rc; + pgd_t *trans_pgd; + void *reloc_code = page_to_virt(kimage->control_code_page); + long reloc_size; + struct trans_pgd_info info = { + .trans_alloc_page = kexec_page_alloc, + .trans_alloc_arg = kimage, + }; + + /* If in place, relocation is not used, only flush next kernel */ + if (kimage->head & IND_DONE) { + kexec_segment_flush(kimage); + kexec_image_info(kimage); + return 0; + } + + kimage->arch.el2_vectors = 0; + if (is_hyp_nvhe()) { + rc = trans_pgd_copy_el2_vectors(&info, + &kimage->arch.el2_vectors); + if (rc) + return rc; + } + + /* Create a copy of the linear map */ + trans_pgd = kexec_page_alloc(kimage); + if (!trans_pgd) + return -ENOMEM; + rc = trans_pgd_create_copy(&info, &trans_pgd, PAGE_OFFSET, PAGE_END); + if (rc) + return rc; + kimage->arch.ttbr1 = __pa(trans_pgd); + kimage->arch.zero_page = __pa_symbol(empty_zero_page); + + reloc_size = __relocate_new_kernel_end - __relocate_new_kernel_start; + memcpy(reloc_code, __relocate_new_kernel_start, reloc_size); + kimage->arch.kern_reloc = __pa(reloc_code); + rc = trans_pgd_idmap_page(&info, &kimage->arch.ttbr0, + &kimage->arch.t0sz, reloc_code); + if (rc) + return rc; + kimage->arch.phys_offset = virt_to_phys(kimage) - (long)kimage; + + /* Flush the reloc_code in preparation for its execution. */ + dcache_clean_inval_poc((unsigned long)reloc_code, + (unsigned long)reloc_code + reloc_size); + icache_inval_pou((uintptr_t)reloc_code, + (uintptr_t)reloc_code + reloc_size); + kexec_image_info(kimage); + + return 0; +} + /** * machine_kexec - Do the kexec reboot. 
* @@ -146,8 +164,6 @@ static void kexec_segment_flush(const struct kimage *kimage) */ void machine_kexec(struct kimage *kimage) { - phys_addr_t reboot_code_buffer_phys; - void *reboot_code_buffer; bool in_kexec_crash = (kimage == kexec_crash_image); bool stuck_cpus = cpus_are_stuck_in_kernel(); @@ -158,106 +174,39 @@ void machine_kexec(struct kimage *kimage) WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()), "Some CPUs may be stale, kdump will be unreliable.\n"); - reboot_code_buffer_phys = page_to_phys(kimage->control_code_page); - reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys); - - kexec_image_info(kimage); - - pr_debug("%s:%d: control_code_page: %p\n", __func__, __LINE__, - kimage->control_code_page); - pr_debug("%s:%d: reboot_code_buffer_phys: %pa\n", __func__, __LINE__, - &reboot_code_buffer_phys); - pr_debug("%s:%d: reboot_code_buffer: %p\n", __func__, __LINE__, - reboot_code_buffer); - pr_debug("%s:%d: relocate_new_kernel: %p\n", __func__, __LINE__, - arm64_relocate_new_kernel); - pr_debug("%s:%d: relocate_new_kernel_size: 0x%lx(%lu) bytes\n", - __func__, __LINE__, arm64_relocate_new_kernel_size, - arm64_relocate_new_kernel_size); - - /* - * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use - * after the kernel is shut down. - */ - memcpy(reboot_code_buffer, arm64_relocate_new_kernel, - arm64_relocate_new_kernel_size); - - /* Flush the reboot_code_buffer in preparation for its execution. */ - __flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size); - - /* - * Although we've killed off the secondary CPUs, we don't update - * the online mask if we're handling a crash kernel and consequently - * need to avoid flush_icache_range(), which will attempt to IPI - * the offline CPUs. Therefore, we must use the __* variant here. - */ - __flush_icache_range((uintptr_t)reboot_code_buffer, - arm64_relocate_new_kernel_size); - - /* Flush the kimage list and its buffers. */ - kexec_list_flush(kimage); - - /* Flush the new image if already in place. */ - if ((kimage != kexec_crash_image) && (kimage->head & IND_DONE)) - kexec_segment_flush(kimage); - pr_info("Bye!\n"); local_daif_mask(); /* - * cpu_soft_restart will shutdown the MMU, disable data caches, then - * transfer control to the reboot_code_buffer which contains a copy of - * the arm64_relocate_new_kernel routine. arm64_relocate_new_kernel - * uses physical addressing to relocate the new image to its final - * position and transfers control to the image entry point when the - * relocation is complete. + * Both restart and kernel_reloc will shutdown the MMU, disable data + * caches. However, restart will start new kernel or purgatory directly, + * kernel_reloc contains the body of arm64_relocate_new_kernel * In kexec case, kimage->start points to purgatory assuming that * kernel entry and dtb address are embedded in purgatory by * userspace (kexec-tools). * In kexec_file case, the kernel starts directly without purgatory. 
*/ - cpu_soft_restart(reboot_code_buffer_phys, kimage->head, kimage->start, -#ifdef CONFIG_KEXEC_FILE - kimage->arch.dtb_mem); -#else - 0); -#endif + if (kimage->head & IND_DONE) { + typeof(cpu_soft_restart) *restart; + + cpu_install_idmap(); + restart = (void *)__pa_symbol(cpu_soft_restart); + restart(is_hyp_nvhe(), kimage->start, kimage->arch.dtb_mem, + 0, 0); + } else { + void (*kernel_reloc)(struct kimage *kimage); + + if (is_hyp_nvhe()) + __hyp_set_vectors(kimage->arch.el2_vectors); + cpu_install_ttbr0(kimage->arch.ttbr0, kimage->arch.t0sz); + kernel_reloc = (void *)kimage->arch.kern_reloc; + kernel_reloc(kimage); + } BUG(); /* Should never get here. */ } -static void machine_kexec_mask_interrupts(void) -{ - unsigned int i; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - struct irq_chip *chip; - int ret; - - chip = irq_desc_get_chip(desc); - if (!chip) - continue; - - /* - * First try to remove the active state. If this - * fails, try to EOI the interrupt. - */ - ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false); - - if (ret && irqd_irq_inprogress(&desc->irq_data) && - chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); - - if (chip->irq_mask) - chip->irq_mask(&desc->irq_data); - - if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) - chip->irq_disable(&desc->irq_data); - } -} - /** * machine_crash_shutdown - shutdown non-crashing cpus and save registers */ @@ -275,29 +224,7 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Starting crashdump kernel...\n"); } -void arch_kexec_protect_crashkres(void) -{ - int i; - - kexec_segment_flush(kexec_crash_image); - - for (i = 0; i < kexec_crash_image->nr_segments; i++) - set_memory_valid( - __phys_to_virt(kexec_crash_image->segment[i].mem), - kexec_crash_image->segment[i].memsz >> PAGE_SHIFT, 0); -} - -void arch_kexec_unprotect_crashkres(void) -{ - int i; - - for (i = 0; i < kexec_crash_image->nr_segments; i++) - set_memory_valid( - __phys_to_virt(kexec_crash_image->segment[i].mem), - kexec_crash_image->segment[i].memsz >> PAGE_SHIFT, 1); -} - -#ifdef CONFIG_HIBERNATION +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) /* * To preserve the crash dump kernel image, the relevant memory segments * should be mapped again around the hibernation. @@ -321,10 +248,10 @@ void crash_post_resume(void) * but does not hold any data of loaded kernel image. * * Note that all the pages in crash dump kernel memory have been initially - * marked as Reserved in kexec_reserve_crashkres_pages(). + * marked as Reserved as memory was allocated via memblock_reserve(). * * In hibernation, the pages which are Reserved and yet "nosave" are excluded - * from the hibernation iamge. crash_is_nosave() does thich check for crash + * from the hibernation image. crash_is_nosave() does thich check for crash * dump kernel and will reduce the total size of hibernation image. */ @@ -338,8 +265,13 @@ bool crash_is_nosave(unsigned long pfn) /* in reserved memory? 
*/ addr = __pfn_to_phys(pfn); - if ((addr < crashk_res.start) || (crashk_res.end < addr)) - return false; + if ((addr < crashk_res.start) || (crashk_res.end < addr)) { + if (!crashk_low_res.end) + return false; + + if ((addr < crashk_low_res.start) || (crashk_low_res.end < addr)) + return false; + } if (!kexec_crash_image) return true; @@ -361,7 +293,6 @@ void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) for (addr = begin; addr < end; addr += PAGE_SIZE) { page = phys_to_page(addr); - ClearPageReserved(page); free_reserved_page(page); } } diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index f2c211a6229b..410060ebd86d 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -15,18 +15,12 @@ #include <linux/kexec.h> #include <linux/libfdt.h> #include <linux/memblock.h> +#include <linux/of.h> #include <linux/of_fdt.h> -#include <linux/random.h> +#include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> #include <linux/vmalloc.h> -#include <asm/byteorder.h> - -/* relevant device tree properties */ -#define FDT_PROP_INITRD_START "linux,initrd-start" -#define FDT_PROP_INITRD_END "linux,initrd-end" -#define FDT_PROP_BOOTARGS "bootargs" -#define FDT_PROP_KASLR_SEED "kaslr-seed" const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_image_ops, @@ -35,147 +29,119 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { int arch_kimage_file_post_load_cleanup(struct kimage *image) { - vfree(image->arch.dtb); + kvfree(image->arch.dtb); image->arch.dtb = NULL; + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + return kexec_image_post_load_cleanup_default(image); } -static int setup_dtb(struct kimage *image, - unsigned long initrd_load_addr, unsigned long initrd_len, - char *cmdline, void *dtb) +#ifdef CONFIG_CRASH_DUMP +static int prepare_elf_headers(void **addr, unsigned long *sz) { - int off, ret; - - ret = fdt_path_offset(dtb, "/chosen"); - if (ret < 0) - goto out; - - off = ret; - - /* add bootargs */ - if (cmdline) { - ret = fdt_setprop_string(dtb, off, FDT_PROP_BOOTARGS, cmdline); - if (ret) - goto out; - } else { - ret = fdt_delprop(dtb, off, FDT_PROP_BOOTARGS); - if (ret && (ret != -FDT_ERR_NOTFOUND)) - goto out; - } - - /* add initrd-* */ - if (initrd_load_addr) { - ret = fdt_setprop_u64(dtb, off, FDT_PROP_INITRD_START, - initrd_load_addr); - if (ret) - goto out; - - ret = fdt_setprop_u64(dtb, off, FDT_PROP_INITRD_END, - initrd_load_addr + initrd_len); - if (ret) - goto out; - } else { - ret = fdt_delprop(dtb, off, FDT_PROP_INITRD_START); - if (ret && (ret != -FDT_ERR_NOTFOUND)) - goto out; - - ret = fdt_delprop(dtb, off, FDT_PROP_INITRD_END); - if (ret && (ret != -FDT_ERR_NOTFOUND)) - goto out; + struct crash_mem *cmem; + unsigned int nr_ranges; + int ret; + u64 i; + phys_addr_t start, end; + + nr_ranges = 2; /* for exclusion of crashkernel region */ + for_each_mem_range(i, &start, &end) + nr_ranges++; + + cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); + if (!cmem) + return -ENOMEM; + + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + for_each_mem_range(i, &start, &end) { + cmem->ranges[cmem->nr_ranges].start = start; + cmem->ranges[cmem->nr_ranges].end = end - 1; + cmem->nr_ranges++; } - /* add kaslr-seed */ - ret = fdt_delprop(dtb, off, FDT_PROP_KASLR_SEED); - if (ret == -FDT_ERR_NOTFOUND) - ret = 0; - else if (ret) + /* Exclude crashkernel region */ + ret = 
crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) goto out; - if (rng_is_initialized()) { - u64 seed = get_random_u64(); - ret = fdt_setprop_u64(dtb, off, FDT_PROP_KASLR_SEED, seed); + if (crashk_low_res.end) { + ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); if (ret) goto out; - } else { - pr_notice("RNG is not initialised: omitting \"%s\" property\n", - FDT_PROP_KASLR_SEED); } -out: - if (ret) - return (ret == -FDT_ERR_NOSPACE) ? -ENOMEM : -EINVAL; + ret = crash_prepare_elf64_headers(cmem, true, addr, sz); - return 0; +out: + kfree(cmem); + return ret; } +#endif /* - * More space needed so that we can add initrd, bootargs and kaslr-seed. + * Tries to add the initrd and DTB to the image. If it is not possible to find + * valid locations, this function will undo changes to the image and return non + * zero. */ -#define DTB_EXTRA_SPACE 0x1000 - -static int create_dtb(struct kimage *image, - unsigned long initrd_load_addr, unsigned long initrd_len, - char *cmdline, void **dtb) -{ - void *buf; - size_t buf_size; - int ret; - - buf_size = fdt_totalsize(initial_boot_params) - + strlen(cmdline) + DTB_EXTRA_SPACE; - - for (;;) { - buf = vmalloc(buf_size); - if (!buf) - return -ENOMEM; - - /* duplicate a device tree blob */ - ret = fdt_open_into(initial_boot_params, buf, buf_size); - if (ret) - return -EINVAL; - - ret = setup_dtb(image, initrd_load_addr, initrd_len, - cmdline, buf); - if (ret) { - vfree(buf); - if (ret == -ENOMEM) { - /* unlikely, but just in case */ - buf_size += DTB_EXTRA_SPACE; - continue; - } else { - return ret; - } - } - - /* trim it */ - fdt_pack(buf); - *dtb = buf; - - return 0; - } -} - int load_other_segments(struct kimage *image, unsigned long kernel_load_addr, unsigned long kernel_size, char *initrd, unsigned long initrd_len, char *cmdline) { - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; void *dtb = NULL; - unsigned long initrd_load_addr = 0, dtb_len; + unsigned long initrd_load_addr = 0, dtb_len, + orig_segments = image->nr_segments; int ret = 0; kbuf.image = image; /* not allocate anything below the kernel */ kbuf.buf_min = kernel_load_addr + kernel_size; +#ifdef CONFIG_CRASH_DUMP + /* load elf core header */ + void *headers; + unsigned long headers_sz; + if (image->type == KEXEC_TYPE_CRASH) { + ret = prepare_elf_headers(&headers, &headers_sz); + if (ret) { + pr_err("Preparing elf core header failed\n"); + goto out_err; + } + + kbuf.buffer = headers; + kbuf.bufsz = headers_sz; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = headers_sz; + kbuf.buf_align = SZ_64K; /* largest supported page size */ + kbuf.buf_max = ULONG_MAX; + kbuf.top_down = true; + + ret = kexec_add_buffer(&kbuf); + if (ret) { + vfree(headers); + goto out_err; + } + image->elf_headers = headers; + image->elf_load_addr = kbuf.mem; + image->elf_headers_sz = headers_sz; + + kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); + } +#endif + /* load initrd */ if (initrd) { kbuf.buffer = initrd; kbuf.bufsz = initrd_len; - kbuf.mem = 0; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; kbuf.memsz = initrd_len; kbuf.buf_align = 0; /* within 1GB-aligned window of up to 32GB in size */ @@ -188,21 +154,25 @@ int load_other_segments(struct kimage *image, goto out_err; initrd_load_addr = kbuf.mem; - pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - initrd_load_addr, initrd_len, initrd_len); + kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + 
initrd_load_addr, kbuf.bufsz, kbuf.memsz); } /* load dtb */ - ret = create_dtb(image, initrd_load_addr, initrd_len, cmdline, &dtb); - if (ret) { + dtb = of_kexec_alloc_and_setup_fdt(image, initrd_load_addr, + initrd_len, cmdline, 0); + if (!dtb) { pr_err("Preparing for new dtb failed\n"); + ret = -EINVAL; goto out_err; } + /* trim it */ + fdt_pack(dtb); dtb_len = fdt_totalsize(dtb); kbuf.buffer = dtb; kbuf.bufsz = dtb_len; - kbuf.mem = 0; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; kbuf.memsz = dtb_len; /* not across 2MB boundary */ kbuf.buf_align = SZ_2M; @@ -215,12 +185,13 @@ int load_other_segments(struct kimage *image, image->arch.dtb = dtb; image->arch.dtb_mem = kbuf.mem; - pr_debug("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kbuf.mem, dtb_len, dtb_len); + kexec_dprintk("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kbuf.mem, kbuf.bufsz, kbuf.memsz); return 0; out_err: - vfree(dtb); + image->nr_segments = orig_segments; + kvfree(dtb); return ret; } diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c index 255941394941..7afd370da9f4 100644 --- a/arch/arm64/kernel/module-plts.c +++ b/arch/arm64/kernel/module-plts.c @@ -1,14 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2014-2017 Linaro Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/elf.h> +#include <linux/ftrace.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/moduleloader.h> #include <linux/sort.h> static struct plt_entry __get_adrp_add_pair(u64 dst, u64 pc, @@ -39,7 +38,8 @@ struct plt_entry get_plt_entry(u64 dst, void *pc) return plt; } -bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b) +static bool plt_entries_equal(const struct plt_entry *a, + const struct plt_entry *b) { u64 p, q; @@ -66,17 +66,12 @@ bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b) (q + aarch64_insn_adrp_get_offset(le32_to_cpu(b->adrp))); } -static bool in_init(const struct module *mod, void *loc) -{ - return (u64)loc - (u64)mod->init_layout.base < mod->init_layout.size; -} - u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs, void *loc, const Elf64_Rela *rela, Elf64_Sym *sym) { - struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core : - &mod->arch.init; + struct mod_plt_sec *pltsec = !within_module_init((unsigned long)loc, mod) ? + &mod->arch.core : &mod->arch.init; struct plt_entry *plt = (struct plt_entry *)sechdrs[pltsec->plt_shndx].sh_addr; int i = pltsec->plt_num_entries; int j = i - 1; @@ -106,8 +101,8 @@ u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs, u64 module_emit_veneer_for_adrp(struct module *mod, Elf64_Shdr *sechdrs, void *loc, u64 val) { - struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core : - &mod->arch.init; + struct mod_plt_sec *pltsec = !within_module_init((unsigned long)loc, mod) ? + &mod->arch.core : &mod->arch.init; struct plt_entry *plt = (struct plt_entry *)sechdrs[pltsec->plt_shndx].sh_addr; int i = pltsec->plt_num_entries++; u32 br; @@ -133,7 +128,7 @@ u64 module_emit_veneer_for_adrp(struct module *mod, Elf64_Shdr *sechdrs, } #endif -#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b)) +#define cmp_3way(a, b) ((a) < (b) ? 
-1 : (a) > (b)) static int cmp_rela(const void *a, const void *b) { @@ -172,9 +167,6 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, switch (ELF64_R_TYPE(rela[i].r_info)) { case R_AARCH64_JUMP26: case R_AARCH64_CALL26: - if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - break; - /* * We only have to consider branch targets that resolve * to symbols that are defined in a different section. @@ -208,8 +200,7 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, break; case R_AARCH64_ADR_PREL_PG_HI21_NC: case R_AARCH64_ADR_PREL_PG_HI21: - if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) || - !cpus_have_const_cap(ARM64_WORKAROUND_843419)) + if (!cpus_have_final_cap(ARM64_WORKAROUND_843419)) break; /* @@ -222,7 +213,7 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, * increasing the section's alignment so that the * resulting address of this instruction is guaranteed * to equal the offset in that particular bit (as well - * as all less signficant bits). This ensures that the + * as all less significant bits). This ensures that the * address modulo 4 KB != 0xfff8 or 0xfffc (which would * have all ones in bits [11:3]) */ @@ -244,24 +235,55 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, } } - if (IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) && - cpus_have_const_cap(ARM64_WORKAROUND_843419)) + if (cpus_have_final_cap(ARM64_WORKAROUND_843419)) { /* * Add some slack so we can skip PLT slots that may trigger * the erratum due to the placement of the ADRP instruction. */ ret += DIV_ROUND_UP(ret, (SZ_4K / sizeof(struct plt_entry))); + } return ret; } +static bool branch_rela_needs_plt(Elf64_Sym *syms, Elf64_Rela *rela, + Elf64_Word dstidx) +{ + + Elf64_Sym *s = syms + ELF64_R_SYM(rela->r_info); + + if (s->st_shndx == dstidx) + return false; + + return ELF64_R_TYPE(rela->r_info) == R_AARCH64_JUMP26 || + ELF64_R_TYPE(rela->r_info) == R_AARCH64_CALL26; +} + +/* Group branch PLT relas at the front end of the array. 
*/ +static int partition_branch_plt_relas(Elf64_Sym *syms, Elf64_Rela *rela, + int numrels, Elf64_Word dstidx) +{ + int i = 0, j = numrels - 1; + + while (i < j) { + if (branch_rela_needs_plt(syms, &rela[i], dstidx)) + i++; + else if (branch_rela_needs_plt(syms, &rela[j], dstidx)) + swap(rela[i], rela[j]); + else + j--; + } + + return i; +} + int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { unsigned long core_plts = 0; unsigned long init_plts = 0; Elf64_Sym *syms = NULL; - Elf_Shdr *pltsec, *tramp = NULL; + Elf_Shdr *pltsec, *tramp = NULL, *init_tramp = NULL; int i; /* @@ -273,10 +295,12 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, mod->arch.core.plt_shndx = i; else if (!strcmp(secstrings + sechdrs[i].sh_name, ".init.plt")) mod->arch.init.plt_shndx = i; - else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE) && - !strcmp(secstrings + sechdrs[i].sh_name, + else if (!strcmp(secstrings + sechdrs[i].sh_name, ".text.ftrace_trampoline")) tramp = sechdrs + i; + else if (!strcmp(secstrings + sechdrs[i].sh_name, + ".init.text.ftrace_trampoline")) + init_tramp = sechdrs + i; else if (sechdrs[i].sh_type == SHT_SYMTAB) syms = (Elf64_Sym *)sechdrs[i].sh_addr; } @@ -292,7 +316,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, for (i = 0; i < ehdr->e_shnum; i++) { Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset; - int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); + int nents, numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info; if (sechdrs[i].sh_type != SHT_RELA) @@ -302,10 +326,16 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dstsec->sh_flags & SHF_EXECINSTR)) continue; - /* sort by type, symbol index and addend */ - sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL); + /* + * sort branch relocations requiring a PLT by type, symbol index + * and addend + */ + nents = partition_branch_plt_relas(syms, rels, numrels, + sechdrs[i].sh_info); + if (nents) + sort(rels, nents, sizeof(Elf64_Rela), cmp_rela, NULL); - if (strncmp(secstrings + dstsec->sh_name, ".init", 5) != 0) + if (!module_init_layout_section(secstrings + dstsec->sh_name)) core_plts += count_plts(syms, rels, numrels, sechdrs[i].sh_info, dstsec); else @@ -333,7 +363,14 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, tramp->sh_type = SHT_NOBITS; tramp->sh_flags = SHF_EXECINSTR | SHF_ALLOC; tramp->sh_addralign = __alignof__(struct plt_entry); - tramp->sh_size = sizeof(struct plt_entry); + tramp->sh_size = NR_FTRACE_PLTS * sizeof(struct plt_entry); + } + + if (init_tramp) { + init_tramp->sh_type = SHT_NOBITS; + init_tramp->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + init_tramp->sh_addralign = __alignof__(struct plt_entry); + init_tramp->sh_size = NR_FTRACE_PLTS * sizeof(struct plt_entry); } return 0; diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index f713e2fc4d75..24adb581af0e 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -1,72 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * AArch64 loadable module support. * * Copyright (C) 2012 ARM Limited * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * * Author: Will Deacon <will.deacon@arm.com> */ +#define pr_fmt(fmt) "Modules: " fmt + #include <linux/bitops.h> #include <linux/elf.h> -#include <linux/gfp.h> +#include <linux/ftrace.h> #include <linux/kasan.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/moduleloader.h> -#include <linux/vmalloc.h> +#include <linux/random.h> +#include <linux/scs.h> + #include <asm/alternative.h> #include <asm/insn.h> +#include <asm/scs.h> #include <asm/sections.h> - -void *module_alloc(unsigned long size) -{ - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - /* Silence the initial allocation */ - if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) - gfp_mask |= __GFP_NOWARN; - - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_base + MODULES_VSIZE, - gfp_mask, PAGE_KERNEL_EXEC, 0, - NUMA_NO_NODE, __builtin_return_address(0)); - - if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && - !IS_ENABLED(CONFIG_KASAN)) - /* - * KASAN can only deal with module allocations being served - * from the reserved module region, since the remainder of - * the vmalloc region is already backed by zero shadow pages, - * and punching holes into it is non-trivial. Since the module - * region is not randomized when KASAN is enabled, it is even - * less likely that the module region gets exhausted, so we - * can simply omit this fallback in that case. - */ - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_base + SZ_4G, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - - if (p && (kasan_module_alloc(p, size) < 0)) { - vfree(p); - return NULL; - } - - return p; -} +#include <asm/text-patching.h> enum aarch64_reloc_op { RELOC_OP_NONE, @@ -92,23 +49,67 @@ static u64 do_reloc(enum aarch64_reloc_op reloc_op, __le32 *place, u64 val) return 0; } -static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) +#define WRITE_PLACE(place, val, mod) do { \ + __typeof__(val) __val = (val); \ + \ + if (mod->state == MODULE_STATE_UNFORMED) \ + *(place) = __val; \ + else \ + aarch64_insn_copy(place, &(__val), sizeof(*place)); \ +} while (0) + +static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len, + struct module *me) { s64 sval = do_reloc(op, place, val); + /* + * The ELF psABI for AArch64 documents the 16-bit and 32-bit place + * relative and absolute relocations as having a range of [-2^15, 2^16) + * or [-2^31, 2^32), respectively. However, in order to be able to + * detect overflows reliably, we have to choose whether we interpret + * such quantities as signed or as unsigned, and stick with it. + * The way we organize our address space requires a signed + * interpretation of 32-bit relative references, so let's use that + * for all R_AARCH64_PRELxx relocations. This means our upper + * bound for overflow detection should be Sxx_MAX rather than Uxx_MAX. 
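The new comment above pins down the overflow bounds: 16/32-bit absolute data relocations are checked as unsigned quantities, while place-relative ones are treated as signed. As a plain illustration of the resulting accepted ranges (a standalone sketch, not kernel code):

#include <stdbool.h>
#include <stdint.h>

static bool fits_abs32(int64_t sval)    /* R_AARCH64_ABS32: [0, 2^32) */
{
        return sval >= 0 && sval <= UINT32_MAX;
}

static bool fits_prel32(int64_t sval)   /* R_AARCH64_PREL32: [-2^31, 2^31) */
{
        return sval >= INT32_MIN && sval <= INT32_MAX;
}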
+ */ + switch (len) { case 16: - *(s16 *)place = sval; - if (sval < S16_MIN || sval > U16_MAX) - return -ERANGE; + WRITE_PLACE((s16 *)place, sval, me); + switch (op) { + case RELOC_OP_ABS: + if (sval < 0 || sval > U16_MAX) + return -ERANGE; + break; + case RELOC_OP_PREL: + if (sval < S16_MIN || sval > S16_MAX) + return -ERANGE; + break; + default: + pr_err("Invalid 16-bit data relocation (%d)\n", op); + return 0; + } break; case 32: - *(s32 *)place = sval; - if (sval < S32_MIN || sval > U32_MAX) - return -ERANGE; + WRITE_PLACE((s32 *)place, sval, me); + switch (op) { + case RELOC_OP_ABS: + if (sval < 0 || sval > U32_MAX) + return -ERANGE; + break; + case RELOC_OP_PREL: + if (sval < S32_MIN || sval > S32_MAX) + return -ERANGE; + break; + default: + pr_err("Invalid 32-bit data relocation (%d)\n", op); + return 0; + } break; case 64: - *(s64 *)place = sval; + WRITE_PLACE((s64 *)place, sval, me); break; default: pr_err("Invalid length (%d) for data relocation\n", len); @@ -123,7 +124,8 @@ enum aarch64_insn_movw_imm_type { }; static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, enum aarch64_insn_movw_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type, + struct module *me) { u64 imm; s64 sval; @@ -155,7 +157,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction with the new encoding. */ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); - *place = cpu_to_le32(insn); + WRITE_PLACE(place, cpu_to_le32(insn), me); if (imm > U16_MAX) return -ERANGE; @@ -164,7 +166,8 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, - int lsb, int len, enum aarch64_insn_imm_type imm_type) + int lsb, int len, enum aarch64_insn_imm_type imm_type, + struct module *me) { u64 imm, imm_mask; s64 sval; @@ -180,7 +183,7 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, /* Update the instruction's immediate field. */ insn = aarch64_insn_encode_immediate(imm_type, insn, imm); - *place = cpu_to_le32(insn); + WRITE_PLACE(place, cpu_to_le32(insn), me); /* * Extract the upper value bits (including the sign bit) and @@ -199,17 +202,17 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, } static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, - __le32 *place, u64 val) + __le32 *place, u64 val, struct module *me) { u32 insn; if (!is_forbidden_offset_for_adrp(place)) return reloc_insn_imm(RELOC_OP_PAGE, place, val, 12, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, me); /* patch ADRP to ADR if it is in range */ if (!reloc_insn_imm(RELOC_OP_PREL, place, val & ~0xfff, 0, 21, - AARCH64_INSN_IMM_ADR)) { + AARCH64_INSN_IMM_ADR, me)) { insn = le32_to_cpu(*place); insn &= ~BIT(31); } else { @@ -221,7 +224,7 @@ static int reloc_insn_adrp(struct module *mod, Elf64_Shdr *sechdrs, AARCH64_INSN_BRANCH_NOLINK); } - *place = cpu_to_le32(insn); + WRITE_PLACE(place, cpu_to_le32(insn), me); return 0; } @@ -265,109 +268,113 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, /* Data relocations. 
*/ case R_AARCH64_ABS64: overflow_check = false; - ovf = reloc_data(RELOC_OP_ABS, loc, val, 64); + ovf = reloc_data(RELOC_OP_ABS, loc, val, 64, me); break; case R_AARCH64_ABS32: - ovf = reloc_data(RELOC_OP_ABS, loc, val, 32); + ovf = reloc_data(RELOC_OP_ABS, loc, val, 32, me); break; case R_AARCH64_ABS16: - ovf = reloc_data(RELOC_OP_ABS, loc, val, 16); + ovf = reloc_data(RELOC_OP_ABS, loc, val, 16, me); break; case R_AARCH64_PREL64: overflow_check = false; - ovf = reloc_data(RELOC_OP_PREL, loc, val, 64); + ovf = reloc_data(RELOC_OP_PREL, loc, val, 64, me); break; case R_AARCH64_PREL32: - ovf = reloc_data(RELOC_OP_PREL, loc, val, 32); + ovf = reloc_data(RELOC_OP_PREL, loc, val, 32, me); break; case R_AARCH64_PREL16: - ovf = reloc_data(RELOC_OP_PREL, loc, val, 16); + ovf = reloc_data(RELOC_OP_PREL, loc, val, 16, me); break; /* MOVW instruction relocations. */ case R_AARCH64_MOVW_UABS_G0_NC: overflow_check = false; + fallthrough; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; + fallthrough; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; + fallthrough; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_SABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_SABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVKZ); + AARCH64_INSN_IMM_MOVKZ, me); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; case R_AARCH64_MOVW_PREL_G3: /* We're using the top bits so we can't overflow. */ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 48, - AARCH64_INSN_IMM_MOVNZ); + AARCH64_INSN_IMM_MOVNZ, me); break; /* Immediate instruction relocations. 
*/ case R_AARCH64_LD_PREL_LO19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, me); break; case R_AARCH64_ADR_PREL_LO21: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21, - AARCH64_INSN_IMM_ADR); + AARCH64_INSN_IMM_ADR, me); break; case R_AARCH64_ADR_PREL_PG_HI21_NC: overflow_check = false; + fallthrough; case R_AARCH64_ADR_PREL_PG_HI21: - ovf = reloc_insn_adrp(me, sechdrs, loc, val); + ovf = reloc_insn_adrp(me, sechdrs, loc, val, me); if (ovf && ovf != -ERANGE) return ovf; break; @@ -375,48 +382,46 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_LDST8_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 0, 12, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST16_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 1, 11, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST32_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 2, 10, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST64_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 3, 9, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_LDST128_ABS_LO12_NC: overflow_check = false; ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 4, 8, - AARCH64_INSN_IMM_12); + AARCH64_INSN_IMM_12, me); break; case R_AARCH64_TSTBR14: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 14, - AARCH64_INSN_IMM_14); + AARCH64_INSN_IMM_14, me); break; case R_AARCH64_CONDBR19: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, - AARCH64_INSN_IMM_19); + AARCH64_INSN_IMM_19, me); break; case R_AARCH64_JUMP26: case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, - AARCH64_INSN_IMM_26); - - if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && - ovf == -ERANGE) { + AARCH64_INSN_IMM_26, me); + if (ovf == -ERANGE) { val = module_emit_plt_entry(me, sechdrs, loc, &rel[i], sym); if (!val) return -ENOEXEC; ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, - 26, AARCH64_INSN_IMM_26); + 26, AARCH64_INSN_IMM_26, me); } break; @@ -439,22 +444,76 @@ overflow: return -ENOEXEC; } +static inline void __init_plt(struct plt_entry *plt, unsigned long addr) +{ + *plt = get_plt_entry(addr, plt); +} + +static int module_init_ftrace_plt(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *mod) +{ +#if defined(CONFIG_DYNAMIC_FTRACE) + const Elf_Shdr *s; + struct plt_entry *plts; + + s = find_section(hdr, sechdrs, ".text.ftrace_trampoline"); + if (!s) + return -ENOEXEC; + + plts = (void *)s->sh_addr; + + __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR); + + mod->arch.ftrace_trampolines = plts; + + s = find_section(hdr, sechdrs, ".init.text.ftrace_trampoline"); + if (!s) + return -ENOEXEC; + + plts = (void *)s->sh_addr; + + __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR); + + mod->arch.init_ftrace_trampolines = plts; + +#endif + return 0; +} + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *se; - const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) { - if (strcmp(".altinstructions", secstrs + s->sh_name) == 0) - apply_alternatives_module((void *)s->sh_addr, s->sh_size); -#ifdef CONFIG_ARM64_MODULE_PLTS - if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE) && - !strcmp(".text.ftrace_trampoline", secstrs + s->sh_name)) - me->arch.ftrace_trampoline = (void 
*)s->sh_addr; -#endif + const Elf_Shdr *s; + int ret; + + s = find_section(hdr, sechdrs, ".altinstructions"); + if (s) { + ret = apply_alternatives_module((void *)s->sh_addr, s->sh_size); + if (ret < 0) { + pr_err("module %s: error occurred when applying alternatives\n", me->name); + return ret; + } } - return 0; + if (scs_is_dynamic()) { + s = find_section(hdr, sechdrs, ".init.eh_frame"); + if (s) { + /* + * Because we can reject modules that are malformed + * so SCS patching fails, skip dry run and try to patch + * it in place. If patching fails, the module would not + * be loaded anyway. + */ + ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size, true); + if (ret) { + pr_err("module %s: error occurred during dynamic SCS patching (%d)\n", + me->name, ret); + return -ENOEXEC; + } + } + } + + return module_init_ftrace_plt(hdr, sechdrs, me); } diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds deleted file mode 100644 index 22e36a21c113..000000000000 --- a/arch/arm64/kernel/module.lds +++ /dev/null @@ -1,5 +0,0 @@ -SECTIONS { - .plt (NOLOAD) : { BYTE(0) } - .init.plt (NOLOAD) : { BYTE(0) } - .text.ftrace_trampoline (NOLOAD) : { BYTE(0) } -} diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c new file mode 100644 index 000000000000..32148bf09c1d --- /dev/null +++ b/arch/arm64/kernel/mte.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 ARM Ltd. + */ + +#include <linux/bitops.h> +#include <linux/cpu.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/prctl.h> +#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/string.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/thread_info.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/uio.h> + +#include <asm/barrier.h> +#include <asm/cpufeature.h> +#include <asm/mte.h> +#include <asm/ptrace.h> +#include <asm/sysreg.h> + +static DEFINE_PER_CPU_READ_MOSTLY(u64, mte_tcf_preferred); + +#ifdef CONFIG_KASAN_HW_TAGS +/* + * The asynchronous and asymmetric MTE modes have the same behavior for + * store operations. This flag is set when either of these modes is enabled. + */ +DEFINE_STATIC_KEY_FALSE(mte_async_or_asymm_mode); +EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); +#endif + +void mte_sync_tags(pte_t pte, unsigned int nr_pages) +{ + struct page *page = pte_page(pte); + struct folio *folio = page_folio(page); + unsigned long i; + + if (folio_test_hugetlb(folio)) { + unsigned long nr = folio_nr_pages(folio); + + /* Hugetlb MTE flags are set for head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); + + return; + } + + /* if PG_mte_tagged is set, tags have already been initialised */ + for (i = 0; i < nr_pages; i++, page++) { + if (try_page_mte_tagging(page)) { + mte_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } + } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); +} + +int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = page_address(page1); + addr2 = page_address(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + + if (!system_supports_mte() || ret) + return ret; + + /* + * If the page content is identical but at least one of the pages is + * tagged, return non-zero to avoid KSM merging. 
If only one of the + * pages is tagged, __set_ptes() may zero or change the tags of the + * other page via mte_sync_tags(). + */ + if (page_mte_tagged(page1) || page_mte_tagged(page2)) + return addr1 != addr2; + + return ret; +} + +static inline void __mte_enable_kernel(const char *mode, unsigned long tcf) +{ + /* Enable MTE Sync Mode for EL1. */ + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK, + SYS_FIELD_PREP(SCTLR_EL1, TCF, tcf)); + isb(); + + pr_info_once("MTE: enabled in %s mode at EL1\n", mode); +} + +#ifdef CONFIG_KASAN_HW_TAGS +void mte_enable_kernel_sync(void) +{ + /* + * Make sure we enter this function when no PE has set + * async mode previously. + */ + WARN_ONCE(system_uses_mte_async_or_asymm_mode(), + "MTE async mode enabled system wide!"); + + __mte_enable_kernel("synchronous", SCTLR_EL1_TCF_SYNC); +} + +void mte_enable_kernel_async(void) +{ + __mte_enable_kernel("asynchronous", SCTLR_EL1_TCF_ASYNC); + + /* + * MTE async mode is set system wide by the first PE that + * executes this function. + * + * Note: If in future KASAN acquires a runtime switching + * mode in between sync and async, this strategy needs + * to be reviewed. + */ + if (!system_uses_mte_async_or_asymm_mode()) + static_branch_enable(&mte_async_or_asymm_mode); +} + +void mte_enable_kernel_asymm(void) +{ + if (cpus_have_cap(ARM64_MTE_ASYMM)) { + __mte_enable_kernel("asymmetric", SCTLR_EL1_TCF_ASYMM); + + /* + * MTE asymm mode behaves as async mode for store + * operations. The mode is set system wide by the + * first PE that executes this function. + * + * Note: If in future KASAN acquires a runtime switching + * mode in between sync and async, this strategy needs + * to be reviewed. + */ + if (!system_uses_mte_async_or_asymm_mode()) + static_branch_enable(&mte_async_or_asymm_mode); + } else { + /* + * If the CPU does not support MTE asymmetric mode the + * kernel falls back on synchronous mode which is the + * default for kasan=on. + */ + mte_enable_kernel_sync(); + } +} + +int mte_enable_kernel_store_only(void) +{ + /* + * If the CPU does not support MTE store only, + * the kernel checks all operations. + */ + if (!cpus_have_cap(ARM64_MTE_STORE_ONLY)) + return -EINVAL; + + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCSO_MASK, + SYS_FIELD_PREP(SCTLR_EL1, TCSO, 1)); + isb(); + + pr_info_once("MTE: enabled store only mode at EL1\n"); + + return 0; +} +#endif + +#ifdef CONFIG_KASAN_HW_TAGS +void mte_check_tfsr_el1(void) +{ + u64 tfsr_el1 = read_sysreg_s(SYS_TFSR_EL1); + + if (unlikely(tfsr_el1 & SYS_TFSR_EL1_TF1)) { + /* + * Note: isb() is not required after this direct write + * because there is no indirect read subsequent to it + * (per ARM DDI 0487F.c table D13-1). + */ + write_sysreg_s(0, SYS_TFSR_EL1); + + kasan_report_async(); + } +} +#endif + +/* + * This is where we actually resolve the system and process MTE mode + * configuration into an actual value in SCTLR_EL1 that affects + * userspace. + */ +static void mte_update_sctlr_user(struct task_struct *task) +{ + /* + * This must be called with preemption disabled and can only be called + * on the current or next task since the CPU must match where the thread + * is going to run. The caller is responsible for calling + * update_sctlr_el1() later in the same preemption disabled block. 
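The body of mte_update_sctlr_user() below resolves the per-CPU preference against the task's requested modes with an asymm > async > sync precedence; a minimal standalone sketch of that resolution (illustrative bit names, not the kernel's MTE_CTRL_* values) looks like this:

enum tcf_mode { TCF_NONE, TCF_SYNC, TCF_ASYNC, TCF_ASYMM };

#define CTL_TCF_SYNC	(1u << 0)
#define CTL_TCF_ASYNC	(1u << 1)
#define CTL_TCF_ASYMM	(1u << 2)

/* Honour the CPU preference only when it overlaps the task's request. */
static enum tcf_mode resolve_tcf(unsigned int requested, unsigned int preferred)
{
	unsigned int resolved = (requested & preferred) ? preferred : requested;

	if (resolved & CTL_TCF_ASYMM)
		return TCF_ASYMM;
	if (resolved & CTL_TCF_ASYNC)
		return TCF_ASYNC;
	if (resolved & CTL_TCF_SYNC)
		return TCF_SYNC;
	return TCF_NONE;
}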
+ */ + unsigned long sctlr = task->thread.sctlr_user; + unsigned long mte_ctrl = task->thread.mte_ctrl; + unsigned long pref, resolved_mte_tcf; + + pref = __this_cpu_read(mte_tcf_preferred); + /* + * If there is no overlap between the system preferred and + * program requested values go with what was requested. + */ + resolved_mte_tcf = (mte_ctrl & pref) ? pref : mte_ctrl; + sctlr &= ~(SCTLR_EL1_TCF0_MASK | SCTLR_EL1_TCSO0_MASK); + /* + * Pick an actual setting. The order in which we check for + * set bits and map into register values determines our + * default order. + */ + if (resolved_mte_tcf & MTE_CTRL_TCF_ASYMM) + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYMM); + else if (resolved_mte_tcf & MTE_CTRL_TCF_ASYNC) + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, ASYNC); + else if (resolved_mte_tcf & MTE_CTRL_TCF_SYNC) + sctlr |= SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF0, SYNC); + + if (mte_ctrl & MTE_CTRL_STORE_ONLY) + sctlr |= SYS_FIELD_PREP(SCTLR_EL1, TCSO0, 1); + + task->thread.sctlr_user = sctlr; +} + +static void mte_update_gcr_excl(struct task_struct *task) +{ + /* + * SYS_GCR_EL1 will be set to current->thread.mte_ctrl value by + * mte_set_user_gcr() in kernel_exit, but only if KASAN is enabled. + */ + if (kasan_hw_tags_enabled()) + return; + + write_sysreg_s( + ((task->thread.mte_ctrl >> MTE_CTRL_GCR_USER_EXCL_SHIFT) & + SYS_GCR_EL1_EXCL_MASK) | SYS_GCR_EL1_RRND, + SYS_GCR_EL1); +} + +#ifdef CONFIG_KASAN_HW_TAGS +/* Only called from assembly, silence sparse */ +void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst); + +void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); /* Branch -> NOP */ + + if (kasan_hw_tags_enabled()) + *updptr = cpu_to_le32(aarch64_insn_gen_nop()); +} +#endif + +void mte_thread_init_user(void) +{ + if (!system_supports_mte()) + return; + + /* clear any pending asynchronous tag fault */ + dsb(ish); + write_sysreg_s(0, SYS_TFSRE0_EL1); + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + /* disable tag checking and reset tag generation mask */ + set_mte_ctrl(current, 0); +} + +void mte_thread_switch(struct task_struct *next) +{ + if (!system_supports_mte()) + return; + + mte_update_sctlr_user(next); + mte_update_gcr_excl(next); + + /* TCO may not have been disabled on exception entry for the current task. */ + mte_disable_tco_entry(next); + + /* + * Check if an async tag exception occurred at EL1. + * + * Note: On the context switch path we rely on the dsb() present + * in __switch_to() to guarantee that the indirect writes to TFSR_EL1 + * are synchronized before this point. + */ + isb(); + mte_check_tfsr_el1(); +} + +void mte_cpu_setup(void) +{ + u64 rgsr; + + /* + * CnP must be enabled only after the MAIR_EL1 register has been set + * up. Inconsistent MAIR_EL1 between CPUs sharing the same TLB may + * lead to the wrong memory type being used for a brief window during + * CPU power-up. + * + * CnP is not a boot feature so MTE gets enabled before CnP, but let's + * make sure that is the case. 
+ */ + BUG_ON(read_sysreg(ttbr0_el1) & TTBR_CNP_BIT); + BUG_ON(read_sysreg(ttbr1_el1) & TTBR_CNP_BIT); + + /* Normal Tagged memory type at the corresponding MAIR index */ + sysreg_clear_set(mair_el1, + MAIR_ATTRIDX(MAIR_ATTR_MASK, MT_NORMAL_TAGGED), + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_TAGGED, + MT_NORMAL_TAGGED)); + + write_sysreg_s(KERNEL_GCR_EL1, SYS_GCR_EL1); + + /* + * If GCR_EL1.RRND=1 is implemented the same way as RRND=0, then + * RGSR_EL1.SEED must be non-zero for IRG to produce + * pseudorandom numbers. As RGSR_EL1 is UNKNOWN out of reset, we + * must initialize it. + */ + rgsr = (read_sysreg(CNTVCT_EL0) & SYS_RGSR_EL1_SEED_MASK) << + SYS_RGSR_EL1_SEED_SHIFT; + if (rgsr == 0) + rgsr = 1 << SYS_RGSR_EL1_SEED_SHIFT; + write_sysreg_s(rgsr, SYS_RGSR_EL1); + + /* clear any pending tag check faults in TFSR*_EL1 */ + write_sysreg_s(0, SYS_TFSR_EL1); + write_sysreg_s(0, SYS_TFSRE0_EL1); + + local_flush_tlb_all(); +} + +void mte_suspend_enter(void) +{ + if (!system_supports_mte()) + return; + + /* + * The barriers are required to guarantee that the indirect writes + * to TFSR_EL1 are synchronized before we report the state. + */ + dsb(nsh); + isb(); + + /* Report SYS_TFSR_EL1 before suspend entry */ + mte_check_tfsr_el1(); +} + +void mte_suspend_exit(void) +{ + if (!system_supports_mte()) + return; + + mte_cpu_setup(); +} + +long set_mte_ctrl(struct task_struct *task, unsigned long arg) +{ + u64 mte_ctrl = (~((arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT) & + SYS_GCR_EL1_EXCL_MASK) << MTE_CTRL_GCR_USER_EXCL_SHIFT; + + if (!system_supports_mte()) + return 0; + + if (arg & PR_MTE_TCF_ASYNC) + mte_ctrl |= MTE_CTRL_TCF_ASYNC; + if (arg & PR_MTE_TCF_SYNC) + mte_ctrl |= MTE_CTRL_TCF_SYNC; + + /* + * If the system supports it and both sync and async modes are + * specified then implicitly enable asymmetric mode. + * Userspace could see a mix of both sync and async anyway due + * to differing or changing defaults on CPUs. + */ + if (cpus_have_cap(ARM64_MTE_ASYMM) && + (arg & PR_MTE_TCF_ASYNC) && + (arg & PR_MTE_TCF_SYNC)) + mte_ctrl |= MTE_CTRL_TCF_ASYMM; + + if (arg & PR_MTE_STORE_ONLY) + mte_ctrl |= MTE_CTRL_STORE_ONLY; + + task->thread.mte_ctrl = mte_ctrl; + if (task == current) { + preempt_disable(); + mte_update_sctlr_user(task); + mte_update_gcr_excl(task); + update_sctlr_el1(task->thread.sctlr_user); + preempt_enable(); + } + + return 0; +} + +long get_mte_ctrl(struct task_struct *task) +{ + unsigned long ret; + u64 mte_ctrl = task->thread.mte_ctrl; + u64 incl = (~mte_ctrl >> MTE_CTRL_GCR_USER_EXCL_SHIFT) & + SYS_GCR_EL1_EXCL_MASK; + + if (!system_supports_mte()) + return 0; + + ret = incl << PR_MTE_TAG_SHIFT; + if (mte_ctrl & MTE_CTRL_TCF_ASYNC) + ret |= PR_MTE_TCF_ASYNC; + if (mte_ctrl & MTE_CTRL_TCF_SYNC) + ret |= PR_MTE_TCF_SYNC; + if (mte_ctrl & MTE_CTRL_STORE_ONLY) + ret |= PR_MTE_STORE_ONLY; + + return ret; +} + +/* + * Access MTE tags in another process' address space as given in mm. Update + * the number of tags copied. Return 0 if any tags copied, error otherwise. + * Inspired by __access_remote_vm(). 
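For context, the mte_ctrl value parsed by set_mte_ctrl()/get_mte_ctrl() above arrives via the PR_SET_TAGGED_ADDR_CTRL prctl. A minimal userspace caller, assuming headers new enough to define the PR_MTE_* constants, might look like this:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/*
	 * Request synchronous tag checking and allow IRG to generate any
	 * tag except 0; set_mte_ctrl() folds this into MTE_CTRL_* bits and
	 * a GCR_EL1 exclude mask for the calling task.
	 */
	unsigned long ctrl = PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC |
			     (0xfffeUL << PR_MTE_TAG_SHIFT);

	if (prctl(PR_SET_TAGGED_ADDR_CTRL, ctrl, 0, 0, 0)) {
		perror("PR_SET_TAGGED_ADDR_CTRL");
		return 1;
	}
	return 0;
}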
+ */ +static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, + struct iovec *kiov, unsigned int gup_flags) +{ + void __user *buf = kiov->iov_base; + size_t len = kiov->iov_len; + int err = 0; + int write = gup_flags & FOLL_WRITE; + + if (!access_ok(buf, len)) + return -EFAULT; + + if (mmap_read_lock_killable(mm)) + return -EIO; + + while (len) { + struct vm_area_struct *vma; + unsigned long tags, offset; + void *maddr; + struct page *page = get_user_page_vma_remote(mm, addr, + gup_flags, &vma); + struct folio *folio; + + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } + + /* + * Only copy tags if the page has been mapped as PROT_MTE + * (PG_mte_tagged set). Otherwise the tags are not valid and + * not accessible to user. Moreover, an mprotect(PROT_MTE) + * would cause the existing tags to be cleared if the page + * was never mapped with PROT_MTE. + */ + if (!(vma->vm_flags & VM_MTE)) { + err = -EOPNOTSUPP; + put_page(page); + break; + } + + folio = page_folio(page); + if (folio_test_hugetlb(folio)) + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio) && + !is_huge_zero_folio(folio)); + else + WARN_ON_ONCE(!page_mte_tagged(page) && !is_zero_page(page)); + + /* limit access to the end of the page */ + offset = offset_in_page(addr); + tags = min(len, (PAGE_SIZE - offset) / MTE_GRANULE_SIZE); + + maddr = page_address(page); + if (write) { + tags = mte_copy_tags_from_user(maddr + offset, buf, tags); + set_page_dirty_lock(page); + } else { + tags = mte_copy_tags_to_user(buf, maddr + offset, tags); + } + put_page(page); + + /* error accessing the tracer's buffer */ + if (!tags) + break; + + len -= tags; + buf += tags; + addr += tags * MTE_GRANULE_SIZE; + } + mmap_read_unlock(mm); + + /* return an error if no tags copied */ + kiov->iov_len = buf - kiov->iov_base; + if (!kiov->iov_len) { + /* check for error accessing the tracee's address space */ + if (err) + return -EIO; + else + return -EFAULT; + } + + return 0; +} + +/* + * Copy MTE tags in another process' address space at 'addr' to/from tracer's + * iovec buffer. Return 0 on success. Inspired by ptrace_access_vm(). 
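The ptrace entry point below, mte_ptrace_copy_tags(), backs the PTRACE_PEEKMTETAGS/POKEMTETAGS requests. A sketch of a tracer-side read, one tag byte per 16-byte granule (the wrapper name is made up), could be:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>

#ifndef PTRACE_PEEKMTETAGS
#define PTRACE_PEEKMTETAGS	33	/* arm64-specific request, asm/ptrace.h */
#endif

/*
 * Read up to 'ntags' tags from a stopped tracee starting at 'remote'.
 * On success the kernel rewrites iov_len with the number of tags copied,
 * which is how short copies are reported.
 */
static long peek_mte_tags(pid_t pid, void *remote, unsigned char *tags,
			  size_t ntags)
{
	struct iovec iov = { .iov_base = tags, .iov_len = ntags };

	if (ptrace(PTRACE_PEEKMTETAGS, pid, remote, &iov))
		return -1;

	return (long)iov.iov_len;
}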
+ */ +static int access_remote_tags(struct task_struct *tsk, unsigned long addr, + struct iovec *kiov, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return -EPERM; + + if (!tsk->ptrace || (current != tsk->parent) || + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptracer_capable(tsk, mm->user_ns))) { + mmput(mm); + return -EPERM; + } + + ret = __access_remote_tags(mm, addr, kiov, gup_flags); + mmput(mm); + + return ret; +} + +int mte_ptrace_copy_tags(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret; + struct iovec kiov; + struct iovec __user *uiov = (void __user *)data; + unsigned int gup_flags = FOLL_FORCE; + + if (!system_supports_mte()) + return -EIO; + + if (get_user(kiov.iov_base, &uiov->iov_base) || + get_user(kiov.iov_len, &uiov->iov_len)) + return -EFAULT; + + if (request == PTRACE_POKEMTETAGS) + gup_flags |= FOLL_WRITE; + + /* align addr to the MTE tag granule */ + addr &= MTE_GRANULE_MASK; + + ret = access_remote_tags(child, addr, &kiov, gup_flags); + if (!ret) + ret = put_user(kiov.iov_len, &uiov->iov_len); + + return ret; +} + +static ssize_t mte_tcf_preferred_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + switch (per_cpu(mte_tcf_preferred, dev->id)) { + case MTE_CTRL_TCF_ASYNC: + return sysfs_emit(buf, "async\n"); + case MTE_CTRL_TCF_SYNC: + return sysfs_emit(buf, "sync\n"); + case MTE_CTRL_TCF_ASYMM: + return sysfs_emit(buf, "asymm\n"); + default: + return sysfs_emit(buf, "???\n"); + } +} + +static ssize_t mte_tcf_preferred_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u64 tcf; + + if (sysfs_streq(buf, "async")) + tcf = MTE_CTRL_TCF_ASYNC; + else if (sysfs_streq(buf, "sync")) + tcf = MTE_CTRL_TCF_SYNC; + else if (cpus_have_cap(ARM64_MTE_ASYMM) && sysfs_streq(buf, "asymm")) + tcf = MTE_CTRL_TCF_ASYMM; + else + return -EINVAL; + + device_lock(dev); + per_cpu(mte_tcf_preferred, dev->id) = tcf; + device_unlock(dev); + + return count; +} +static DEVICE_ATTR_RW(mte_tcf_preferred); + +static int register_mte_tcf_preferred_sysctl(void) +{ + unsigned int cpu; + + if (!system_supports_mte()) + return 0; + + for_each_possible_cpu(cpu) { + per_cpu(mte_tcf_preferred, cpu) = MTE_CTRL_TCF_ASYNC; + device_create_file(get_cpu_device(cpu), + &dev_attr_mte_tcf_preferred); + } + + return 0; +} +subsys_initcall(register_mte_tcf_preferred_sysctl); + +/* + * Return 0 on success, the number of bytes not probed otherwise. + */ +size_t mte_probe_user_range(const char __user *uaddr, size_t size) +{ + const char __user *end = uaddr + size; + char val; + + __raw_get_user(val, uaddr, efault); + + uaddr = PTR_ALIGN(uaddr, MTE_GRANULE_SIZE); + while (uaddr < end) { + /* + * A read is sufficient for mte, the caller should have probed + * for the pte write permission if required. + */ + __raw_get_user(val, uaddr, efault); + uaddr += MTE_GRANULE_SIZE; + } + (void)val; + + return 0; + +efault: + return end - uaddr; +} diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index 75c158b0353f..aa718d6a9274 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -1,25 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. * * Copyright (C) 2013 Citrix Systems * * Author: Stefano Stabellini <stefano.stabellini@eu.citrix.com> */ +#define pr_fmt(fmt) "arm-pv: " fmt + +#include <linux/arm-smccc.h> +#include <linux/cpuhotplug.h> #include <linux/export.h> +#include <linux/io.h> #include <linux/jump_label.h> +#include <linux/printk.h> +#include <linux/psci.h> +#include <linux/reboot.h> +#include <linux/slab.h> #include <linux/types.h> +#include <linux/static_call.h> + #include <asm/paravirt.h> +#include <asm/pvclock-abi.h> +#include <asm/smp_plat.h> struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; -struct paravirt_patch_template pv_ops; -EXPORT_SYMBOL_GPL(pv_ops); +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); + +struct pv_time_stolen_time_region { + struct pvclock_vcpu_stolen_time __rcu *kaddr; +}; + +static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region); + +static bool steal_acc = true; +static int __init parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); + +/* return stolen time in ns by asking the hypervisor */ +static u64 para_steal_clock(int cpu) +{ + struct pvclock_vcpu_stolen_time *kaddr = NULL; + struct pv_time_stolen_time_region *reg; + u64 ret = 0; + + reg = per_cpu_ptr(&stolen_time_region, cpu); + + /* + * paravirt_steal_clock() may be called before the CPU + * online notification callback runs. Until the callback + * has run we just return zero. 
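For reference, the shared record that para_steal_clock() reads below is the Paravirtualized Time (Arm DEN 0057A) stolen-time structure. Its layout, as declared in asm/pvclock-abi.h, is roughly the following (reproduced from memory; treat the padding size as an assumption):

struct pvclock_vcpu_stolen_time {
	__le32 revision;
	__le32 attributes;
	__le64 stolen_time;
	/* Structure must be 64 byte aligned, pad to that size */
	u8 padding[48];
} __packed;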
+ */ + rcu_read_lock(); + kaddr = rcu_dereference(reg->kaddr); + if (!kaddr) { + rcu_read_unlock(); + return 0; + } + + ret = le64_to_cpu(READ_ONCE(kaddr->stolen_time)); + rcu_read_unlock(); + return ret; +} + +static int stolen_time_cpu_down_prepare(unsigned int cpu) +{ + struct pvclock_vcpu_stolen_time *kaddr = NULL; + struct pv_time_stolen_time_region *reg; + + reg = this_cpu_ptr(&stolen_time_region); + if (!reg->kaddr) + return 0; + + kaddr = rcu_replace_pointer(reg->kaddr, NULL, true); + synchronize_rcu(); + memunmap(kaddr); + + return 0; +} + +static int stolen_time_cpu_online(unsigned int cpu) +{ + struct pvclock_vcpu_stolen_time *kaddr = NULL; + struct pv_time_stolen_time_region *reg; + struct arm_smccc_res res; + + reg = this_cpu_ptr(&stolen_time_region); + + arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_TIME_ST, &res); + + if (res.a0 == SMCCC_RET_NOT_SUPPORTED) + return -EINVAL; + + kaddr = memremap(res.a0, + sizeof(struct pvclock_vcpu_stolen_time), + MEMREMAP_WB); + + rcu_assign_pointer(reg->kaddr, kaddr); + + if (!reg->kaddr) { + pr_warn("Failed to map stolen time data structure\n"); + return -ENOMEM; + } + + if (le32_to_cpu(kaddr->revision) != 0 || + le32_to_cpu(kaddr->attributes) != 0) { + pr_warn_once("Unexpected revision or attributes in stolen time data\n"); + return -ENXIO; + } + + return 0; +} + +static int __init pv_time_init_stolen_time(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "hypervisor/arm/pvtime:online", + stolen_time_cpu_online, + stolen_time_cpu_down_prepare); + if (ret < 0) + return ret; + return 0; +} + +static bool __init has_pv_steal_clock(void) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_HV_PV_TIME_FEATURES, &res); + + if (res.a0 != SMCCC_RET_SUCCESS) + return false; + + arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_TIME_FEATURES, + ARM_SMCCC_HV_PV_TIME_ST, &res); + + return (res.a0 == SMCCC_RET_SUCCESS); +} + +int __init pv_time_init(void) +{ + int ret; + + if (!has_pv_steal_clock()) + return 0; + + ret = pv_time_init_stolen_time(); + if (ret) + return ret; + + static_call_update(pv_steal_clock, para_steal_clock); + + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + + pr_info("using stolen time PV\n"); + + return 0; +} diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c new file mode 100644 index 000000000000..1041bc67a3ee --- /dev/null +++ b/arch/arm64/kernel/patching.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/stop_machine.h> +#include <linux/uaccess.h> + +#include <asm/cacheflush.h> +#include <asm/fixmap.h> +#include <asm/insn.h> +#include <asm/kprobes.h> +#include <asm/text-patching.h> +#include <asm/sections.h> + +static DEFINE_RAW_SPINLOCK(patch_lock); + +static bool is_exit_text(unsigned long addr) +{ + /* discarded with init text/data */ + return system_state < SYSTEM_RUNNING && + addr >= (unsigned long)__exittext_begin && + addr < (unsigned long)__exittext_end; +} + +static bool is_image_text(unsigned long addr) +{ + return core_kernel_text(addr) || is_exit_text(addr); +} + +static void __kprobes *patch_map(void *addr, int fixmap) +{ + phys_addr_t phys; + + if (is_image_text((unsigned long)addr)) { + phys = __pa_symbol(addr); + } else { + struct page *page = vmalloc_to_page(addr); + BUG_ON(!page); + phys = page_to_phys(page) + offset_in_page(addr); + } + + return 
(void *)set_fixmap_offset(fixmap, phys); +} + +static void __kprobes patch_unmap(int fixmap) +{ + clear_fixmap(fixmap); +} +/* + * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always + * little-endian. + */ +int __kprobes aarch64_insn_read(void *addr, u32 *insnp) +{ + int ret; + __le32 val; + + ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE); + if (!ret) + *insnp = le32_to_cpu(val); + + return ret; +} + +static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) +{ + void *waddr = addr; + unsigned long flags = 0; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + waddr = patch_map(addr, FIX_TEXT_POKE0); + + ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); + + patch_unmap(FIX_TEXT_POKE0); + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} + +int __kprobes aarch64_insn_write(void *addr, u32 insn) +{ + return __aarch64_insn_write(addr, cpu_to_le32(insn)); +} + +noinstr int aarch64_insn_write_literal_u64(void *addr, u64 val) +{ + u64 *waddr; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&patch_lock, flags); + waddr = patch_map(addr, FIX_TEXT_POKE0); + + ret = copy_to_kernel_nofault(waddr, &val, sizeof(val)); + + patch_unmap(FIX_TEXT_POKE0); + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} + +typedef void text_poke_f(void *dst, void *src, size_t patched, size_t len); + +static void *__text_poke(text_poke_f func, void *addr, void *src, size_t len) +{ + unsigned long flags; + size_t patched = 0; + size_t size; + void *waddr; + void *ptr; + + raw_spin_lock_irqsave(&patch_lock, flags); + + while (patched < len) { + ptr = addr + patched; + size = min_t(size_t, PAGE_SIZE - offset_in_page(ptr), + len - patched); + + waddr = patch_map(ptr, FIX_TEXT_POKE0); + func(waddr, src, patched, size); + patch_unmap(FIX_TEXT_POKE0); + + patched += size; + } + raw_spin_unlock_irqrestore(&patch_lock, flags); + + flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len); + + return addr; +} + +static void text_poke_memcpy(void *dst, void *src, size_t patched, size_t len) +{ + copy_to_kernel_nofault(dst, src + patched, len); +} + +static void text_poke_memset(void *dst, void *src, size_t patched, size_t len) +{ + u32 c = *(u32 *)src; + + memset32(dst, c, len / 4); +} + +/** + * aarch64_insn_copy - Copy instructions into (an unused part of) RX memory + * @dst: address to modify + * @src: source of the copy + * @len: length to copy + * + * Useful for JITs to dump new code blocks into unused regions of RX memory. + */ +noinstr void *aarch64_insn_copy(void *dst, void *src, size_t len) +{ + /* A64 instructions must be word aligned */ + if ((uintptr_t)dst & 0x3) + return NULL; + + return __text_poke(text_poke_memcpy, dst, src, len); +} + +/** + * aarch64_insn_set - memset for RX memory regions. + * @dst: address to modify + * @insn: value to set + * @len: length of memory region. + * + * Useful for JITs to fill regions of RX memory with illegal instructions. 
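A typical consumer of aarch64_insn_copy()/aarch64_insn_set() is a JIT publishing code into a read-only-executable region. A hedged sketch of such a caller (modelled loosely on the arm64 BPF JIT, not copied from it; the function name is made up, and it assumes <asm/insn.h>, <asm/debug-monitors.h> and <asm/text-patching.h> are available):

static int jit_publish(void *image, size_t image_len, void *code, size_t code_len)
{
	/* Poison the whole region with guaranteed-faulting instructions first. */
	if (!aarch64_insn_set(image, AARCH64_BREAK_FAULT, image_len))
		return -EINVAL;

	/* Then copy the generated instructions over the start of it. */
	if (!aarch64_insn_copy(image, code, code_len))
		return -EINVAL;

	return 0;
}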
+ */ +noinstr void *aarch64_insn_set(void *dst, u32 insn, size_t len) +{ + if ((uintptr_t)dst & 0x3) + return NULL; + + return __text_poke(text_poke_memset, dst, &insn, len); +} + +int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) +{ + u32 *tp = addr; + int ret; + + /* A64 instructions must be word aligned */ + if ((uintptr_t)tp & 0x3) + return -EINVAL; + + ret = aarch64_insn_write(tp, insn); + if (ret == 0) + caches_clean_inval_pou((uintptr_t)tp, + (uintptr_t)tp + AARCH64_INSN_SIZE); + + return ret; +} + +struct aarch64_insn_patch { + void **text_addrs; + u32 *new_insns; + int insn_cnt; + atomic_t cpu_count; +}; + +static int __kprobes aarch64_insn_patch_text_cb(void *arg) +{ + int i, ret = 0; + struct aarch64_insn_patch *pp = arg; + + /* The last CPU becomes master */ + if (atomic_inc_return(&pp->cpu_count) == num_online_cpus()) { + for (i = 0; ret == 0 && i < pp->insn_cnt; i++) + ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], + pp->new_insns[i]); + /* Notify other processors with an additional increment. */ + atomic_inc(&pp->cpu_count); + } else { + while (atomic_read(&pp->cpu_count) <= num_online_cpus()) + cpu_relax(); + isb(); + } + + return ret; +} + +int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) +{ + struct aarch64_insn_patch patch = { + .text_addrs = addrs, + .new_insns = insns, + .insn_cnt = cnt, + .cpu_count = ATOMIC_INIT(0), + }; + + if (cnt <= 0) + return -EINVAL; + + return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, + cpu_online_mask); +} diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index bb85e2f4603f..fd9a7bed83ce 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -1,39 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Code borrowed from powerpc/kernel/pci-common.c * * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM * Copyright (C) 2014 ARM Ltd. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. - * */ -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/of_pci.h> -#include <linux/of_platform.h> #include <linux/pci.h> -#include <linux/pci-acpi.h> -#include <linux/pci-ecam.h> -#include <linux/slab.h> - -#ifdef CONFIG_ACPI -/* - * Try to assign the IRQ number when probing a new device - */ -int pcibios_alloc_irq(struct pci_dev *dev) -{ - if (!acpi_disabled) - acpi_pci_irq_enable(dev); - - return 0; -} -#endif /* * raw_pci_read/write - Platform-specific PCI config space access. 
@@ -67,149 +40,3 @@ int pcibus_to_node(struct pci_bus *bus) EXPORT_SYMBOL(pcibus_to_node); #endif - -#ifdef CONFIG_ACPI - -struct acpi_pci_generic_root_info { - struct acpi_pci_root_info common; - struct pci_config_window *cfg; /* config space mapping */ -}; - -int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) -{ - struct pci_config_window *cfg = bus->sysdata; - struct acpi_device *adev = to_acpi_device(cfg->parent); - struct acpi_pci_root *root = acpi_driver_data(adev); - - return root->segment; -} - -int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) -{ - if (!acpi_disabled) { - struct pci_config_window *cfg = bridge->bus->sysdata; - struct acpi_device *adev = to_acpi_device(cfg->parent); - struct device *bus_dev = &bridge->bus->dev; - - ACPI_COMPANION_SET(&bridge->dev, adev); - set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); - } - - return 0; -} - -static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) -{ - struct resource_entry *entry, *tmp; - int status; - - status = acpi_pci_probe_root_resources(ci); - resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { - if (!(entry->res->flags & IORESOURCE_WINDOW)) - resource_list_destroy_entry(entry); - } - return status; -} - -/* - * Lookup the bus range for the domain in MCFG, and set up config space - * mapping. - */ -static struct pci_config_window * -pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) -{ - struct device *dev = &root->device->dev; - struct resource *bus_res = &root->secondary; - u16 seg = root->segment; - struct pci_ecam_ops *ecam_ops; - struct resource cfgres; - struct acpi_device *adev; - struct pci_config_window *cfg; - int ret; - - ret = pci_mcfg_lookup(root, &cfgres, &ecam_ops); - if (ret) { - dev_err(dev, "%04x:%pR ECAM region not found\n", seg, bus_res); - return NULL; - } - - adev = acpi_resource_consumer(&cfgres); - if (adev) - dev_info(dev, "ECAM area %pR reserved by %s\n", &cfgres, - dev_name(&adev->dev)); - else - dev_warn(dev, FW_BUG "ECAM area %pR not reserved in ACPI namespace\n", - &cfgres); - - cfg = pci_ecam_create(dev, &cfgres, bus_res, ecam_ops); - if (IS_ERR(cfg)) { - dev_err(dev, "%04x:%pR error %ld mapping ECAM\n", seg, bus_res, - PTR_ERR(cfg)); - return NULL; - } - - return cfg; -} - -/* release_info: free resources allocated by init_info */ -static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci) -{ - struct acpi_pci_generic_root_info *ri; - - ri = container_of(ci, struct acpi_pci_generic_root_info, common); - pci_ecam_free(ri->cfg); - kfree(ci->ops); - kfree(ri); -} - -/* Interface called from ACPI code to setup PCI host controller */ -struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) -{ - struct acpi_pci_generic_root_info *ri; - struct pci_bus *bus, *child; - struct acpi_pci_root_ops *root_ops; - - ri = kzalloc(sizeof(*ri), GFP_KERNEL); - if (!ri) - return NULL; - - root_ops = kzalloc(sizeof(*root_ops), GFP_KERNEL); - if (!root_ops) { - kfree(ri); - return NULL; - } - - ri->cfg = pci_acpi_setup_ecam_mapping(root); - if (!ri->cfg) { - kfree(ri); - kfree(root_ops); - return NULL; - } - - root_ops->release_info = pci_acpi_generic_release_info; - root_ops->prepare_resources = pci_acpi_root_prepare_resources; - root_ops->pci_ops = &ri->cfg->ops->pci_ops; - bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); - if (!bus) - return NULL; - - pci_bus_size_bridges(bus); - pci_bus_assign_resources(bus); - - list_for_each_entry(child, &bus->children, node) - pcie_bus_configure_settings(child); - - return 
bus; -} - -void pcibios_add_bus(struct pci_bus *bus) -{ - acpi_pci_add_bus(bus); -} - -void pcibios_remove_bus(struct pci_bus *bus) -{ - acpi_pci_remove_bus(bus); -} - -#endif diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 61d983f5756f..9b7f26b128b5 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -1,202 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arm64 callchain support * * Copyright (C) 2015 ARM Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/perf_event.h> +#include <linux/stacktrace.h> #include <linux/uaccess.h> #include <asm/pointer_auth.h> -#include <asm/stacktrace.h> - -struct frame_tail { - struct frame_tail __user *fp; - unsigned long lr; -} __attribute__((packed)); -/* - * Get the return address for a single stackframe and return a pointer to the - * next frame tail. - */ -static struct frame_tail __user * -user_backtrace(struct frame_tail __user *tail, - struct perf_callchain_entry_ctx *entry) +static bool callchain_trace(void *data, unsigned long pc) { - struct frame_tail buftail; - unsigned long err; - unsigned long lr; - - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(tail, sizeof(buftail))) - return NULL; - - pagefault_disable(); - err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); - pagefault_enable(); - - if (err) - return NULL; - - lr = ptrauth_strip_insn_pac(buftail.lr); - - perf_callchain_store(entry, lr); - - /* - * Frame pointers should strictly progress back up the stack - * (towards higher addresses). - */ - if (tail >= buftail.fp) - return NULL; - - return buftail.fp; -} - -#ifdef CONFIG_COMPAT -/* - * The registers we're interested in are at the end of the variable - * length saved register structure. The fp points at the end of this - * structure so the address of this struct is: - * (struct compat_frame_tail *)(xxx->fp)-1 - * - * This code has been adapted from the ARM OProfile support. - */ -struct compat_frame_tail { - compat_uptr_t fp; /* a (struct compat_frame_tail *) in compat mode */ - u32 sp; - u32 lr; -} __attribute__((packed)); - -static struct compat_frame_tail __user * -compat_user_backtrace(struct compat_frame_tail __user *tail, - struct perf_callchain_entry_ctx *entry) -{ - struct compat_frame_tail buftail; - unsigned long err; - - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(tail, sizeof(buftail))) - return NULL; - - pagefault_disable(); - err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); - pagefault_enable(); - - if (err) - return NULL; - - perf_callchain_store(entry, buftail.lr); - - /* - * Frame pointers should strictly progress back up the stack - * (towards higher addresses). 
- */ - if (tail + 1 >= (struct compat_frame_tail __user *) - compat_ptr(buftail.fp)) - return NULL; + struct perf_callchain_entry_ctx *entry = data; - return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; + return perf_callchain_store(entry, pc) == 0; } -#endif /* CONFIG_COMPAT */ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* We don't support guest os callchain now */ return; } - perf_callchain_store(entry, regs->pc); - - if (!compat_user_mode(regs)) { - /* AARCH64 mode */ - struct frame_tail __user *tail; - - tail = (struct frame_tail __user *)regs->regs[29]; - - while (entry->nr < entry->max_stack && - tail && !((unsigned long)tail & 0xf)) - tail = user_backtrace(tail, entry); - } else { -#ifdef CONFIG_COMPAT - /* AARCH32 compat mode */ - struct compat_frame_tail __user *tail; - - tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - - while ((entry->nr < entry->max_stack) && - tail && !((unsigned long)tail & 0x3)) - tail = compat_user_backtrace(tail, entry); -#endif - } -} - -/* - * Gets called by walk_stackframe() for every stackframe. This will be called - * whist unwinding the stackframe and is like a subroutine return so we use - * the PC. - */ -static int callchain_trace(struct stackframe *frame, void *data) -{ - struct perf_callchain_entry_ctx *entry = data; - perf_callchain_store(entry, frame->pc); - return 0; + arch_stack_walk_user(callchain_trace, entry, regs); } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct stackframe frame; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* We don't support guest os callchain now */ return; } - frame.fp = regs->regs[29]; - frame.pc = regs->pc; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = 0; -#endif - - walk_stackframe(current, &frame, callchain_trace, entry); -} - -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); - - return instruction_pointer(regs); -} - -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - int misc = 0; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } - - return misc; + arch_stack_walk(callchain_trace, entry, current, regs); } diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c deleted file mode 100644 index 1620a371b1f5..000000000000 --- a/arch/arm64/kernel/perf_event.c +++ /dev/null @@ -1,1206 +0,0 @@ -/* - * ARMv8 PMUv3 Performance Events handling code. - * - * Copyright (C) 2012 ARM Limited - * Author: Will Deacon <will.deacon@arm.com> - * - * This code is based heavily on the ARMv7 perf event code. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <asm/irq_regs.h> -#include <asm/perf_event.h> -#include <asm/sysreg.h> -#include <asm/virt.h> - -#include <linux/acpi.h> -#include <linux/clocksource.h> -#include <linux/of.h> -#include <linux/perf/arm_pmu.h> -#include <linux/platform_device.h> - -/* ARMv8 Cortex-A53 specific event types. */ -#define ARMV8_A53_PERFCTR_PREF_LINEFILL 0xC2 - -/* ARMv8 Cavium ThunderX specific event types. */ -#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_MISS_ST 0xE9 -#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS 0xEA -#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS 0xEB -#define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS 0xEC -#define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS 0xED - -/* - * ARMv8 Architectural defined events, not all of these may - * be supported on any given implementation. Unsupported events will - * be disabled at run-time based on the PMCEID registers. - */ -static const unsigned armv8_pmuv3_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = ARMV8_PMUV3_PERFCTR_STALL_FRONTEND, - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, -}; - -static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL, - [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_TLB, - - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - [C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB, - - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, -}; - -static const unsigned armv8_a53_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_A53_PERFCTR_PREF_LINEFILL, - - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, - [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, -}; - -static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = 
ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, - [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR, - - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, - [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, - [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, -}; - -static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, -}; - -static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, - [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_MISS_ST, - [C(L1D)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS, - [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS, - - [C(L1I)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS, - [C(L1I)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS, - - [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD, - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, - [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR, - [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, -}; - -static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - PERF_CACHE_MAP_ALL_UNSUPPORTED, - - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, - [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR, - - [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD, - [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR, - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, - [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, - [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, -}; - -static ssize_t -armv8pmu_events_sysfs_show(struct device *dev, - struct device_attribute *attr, char *page) -{ - struct perf_pmu_events_attr *pmu_attr; - - pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); - - return sprintf(page, "event=0x%03llx\n", pmu_attr->id); -} - -#define ARMV8_EVENT_ATTR_RESOLVE(m) #m -#define ARMV8_EVENT_ATTR(name, config) \ - PMU_EVENT_ATTR(name, armv8_event_attr_##name, \ - config, armv8pmu_events_sysfs_show) - -ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR); -ARMV8_EVENT_ATTR(l1i_cache_refill, 
ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL); -ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL); -ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL); -ARMV8_EVENT_ATTR(l1d_cache, ARMV8_PMUV3_PERFCTR_L1D_CACHE); -ARMV8_EVENT_ATTR(l1d_tlb_refill, ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL); -ARMV8_EVENT_ATTR(ld_retired, ARMV8_PMUV3_PERFCTR_LD_RETIRED); -ARMV8_EVENT_ATTR(st_retired, ARMV8_PMUV3_PERFCTR_ST_RETIRED); -ARMV8_EVENT_ATTR(inst_retired, ARMV8_PMUV3_PERFCTR_INST_RETIRED); -ARMV8_EVENT_ATTR(exc_taken, ARMV8_PMUV3_PERFCTR_EXC_TAKEN); -ARMV8_EVENT_ATTR(exc_return, ARMV8_PMUV3_PERFCTR_EXC_RETURN); -ARMV8_EVENT_ATTR(cid_write_retired, ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED); -ARMV8_EVENT_ATTR(pc_write_retired, ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED); -ARMV8_EVENT_ATTR(br_immed_retired, ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED); -ARMV8_EVENT_ATTR(br_return_retired, ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED); -ARMV8_EVENT_ATTR(unaligned_ldst_retired, ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED); -ARMV8_EVENT_ATTR(br_mis_pred, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED); -ARMV8_EVENT_ATTR(cpu_cycles, ARMV8_PMUV3_PERFCTR_CPU_CYCLES); -ARMV8_EVENT_ATTR(br_pred, ARMV8_PMUV3_PERFCTR_BR_PRED); -ARMV8_EVENT_ATTR(mem_access, ARMV8_PMUV3_PERFCTR_MEM_ACCESS); -ARMV8_EVENT_ATTR(l1i_cache, ARMV8_PMUV3_PERFCTR_L1I_CACHE); -ARMV8_EVENT_ATTR(l1d_cache_wb, ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB); -ARMV8_EVENT_ATTR(l2d_cache, ARMV8_PMUV3_PERFCTR_L2D_CACHE); -ARMV8_EVENT_ATTR(l2d_cache_refill, ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL); -ARMV8_EVENT_ATTR(l2d_cache_wb, ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB); -ARMV8_EVENT_ATTR(bus_access, ARMV8_PMUV3_PERFCTR_BUS_ACCESS); -ARMV8_EVENT_ATTR(memory_error, ARMV8_PMUV3_PERFCTR_MEMORY_ERROR); -ARMV8_EVENT_ATTR(inst_spec, ARMV8_PMUV3_PERFCTR_INST_SPEC); -ARMV8_EVENT_ATTR(ttbr_write_retired, ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED); -ARMV8_EVENT_ATTR(bus_cycles, ARMV8_PMUV3_PERFCTR_BUS_CYCLES); -/* Don't expose the chain event in /sys, since it's useless in isolation */ -ARMV8_EVENT_ATTR(l1d_cache_allocate, ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE); -ARMV8_EVENT_ATTR(l2d_cache_allocate, ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE); -ARMV8_EVENT_ATTR(br_retired, ARMV8_PMUV3_PERFCTR_BR_RETIRED); -ARMV8_EVENT_ATTR(br_mis_pred_retired, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED); -ARMV8_EVENT_ATTR(stall_frontend, ARMV8_PMUV3_PERFCTR_STALL_FRONTEND); -ARMV8_EVENT_ATTR(stall_backend, ARMV8_PMUV3_PERFCTR_STALL_BACKEND); -ARMV8_EVENT_ATTR(l1d_tlb, ARMV8_PMUV3_PERFCTR_L1D_TLB); -ARMV8_EVENT_ATTR(l1i_tlb, ARMV8_PMUV3_PERFCTR_L1I_TLB); -ARMV8_EVENT_ATTR(l2i_cache, ARMV8_PMUV3_PERFCTR_L2I_CACHE); -ARMV8_EVENT_ATTR(l2i_cache_refill, ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL); -ARMV8_EVENT_ATTR(l3d_cache_allocate, ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE); -ARMV8_EVENT_ATTR(l3d_cache_refill, ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL); -ARMV8_EVENT_ATTR(l3d_cache, ARMV8_PMUV3_PERFCTR_L3D_CACHE); -ARMV8_EVENT_ATTR(l3d_cache_wb, ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB); -ARMV8_EVENT_ATTR(l2d_tlb_refill, ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL); -ARMV8_EVENT_ATTR(l2i_tlb_refill, ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL); -ARMV8_EVENT_ATTR(l2d_tlb, ARMV8_PMUV3_PERFCTR_L2D_TLB); -ARMV8_EVENT_ATTR(l2i_tlb, ARMV8_PMUV3_PERFCTR_L2I_TLB); -ARMV8_EVENT_ATTR(remote_access, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS); -ARMV8_EVENT_ATTR(ll_cache, ARMV8_PMUV3_PERFCTR_LL_CACHE); -ARMV8_EVENT_ATTR(ll_cache_miss, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS); -ARMV8_EVENT_ATTR(dtlb_walk, ARMV8_PMUV3_PERFCTR_DTLB_WALK); -ARMV8_EVENT_ATTR(itlb_walk, 
ARMV8_PMUV3_PERFCTR_ITLB_WALK); -ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD); -ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD); -ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD); -ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP); -ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED); -ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE); -ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION); - -static struct attribute *armv8_pmuv3_event_attrs[] = { - &armv8_event_attr_sw_incr.attr.attr, - &armv8_event_attr_l1i_cache_refill.attr.attr, - &armv8_event_attr_l1i_tlb_refill.attr.attr, - &armv8_event_attr_l1d_cache_refill.attr.attr, - &armv8_event_attr_l1d_cache.attr.attr, - &armv8_event_attr_l1d_tlb_refill.attr.attr, - &armv8_event_attr_ld_retired.attr.attr, - &armv8_event_attr_st_retired.attr.attr, - &armv8_event_attr_inst_retired.attr.attr, - &armv8_event_attr_exc_taken.attr.attr, - &armv8_event_attr_exc_return.attr.attr, - &armv8_event_attr_cid_write_retired.attr.attr, - &armv8_event_attr_pc_write_retired.attr.attr, - &armv8_event_attr_br_immed_retired.attr.attr, - &armv8_event_attr_br_return_retired.attr.attr, - &armv8_event_attr_unaligned_ldst_retired.attr.attr, - &armv8_event_attr_br_mis_pred.attr.attr, - &armv8_event_attr_cpu_cycles.attr.attr, - &armv8_event_attr_br_pred.attr.attr, - &armv8_event_attr_mem_access.attr.attr, - &armv8_event_attr_l1i_cache.attr.attr, - &armv8_event_attr_l1d_cache_wb.attr.attr, - &armv8_event_attr_l2d_cache.attr.attr, - &armv8_event_attr_l2d_cache_refill.attr.attr, - &armv8_event_attr_l2d_cache_wb.attr.attr, - &armv8_event_attr_bus_access.attr.attr, - &armv8_event_attr_memory_error.attr.attr, - &armv8_event_attr_inst_spec.attr.attr, - &armv8_event_attr_ttbr_write_retired.attr.attr, - &armv8_event_attr_bus_cycles.attr.attr, - &armv8_event_attr_l1d_cache_allocate.attr.attr, - &armv8_event_attr_l2d_cache_allocate.attr.attr, - &armv8_event_attr_br_retired.attr.attr, - &armv8_event_attr_br_mis_pred_retired.attr.attr, - &armv8_event_attr_stall_frontend.attr.attr, - &armv8_event_attr_stall_backend.attr.attr, - &armv8_event_attr_l1d_tlb.attr.attr, - &armv8_event_attr_l1i_tlb.attr.attr, - &armv8_event_attr_l2i_cache.attr.attr, - &armv8_event_attr_l2i_cache_refill.attr.attr, - &armv8_event_attr_l3d_cache_allocate.attr.attr, - &armv8_event_attr_l3d_cache_refill.attr.attr, - &armv8_event_attr_l3d_cache.attr.attr, - &armv8_event_attr_l3d_cache_wb.attr.attr, - &armv8_event_attr_l2d_tlb_refill.attr.attr, - &armv8_event_attr_l2i_tlb_refill.attr.attr, - &armv8_event_attr_l2d_tlb.attr.attr, - &armv8_event_attr_l2i_tlb.attr.attr, - &armv8_event_attr_remote_access.attr.attr, - &armv8_event_attr_ll_cache.attr.attr, - &armv8_event_attr_ll_cache_miss.attr.attr, - &armv8_event_attr_dtlb_walk.attr.attr, - &armv8_event_attr_itlb_walk.attr.attr, - &armv8_event_attr_ll_cache_rd.attr.attr, - &armv8_event_attr_ll_cache_miss_rd.attr.attr, - &armv8_event_attr_remote_access_rd.attr.attr, - &armv8_event_attr_sample_pop.attr.attr, - &armv8_event_attr_sample_feed.attr.attr, - &armv8_event_attr_sample_filtrate.attr.attr, - &armv8_event_attr_sample_collision.attr.attr, - NULL, -}; - -static umode_t -armv8pmu_event_attr_is_visible(struct kobject *kobj, - struct attribute *attr, int unused) -{ - struct device *dev = kobj_to_dev(kobj); - struct pmu *pmu = dev_get_drvdata(dev); - struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); - struct perf_pmu_events_attr 
*pmu_attr; - - pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); - - if (pmu_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS && - test_bit(pmu_attr->id, cpu_pmu->pmceid_bitmap)) - return attr->mode; - - pmu_attr->id -= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE; - if (pmu_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS && - test_bit(pmu_attr->id, cpu_pmu->pmceid_ext_bitmap)) - return attr->mode; - - return 0; -} - -static struct attribute_group armv8_pmuv3_events_attr_group = { - .name = "events", - .attrs = armv8_pmuv3_event_attrs, - .is_visible = armv8pmu_event_attr_is_visible, -}; - -PMU_FORMAT_ATTR(event, "config:0-15"); -PMU_FORMAT_ATTR(long, "config1:0"); - -static inline bool armv8pmu_event_is_64bit(struct perf_event *event) -{ - return event->attr.config1 & 0x1; -} - -static struct attribute *armv8_pmuv3_format_attrs[] = { - &format_attr_event.attr, - &format_attr_long.attr, - NULL, -}; - -static struct attribute_group armv8_pmuv3_format_attr_group = { - .name = "format", - .attrs = armv8_pmuv3_format_attrs, -}; - -/* - * Perf Events' indices - */ -#define ARMV8_IDX_CYCLE_COUNTER 0 -#define ARMV8_IDX_COUNTER0 1 -#define ARMV8_IDX_COUNTER_LAST(cpu_pmu) \ - (ARMV8_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1) - -/* - * We must chain two programmable counters for 64 bit events, - * except when we have allocated the 64bit cycle counter (for CPU - * cycles event). This must be called only when the event has - * a counter allocated. - */ -static inline bool armv8pmu_event_is_chained(struct perf_event *event) -{ - int idx = event->hw.idx; - - return !WARN_ON(idx < 0) && - armv8pmu_event_is_64bit(event) && - (idx != ARMV8_IDX_CYCLE_COUNTER); -} - -/* - * ARMv8 low level PMU access - */ - -/* - * Perf Event to low level counters mapping - */ -#define ARMV8_IDX_TO_COUNTER(x) \ - (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) - -static inline u32 armv8pmu_pmcr_read(void) -{ - return read_sysreg(pmcr_el0); -} - -static inline void armv8pmu_pmcr_write(u32 val) -{ - val &= ARMV8_PMU_PMCR_MASK; - isb(); - write_sysreg(val, pmcr_el0); -} - -static inline int armv8pmu_has_overflowed(u32 pmovsr) -{ - return pmovsr & ARMV8_PMU_OVERFLOWED_MASK; -} - -static inline int armv8pmu_counter_valid(struct arm_pmu *cpu_pmu, int idx) -{ - return idx >= ARMV8_IDX_CYCLE_COUNTER && - idx <= ARMV8_IDX_COUNTER_LAST(cpu_pmu); -} - -static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) -{ - return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); -} - -static inline void armv8pmu_select_counter(int idx) -{ - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(counter, pmselr_el0); - isb(); -} - -static inline u32 armv8pmu_read_evcntr(int idx) -{ - armv8pmu_select_counter(idx); - return read_sysreg(pmxevcntr_el0); -} - -static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) -{ - int idx = event->hw.idx; - u64 val = 0; - - val = armv8pmu_read_evcntr(idx); - if (armv8pmu_event_is_chained(event)) - val = (val << 32) | armv8pmu_read_evcntr(idx - 1); - return val; -} - -static inline u64 armv8pmu_read_counter(struct perf_event *event) -{ - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - u64 value = 0; - - if (!armv8pmu_counter_valid(cpu_pmu, idx)) - pr_err("CPU%u reading wrong counter %d\n", - smp_processor_id(), idx); - else if (idx == ARMV8_IDX_CYCLE_COUNTER) - value = read_sysreg(pmccntr_el0); - else - value = armv8pmu_read_hw_counter(event); - - return value; -} - -static inline void armv8pmu_write_evcntr(int idx, u32 value) -{ 
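The chained-counter scheme above reconstructs a 64-bit count from an even/odd pair of 32-bit event counters, and the write path (below) splits the value back out. A standalone user-space sketch of that split/recombine arithmetic, with hypothetical stand-ins for the counter pair rather than the real sysreg accessors:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the even/odd halves of a chained pair. */
static uint32_t evcntr[2];	/* [0] = even (low) counter, [1] = odd (high) counter */

static void chain_write(uint64_t value)
{
	evcntr[1] = (uint32_t)(value >> 32);	/* upper_32_bits() */
	evcntr[0] = (uint32_t)value;		/* lower_32_bits() */
}

static uint64_t chain_read(void)
{
	/* Odd (high) counter in the top half, even (low) counter below. */
	return ((uint64_t)evcntr[1] << 32) | evcntr[0];
}

int main(void)
{
	chain_write(0x123456789abcdef0ULL);
	printf("0x%llx\n", (unsigned long long)chain_read());	/* 0x123456789abcdef0 */
	return 0;
}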
- armv8pmu_select_counter(idx); - write_sysreg(value, pmxevcntr_el0); -} - -static inline void armv8pmu_write_hw_counter(struct perf_event *event, - u64 value) -{ - int idx = event->hw.idx; - - if (armv8pmu_event_is_chained(event)) { - armv8pmu_write_evcntr(idx, upper_32_bits(value)); - armv8pmu_write_evcntr(idx - 1, lower_32_bits(value)); - } else { - armv8pmu_write_evcntr(idx, value); - } -} - -static inline void armv8pmu_write_counter(struct perf_event *event, u64 value) -{ - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - if (!armv8pmu_counter_valid(cpu_pmu, idx)) - pr_err("CPU%u writing wrong counter %d\n", - smp_processor_id(), idx); - else if (idx == ARMV8_IDX_CYCLE_COUNTER) { - /* - * The cycles counter is really a 64-bit counter. - * When treating it as a 32-bit counter, we only count - * the lower 32 bits, and set the upper 32-bits so that - * we get an interrupt upon 32-bit overflow. - */ - if (!armv8pmu_event_is_64bit(event)) - value |= 0xffffffff00000000ULL; - write_sysreg(value, pmccntr_el0); - } else - armv8pmu_write_hw_counter(event, value); -} - -static inline void armv8pmu_write_evtype(int idx, u32 val) -{ - armv8pmu_select_counter(idx); - val &= ARMV8_PMU_EVTYPE_MASK; - write_sysreg(val, pmxevtyper_el0); -} - -static inline void armv8pmu_write_event_type(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - /* - * For chained events, the low counter is programmed to count - * the event of interest and the high counter is programmed - * with CHAIN event code with filters set to count at all ELs. - */ - if (armv8pmu_event_is_chained(event)) { - u32 chain_evt = ARMV8_PMUV3_PERFCTR_CHAIN | - ARMV8_PMU_INCLUDE_EL2; - - armv8pmu_write_evtype(idx - 1, hwc->config_base); - armv8pmu_write_evtype(idx, chain_evt); - } else { - armv8pmu_write_evtype(idx, hwc->config_base); - } -} - -static inline int armv8pmu_enable_counter(int idx) -{ - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmcntenset_el0); - return idx; -} - -static inline void armv8pmu_enable_event_counter(struct perf_event *event) -{ - int idx = event->hw.idx; - - armv8pmu_enable_counter(idx); - if (armv8pmu_event_is_chained(event)) - armv8pmu_enable_counter(idx - 1); - isb(); -} - -static inline int armv8pmu_disable_counter(int idx) -{ - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmcntenclr_el0); - return idx; -} - -static inline void armv8pmu_disable_event_counter(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - if (armv8pmu_event_is_chained(event)) - armv8pmu_disable_counter(idx - 1); - armv8pmu_disable_counter(idx); -} - -static inline int armv8pmu_enable_intens(int idx) -{ - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmintenset_el1); - return idx; -} - -static inline int armv8pmu_enable_event_irq(struct perf_event *event) -{ - return armv8pmu_enable_intens(event->hw.idx); -} - -static inline int armv8pmu_disable_intens(int idx) -{ - u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(BIT(counter), pmintenclr_el1); - isb(); - /* Clear the overflow flag in case an interrupt is pending. 
*/ - write_sysreg(BIT(counter), pmovsclr_el0); - isb(); - - return idx; -} - -static inline int armv8pmu_disable_event_irq(struct perf_event *event) -{ - return armv8pmu_disable_intens(event->hw.idx); -} - -static inline u32 armv8pmu_getreset_flags(void) -{ - u32 value; - - /* Read */ - value = read_sysreg(pmovsclr_el0); - - /* Write to clear flags */ - value &= ARMV8_PMU_OVSR_MASK; - write_sysreg(value, pmovsclr_el0); - - return value; -} - -static void armv8pmu_enable_event(struct perf_event *event) -{ - unsigned long flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - /* - * Enable counter and interrupt, and set the counter to count - * the event that we're interested in. - */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); - - /* - * Disable counter - */ - armv8pmu_disable_event_counter(event); - - /* - * Set event (if destined for PMNx counters). - */ - armv8pmu_write_event_type(event); - - /* - * Enable interrupt for this counter - */ - armv8pmu_enable_event_irq(event); - - /* - * Enable counter - */ - armv8pmu_enable_event_counter(event); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); -} - -static void armv8pmu_disable_event(struct perf_event *event) -{ - unsigned long flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - /* - * Disable counter and interrupt - */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); - - /* - * Disable counter - */ - armv8pmu_disable_event_counter(event); - - /* - * Disable interrupt for this counter - */ - armv8pmu_disable_event_irq(event); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); -} - -static void armv8pmu_start(struct arm_pmu *cpu_pmu) -{ - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* Enable all counters */ - armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); -} - -static void armv8pmu_stop(struct arm_pmu *cpu_pmu) -{ - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* Disable all counters */ - armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); -} - -static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) -{ - u32 pmovsr; - struct perf_sample_data data; - struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events); - struct pt_regs *regs; - int idx; - - /* - * Get and reset the IRQ flags - */ - pmovsr = armv8pmu_getreset_flags(); - - /* - * Did an overflow occur? - */ - if (!armv8pmu_has_overflowed(pmovsr)) - return IRQ_NONE; - - /* - * Handle the counter(s) overflow(s) - */ - regs = get_irq_regs(); - - /* - * Stop the PMU while processing the counter overflows - * to prevent skews in group events. - */ - armv8pmu_stop(cpu_pmu); - for (idx = 0; idx < cpu_pmu->num_events; ++idx) { - struct perf_event *event = cpuc->events[idx]; - struct hw_perf_event *hwc; - - /* Ignore if we don't have an event. */ - if (!event) - continue; - - /* - * We have a single interrupt for all counters. Check that - * each counter has overflowed before we process it. 
- */ - if (!armv8pmu_counter_has_overflowed(pmovsr, idx)) - continue; - - hwc = &event->hw; - armpmu_event_update(event); - perf_sample_data_init(&data, 0, hwc->last_period); - if (!armpmu_event_set_period(event)) - continue; - - if (perf_event_overflow(event, &data, regs)) - cpu_pmu->disable(event); - } - armv8pmu_start(cpu_pmu); - - /* - * Handle the pending perf events. - * - * Note: this call *must* be run with interrupts disabled. For - * platforms that can have the PMU interrupts raised as an NMI, this - * will not work. - */ - irq_work_run(); - - return IRQ_HANDLED; -} - -static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc, - struct arm_pmu *cpu_pmu) -{ - int idx; - - for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx ++) { - if (!test_and_set_bit(idx, cpuc->used_mask)) - return idx; - } - return -EAGAIN; -} - -static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, - struct arm_pmu *cpu_pmu) -{ - int idx; - - /* - * Chaining requires two consecutive event counters, where - * the lower idx must be even. - */ - for (idx = ARMV8_IDX_COUNTER0 + 1; idx < cpu_pmu->num_events; idx += 2) { - if (!test_and_set_bit(idx, cpuc->used_mask)) { - /* Check if the preceding even counter is available */ - if (!test_and_set_bit(idx - 1, cpuc->used_mask)) - return idx; - /* Release the Odd counter */ - clear_bit(idx, cpuc->used_mask); - } - } - return -EAGAIN; -} - -static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, - struct perf_event *event) -{ - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct hw_perf_event *hwc = &event->hw; - unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; - - /* Always prefer to place a cycle counter into the cycle counter. */ - if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) { - if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask)) - return ARMV8_IDX_CYCLE_COUNTER; - } - - /* - * Otherwise use events counters - */ - if (armv8pmu_event_is_64bit(event)) - return armv8pmu_get_chain_idx(cpuc, cpu_pmu); - else - return armv8pmu_get_single_idx(cpuc, cpu_pmu); -} - -static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc, - struct perf_event *event) -{ - int idx = event->hw.idx; - - clear_bit(idx, cpuc->used_mask); - if (armv8pmu_event_is_chained(event)) - clear_bit(idx - 1, cpuc->used_mask); -} - -/* - * Add an event filter to a given event. This will only work for PMUv2 PMUs. - */ -static int armv8pmu_set_event_filter(struct hw_perf_event *event, - struct perf_event_attr *attr) -{ - unsigned long config_base = 0; - - if (attr->exclude_idle) - return -EPERM; - - /* - * If we're running in hyp mode, then we *are* the hypervisor. - * Therefore we ignore exclude_hv in this configuration, since - * there's no hypervisor to sample anyway. This is consistent - * with other architectures (x86 and Power). - */ - if (is_kernel_in_hyp_mode()) { - if (!attr->exclude_kernel) - config_base |= ARMV8_PMU_INCLUDE_EL2; - } else { - if (attr->exclude_kernel) - config_base |= ARMV8_PMU_EXCLUDE_EL1; - if (!attr->exclude_hv) - config_base |= ARMV8_PMU_INCLUDE_EL2; - } - if (attr->exclude_user) - config_base |= ARMV8_PMU_EXCLUDE_EL0; - - /* - * Install the filter into config_base as this is used to - * construct the event type. 
- */ - event->config_base = config_base; - - return 0; -} - -static int armv8pmu_filter_match(struct perf_event *event) -{ - unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT; - return evtype != ARMV8_PMUV3_PERFCTR_CHAIN; -} - -static void armv8pmu_reset(void *info) -{ - struct arm_pmu *cpu_pmu = (struct arm_pmu *)info; - u32 idx, nb_cnt = cpu_pmu->num_events; - - /* The counter and interrupt enable registers are unknown at reset. */ - for (idx = ARMV8_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) { - armv8pmu_disable_counter(idx); - armv8pmu_disable_intens(idx); - } - - /* - * Initialize & Reset PMNC. Request overflow interrupt for - * 64 bit cycle counter but cheat in armv8pmu_write_counter(). - */ - armv8pmu_pmcr_write(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C | - ARMV8_PMU_PMCR_LC); -} - -static int __armv8_pmuv3_map_event(struct perf_event *event, - const unsigned (*extra_event_map) - [PERF_COUNT_HW_MAX], - const unsigned (*extra_cache_map) - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]) -{ - int hw_event_id; - struct arm_pmu *armpmu = to_arm_pmu(event->pmu); - - hw_event_id = armpmu_map_event(event, &armv8_pmuv3_perf_map, - &armv8_pmuv3_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); - - if (armv8pmu_event_is_64bit(event)) - event->hw.flags |= ARMPMU_EVT_64BIT; - - /* Only expose micro/arch events supported by this PMU */ - if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS) - && test_bit(hw_event_id, armpmu->pmceid_bitmap)) { - return hw_event_id; - } - - return armpmu_map_event(event, extra_event_map, extra_cache_map, - ARMV8_PMU_EVTYPE_EVENT); -} - -static int armv8_pmuv3_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, NULL); -} - -static int armv8_a53_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, &armv8_a53_perf_cache_map); -} - -static int armv8_a57_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, &armv8_a57_perf_cache_map); -} - -static int armv8_a73_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, &armv8_a73_perf_cache_map); -} - -static int armv8_thunder_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, - &armv8_thunder_perf_cache_map); -} - -static int armv8_vulcan_map_event(struct perf_event *event) -{ - return __armv8_pmuv3_map_event(event, NULL, - &armv8_vulcan_perf_cache_map); -} - -struct armv8pmu_probe_info { - struct arm_pmu *pmu; - bool present; -}; - -static void __armv8pmu_probe_pmu(void *info) -{ - struct armv8pmu_probe_info *probe = info; - struct arm_pmu *cpu_pmu = probe->pmu; - u64 dfr0; - u64 pmceid_raw[2]; - u32 pmceid[2]; - int pmuver; - - dfr0 = read_sysreg(id_aa64dfr0_el1); - pmuver = cpuid_feature_extract_unsigned_field(dfr0, - ID_AA64DFR0_PMUVER_SHIFT); - if (pmuver == 0xf || pmuver == 0) - return; - - probe->present = true; - - /* Read the nb of CNTx counters supported from PMNC */ - cpu_pmu->num_events = (armv8pmu_pmcr_read() >> ARMV8_PMU_PMCR_N_SHIFT) - & ARMV8_PMU_PMCR_N_MASK; - - /* Add the CPU cycles counter */ - cpu_pmu->num_events += 1; - - pmceid[0] = pmceid_raw[0] = read_sysreg(pmceid0_el0); - pmceid[1] = pmceid_raw[1] = read_sysreg(pmceid1_el0); - - bitmap_from_arr32(cpu_pmu->pmceid_bitmap, - pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS); - - pmceid[0] = pmceid_raw[0] >> 32; - pmceid[1] = pmceid_raw[1] >> 32; - - bitmap_from_arr32(cpu_pmu->pmceid_ext_bitmap, - pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS); -} 
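The probe above derives the number of event counters from PMCR_EL0.N (bits [15:11]) and treats an ID_AA64DFR0_EL1.PMUVer field of 0 or 0xf as "no PMUv3". A minimal sketch of that field extraction on raw register values (the sample values here are made up for illustration):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* ARMv8: PMCR_EL0.N is bits [15:11]; ID_AA64DFR0_EL1.PMUVer is bits [11:8]. */
#define PMCR_N(pmcr)		(((pmcr) >> 11) & 0x1f)
#define DFR0_PMUVER(dfr0)	(((dfr0) >> 8) & 0xf)

static bool pmuv3_present(uint64_t dfr0)
{
	unsigned int ver = DFR0_PMUVER(dfr0);

	/* 0x0: no PMU at all, 0xf: IMPLEMENTATION DEFINED PMU (not PMUv3). */
	return ver != 0 && ver != 0xf;
}

int main(void)
{
	uint64_t pmcr = 0x3040;	/* made-up value: N = 6 */
	uint64_t dfr0 = 0x100;	/* made-up value: PMUVer = 1 */

	if (pmuv3_present(dfr0))
		/* +1 accounts for the dedicated cycle counter. */
		printf("%u programmable + 1 cycle counter\n", (unsigned)PMCR_N(pmcr));
	return 0;
}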
- -static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu) -{ - struct armv8pmu_probe_info probe = { - .pmu = cpu_pmu, - .present = false, - }; - int ret; - - ret = smp_call_function_any(&cpu_pmu->supported_cpus, - __armv8pmu_probe_pmu, - &probe, 1); - if (ret) - return ret; - - return probe.present ? 0 : -ENODEV; -} - -static int armv8_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8pmu_probe_pmu(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->handle_irq = armv8pmu_handle_irq; - cpu_pmu->enable = armv8pmu_enable_event; - cpu_pmu->disable = armv8pmu_disable_event; - cpu_pmu->read_counter = armv8pmu_read_counter; - cpu_pmu->write_counter = armv8pmu_write_counter; - cpu_pmu->get_event_idx = armv8pmu_get_event_idx; - cpu_pmu->clear_event_idx = armv8pmu_clear_event_idx; - cpu_pmu->start = armv8pmu_start; - cpu_pmu->stop = armv8pmu_stop; - cpu_pmu->reset = armv8pmu_reset; - cpu_pmu->set_event_filter = armv8pmu_set_event_filter; - cpu_pmu->filter_match = armv8pmu_filter_match; - - return 0; -} - -static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_pmuv3"; - cpu_pmu->map_event = armv8_pmuv3_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a35"; - cpu_pmu->map_event = armv8_a53_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a53"; - cpu_pmu->map_event = armv8_a53_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a57"; - cpu_pmu->map_event = armv8_a57_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a72"; - cpu_pmu->map_event = armv8_a57_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cortex_a73"; - cpu_pmu->map_event = armv8_a73_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_cavium_thunder"; - 
cpu_pmu->map_event = armv8_thunder_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu) -{ - int ret = armv8_pmu_init(cpu_pmu); - if (ret) - return ret; - - cpu_pmu->name = "armv8_brcm_vulcan"; - cpu_pmu->map_event = armv8_vulcan_map_event; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = - &armv8_pmuv3_events_attr_group; - cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = - &armv8_pmuv3_format_attr_group; - - return 0; -} - -static const struct of_device_id armv8_pmu_of_device_ids[] = { - {.compatible = "arm,armv8-pmuv3", .data = armv8_pmuv3_init}, - {.compatible = "arm,cortex-a35-pmu", .data = armv8_a35_pmu_init}, - {.compatible = "arm,cortex-a53-pmu", .data = armv8_a53_pmu_init}, - {.compatible = "arm,cortex-a57-pmu", .data = armv8_a57_pmu_init}, - {.compatible = "arm,cortex-a72-pmu", .data = armv8_a72_pmu_init}, - {.compatible = "arm,cortex-a73-pmu", .data = armv8_a73_pmu_init}, - {.compatible = "cavium,thunder-pmu", .data = armv8_thunder_pmu_init}, - {.compatible = "brcm,vulcan-pmu", .data = armv8_vulcan_pmu_init}, - {}, -}; - -static int armv8_pmu_device_probe(struct platform_device *pdev) -{ - return arm_pmu_device_probe(pdev, armv8_pmu_of_device_ids, NULL); -} - -static struct platform_driver armv8_pmu_driver = { - .driver = { - .name = ARMV8_PMU_PDEV_NAME, - .of_match_table = armv8_pmu_of_device_ids, - .suppress_bind_attrs = true, - }, - .probe = armv8_pmu_device_probe, -}; - -static int __init armv8_pmu_driver_init(void) -{ - if (acpi_disabled) - return platform_driver_register(&armv8_pmu_driver); - else - return arm_pmu_acpi_probe(armv8_pmuv3_init); -} -device_initcall(armv8_pmu_driver_init) - -void arch_perf_update_userpage(struct perf_event *event, - struct perf_event_mmap_page *userpg, u64 now) -{ - u32 freq; - u32 shift; - - /* - * Internal timekeeping for enabled/running/stopped times - * is always computed with the sched_clock. - */ - freq = arch_timer_get_rate(); - userpg->cap_user_time = 1; - - clocks_calc_mult_shift(&userpg->time_mult, &shift, freq, - NSEC_PER_SEC, 0); - /* - * time_shift is not expected to be greater than 31 due to - * the original published conversion algorithm shifting a - * 32-bit value (now specifies a 64-bit value) - refer - * perf_event_mmap_page documentation in perf_event.h. - */ - if (shift == 32) { - shift = 31; - userpg->time_mult >>= 1; - } - userpg->time_shift = (u16)shift; - userpg->time_offset = -now; -} diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index 0bbac612146e..b4eece3eb17d 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -9,21 +9,58 @@ #include <asm/perf_regs.h> #include <asm/ptrace.h> +static u64 perf_ext_regs_value(int idx) +{ + switch (idx) { + case PERF_REG_ARM64_VG: + if (WARN_ON_ONCE(!system_supports_sve())) + return 0; + + /* + * Vector granule is current length in bits of SVE registers + * divided by 64. + */ + return (task_get_sve_vl(current) * 8) / 64; + default: + WARN_ON_ONCE(true); + return 0; + } +} + u64 perf_reg_value(struct pt_regs *regs, int idx) { - if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_MAX)) + if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_EXTENDED_MAX)) return 0; /* - * Compat (i.e. 32 bit) mode: - * - PC has been set in the pt_regs struct in kernel_entry, - * - Handle SP and LR here. 
+ * Our handling of compat tasks (PERF_SAMPLE_REGS_ABI_32) is weird, but + * we're stuck with it for ABI compatibility reasons. + * + * For a 32-bit consumer inspecting a 32-bit task, then it will look at + * the first 16 registers (see arch/arm/include/uapi/asm/perf_regs.h). + * These correspond directly to a prefix of the registers saved in our + * 'struct pt_regs', with the exception of the PC, so we copy that down + * (x15 corresponds to SP_hyp in the architecture). + * + * So far, so good. + * + * The oddity arises when a 64-bit consumer looks at a 32-bit task and + * asks for registers beyond PERF_REG_ARM_MAX. In this case, we return + * SP_usr, LR_usr and PC in the positions where the AArch64 SP, LR and + * PC registers would normally live. The initial idea was to allow a + * 64-bit unwinder to unwind a 32-bit task and, although it's not clear + * how well that works in practice, somebody might be relying on it. + * + * At the time we make a sample, we don't know whether the consumer is + * 32-bit or 64-bit, so we have to cater for both possibilities. */ if (compat_user_mode(regs)) { if ((u32)idx == PERF_REG_ARM64_SP) return regs->compat_sp; if ((u32)idx == PERF_REG_ARM64_LR) return regs->compat_lr; + if (idx == 15) + return regs->pc; } if ((u32)idx == PERF_REG_ARM64_SP) @@ -32,6 +69,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) if ((u32)idx == PERF_REG_ARM64_PC) return regs->pc; + if ((u32)idx >= PERF_REG_ARM64_MAX) + return perf_ext_regs_value(idx); + return regs->regs[idx]; } @@ -39,7 +79,12 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) + u64 reserved_mask = REG_RESERVED; + + if (system_supports_sve()) + reserved_mask &= ~(1ULL << PERF_REG_ARM64_VG); + + if (!mask || mask & reserved_mask) return -EINVAL; return 0; @@ -54,8 +99,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/arm64/kernel/pi/.gitignore b/arch/arm64/kernel/pi/.gitignore new file mode 100644 index 000000000000..efb29b663e85 --- /dev/null +++ b/arch/arm64/kernel/pi/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +relacheck diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile new file mode 100644 index 000000000000..be92d73c25b2 --- /dev/null +++ b/arch/arm64/kernel/pi/Makefile @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright 2022 Google LLC + +KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ + -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_KSTACK_ERASE) \ + $(DISABLE_LATENT_ENTROPY_PLUGIN) \ + $(call cc-option,-mbranch-protection=none) \ + -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \ + -include $(srctree)/include/linux/hidden.h \ + -D__DISABLE_EXPORTS -ffreestanding -D__NO_FORTIFY \ + -fno-asynchronous-unwind-tables -fno-unwind-tables \ + $(call cc-option,-fno-addrsig) + +# this code may run with the MMU off so disable unaligned accesses +CFLAGS_map_range.o += -mstrict-align + +# remove SCS flags from all objects in this directory +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) +# disable LTO +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS)) + +hostprogs := relacheck + +quiet_cmd_piobjcopy = $(quiet_cmd_objcopy) + cmd_piobjcopy = $(cmd_objcopy) && $(obj)/relacheck $(@) $(<) + 
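The new VG pseudo-register in perf_regs.c above reports the current SVE vector length in 64-bit granules, starting from a length expressed in bytes (as task_get_sve_vl() returns it). A quick sketch of that conversion:

#include <stdint.h>
#include <stdio.h>

/* VG = vector length in bits / 64, with the length supplied in bytes. */
static uint64_t sve_vg_from_vl(unsigned int vl_bytes)
{
	return (vl_bytes * 8) / 64;	/* equivalently vl_bytes / 8 */
}

int main(void)
{
	/* e.g. a 256-bit SVE implementation has VL = 32 bytes, so VG = 4. */
	printf("VG = %llu\n", (unsigned long long)sve_vg_from_vl(32));
	return 0;
}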
+$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \ + --remove-section=.note.gnu.property +$(obj)/%.pi.o: $(obj)/%.o $(obj)/relacheck FORCE + $(call if_changed,piobjcopy) + +# ensure that all the lib- code ends up as __init code and data +$(obj)/lib-%.pi.o: OBJCOPYFLAGS += --prefix-alloc-sections=.init + +$(obj)/lib-%.o: $(srctree)/lib/%.c FORCE + $(call if_changed_rule,cc_o_c) + +obj-y := idreg-override.pi.o \ + map_kernel.pi.o map_range.pi.o \ + lib-fdt.pi.o lib-fdt_ro.pi.o +obj-$(CONFIG_RELOCATABLE) += relocate.pi.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o +obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.pi.o +targets := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c new file mode 100644 index 000000000000..bc57b290e5e7 --- /dev/null +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Early cpufeature override framework + * + * Copyright (C) 2020 Google LLC + * Author: Marc Zyngier <maz@kernel.org> + */ + +#include <linux/ctype.h> +#include <linux/kernel.h> +#include <linux/libfdt.h> + +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/setup.h> + +#include "pi.h" + +#define FTR_DESC_NAME_LEN 20 +#define FTR_DESC_FIELD_LEN 10 +#define FTR_ALIAS_NAME_LEN 30 +#define FTR_ALIAS_OPTION_LEN 116 + +static u64 __boot_status __initdata; + +typedef bool filter_t(u64 val); + +struct ftr_set_desc { + char name[FTR_DESC_NAME_LEN]; + PREL64(struct arm64_ftr_override, override); + struct { + char name[FTR_DESC_FIELD_LEN]; + u8 shift; + u8 width; + PREL64(filter_t, filter); + } fields[]; +}; + +#define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } + +static const struct ftr_set_desc mmfr0 __prel64_initconst = { + .name = "id_aa64mmfr0", + .override = &id_aa64mmfr0_override, + .fields = { + FIELD("ecv", ID_AA64MMFR0_EL1_ECV_SHIFT, NULL), + {} + }, +}; + +static bool __init mmfr1_vh_filter(u64 val) +{ + /* + * If we ever reach this point while running VHE, we're + * guaranteed to be on one of these funky, VHE-stuck CPUs. If + * the user was trying to force nVHE on us, proceed with + * attitude adjustment. + */ + return !(__boot_status == (BOOT_CPU_FLAG_E2H | BOOT_CPU_MODE_EL2) && + val == 0); +} + +static const struct ftr_set_desc mmfr1 __prel64_initconst = { + .name = "id_aa64mmfr1", + .override = &id_aa64mmfr1_override, + .fields = { + FIELD("vh", ID_AA64MMFR1_EL1_VH_SHIFT, mmfr1_vh_filter), + {} + }, +}; + + +static bool __init mmfr2_varange_filter(u64 val) +{ + int __maybe_unused feat; + + if (val) + return false; + +#ifdef CONFIG_ARM64_LPA2 + feat = cpuid_feature_extract_signed_field(read_sysreg(id_aa64mmfr0_el1), + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + if (feat >= ID_AA64MMFR0_EL1_TGRAN_LPA2) { + id_aa64mmfr0_override.val |= + (ID_AA64MMFR0_EL1_TGRAN_LPA2 - 1) << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + id_aa64mmfr0_override.mask |= 0xfU << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + + /* + * Override PARange to 48 bits - the override will just be + * ignored if the actual PARange is smaller, but this is + * unlikely to be the case for LPA2 capable silicon. 
+ */ + id_aa64mmfr0_override.val |= + ID_AA64MMFR0_EL1_PARANGE_48 << ID_AA64MMFR0_EL1_PARANGE_SHIFT; + id_aa64mmfr0_override.mask |= 0xfU << ID_AA64MMFR0_EL1_PARANGE_SHIFT; + } +#endif + return true; +} + +static const struct ftr_set_desc mmfr2 __prel64_initconst = { + .name = "id_aa64mmfr2", + .override = &id_aa64mmfr2_override, + .fields = { + FIELD("varange", ID_AA64MMFR2_EL1_VARange_SHIFT, mmfr2_varange_filter), + {} + }, +}; + +static bool __init pfr0_sve_filter(u64 val) +{ + /* + * Disabling SVE also means disabling all the features that + * are associated with it. The easiest way to do it is just to + * override id_aa64zfr0_el1 to be 0. + */ + if (!val) { + id_aa64zfr0_override.val = 0; + id_aa64zfr0_override.mask = GENMASK(63, 0); + } + + return true; +} + +static const struct ftr_set_desc pfr0 __prel64_initconst = { + .name = "id_aa64pfr0", + .override = &id_aa64pfr0_override, + .fields = { + FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), + FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL), + FIELD("mpam", ID_AA64PFR0_EL1_MPAM_SHIFT, NULL), + {} + }, +}; + +static bool __init pfr1_sme_filter(u64 val) +{ + /* + * Similarly to SVE, disabling SME also means disabling all + * the features that are associated with it. Just set + * id_aa64smfr0_el1 to 0 and don't look back. + */ + if (!val) { + id_aa64smfr0_override.val = 0; + id_aa64smfr0_override.mask = GENMASK(63, 0); + } + + return true; +} + +static const struct ftr_set_desc pfr1 __prel64_initconst = { + .name = "id_aa64pfr1", + .override = &id_aa64pfr1_override, + .fields = { + FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), + FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), + FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), + FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), + FIELD("mpam_frac", ID_AA64PFR1_EL1_MPAM_frac_SHIFT, NULL), + {} + }, +}; + +static const struct ftr_set_desc isar1 __prel64_initconst = { + .name = "id_aa64isar1", + .override = &id_aa64isar1_override, + .fields = { + FIELD("gpi", ID_AA64ISAR1_EL1_GPI_SHIFT, NULL), + FIELD("gpa", ID_AA64ISAR1_EL1_GPA_SHIFT, NULL), + FIELD("api", ID_AA64ISAR1_EL1_API_SHIFT, NULL), + FIELD("apa", ID_AA64ISAR1_EL1_APA_SHIFT, NULL), + {} + }, +}; + +static const struct ftr_set_desc isar2 __prel64_initconst = { + .name = "id_aa64isar2", + .override = &id_aa64isar2_override, + .fields = { + FIELD("gpa3", ID_AA64ISAR2_EL1_GPA3_SHIFT, NULL), + FIELD("apa3", ID_AA64ISAR2_EL1_APA3_SHIFT, NULL), + FIELD("mops", ID_AA64ISAR2_EL1_MOPS_SHIFT, NULL), + {} + }, +}; + +static const struct ftr_set_desc smfr0 __prel64_initconst = { + .name = "id_aa64smfr0", + .override = &id_aa64smfr0_override, + .fields = { + FIELD("smever", ID_AA64SMFR0_EL1_SMEver_SHIFT, NULL), + /* FA64 is a one bit field... 
:-/ */ + { "fa64", ID_AA64SMFR0_EL1_FA64_SHIFT, 1, }, + {} + }, +}; + +static bool __init hvhe_filter(u64 val) +{ + u64 mmfr1 = read_sysreg(id_aa64mmfr1_el1); + + return (val == 1 && + lower_32_bits(__boot_status) == BOOT_CPU_MODE_EL2 && + cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_EL1_VH_SHIFT)); +} + +static const struct ftr_set_desc sw_features __prel64_initconst = { + .name = "arm64_sw", + .override = &arm64_sw_feature_override, + .fields = { + FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), + FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), + FIELD("rodataoff", ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF, NULL), + {} + }, +}; + +static const +PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { + { &mmfr0 }, + { &mmfr1 }, + { &mmfr2 }, + { &pfr0 }, + { &pfr1 }, + { &isar1 }, + { &isar2 }, + { &smfr0 }, + { &sw_features }, +}; + +static const struct { + char alias[FTR_ALIAS_NAME_LEN]; + char feature[FTR_ALIAS_OPTION_LEN]; +} aliases[] __initconst = { + { "kvm_arm.mode=nvhe", "arm64_sw.hvhe=0 id_aa64mmfr1.vh=0" }, + { "kvm_arm.mode=protected", "arm64_sw.hvhe=1" }, + { "arm64.nosve", "id_aa64pfr0.sve=0" }, + { "arm64.nosme", "id_aa64pfr1.sme=0" }, + { "arm64.nobti", "id_aa64pfr1.bt=0" }, + { "arm64.nogcs", "id_aa64pfr1.gcs=0" }, + { "arm64.nopauth", + "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " + "id_aa64isar1.api=0 id_aa64isar1.apa=0 " + "id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0" }, + { "arm64.nomops", "id_aa64isar2.mops=0" }, + { "arm64.nomte", "id_aa64pfr1.mte=0" }, + { "nokaslr", "arm64_sw.nokaslr=1" }, + { "rodata=off", "arm64_sw.rodataoff=1" }, + { "arm64.nolva", "id_aa64mmfr2.varange=0" }, + { "arm64.no32bit_el0", "id_aa64pfr0.el0=1" }, + { "arm64.nompam", "id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" }, +}; + +static int __init parse_hexdigit(const char *p, u64 *v) +{ + // skip "0x" if it comes next + if (p[0] == '0' && tolower(p[1]) == 'x') + p += 2; + + // check whether the RHS is a single hex digit + if (!isxdigit(p[0]) || (p[1] && !isspace(p[1]))) + return -EINVAL; + + *v = tolower(*p) - (isdigit(*p) ? '0' : 'a' - 10); + return 0; +} + +static int __init find_field(const char *cmdline, char *opt, int len, + const struct ftr_set_desc *reg, int f, u64 *v) +{ + int flen = strlen(reg->fields[f].name); + + // append '<fieldname>=' to obtain '<name>.<fieldname>=' + memcpy(opt + len, reg->fields[f].name, flen); + len += flen; + opt[len++] = '='; + + if (memcmp(cmdline, opt, len)) + return -1; + + return parse_hexdigit(cmdline + len, v); +} + +static void __init match_options(const char *cmdline) +{ + char opt[FTR_DESC_NAME_LEN + FTR_DESC_FIELD_LEN + 2]; + int i; + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + const struct ftr_set_desc *reg = prel64_pointer(regs[i].reg); + struct arm64_ftr_override *override; + int len = strlen(reg->name); + int f; + + override = prel64_pointer(reg->override); + + // set opt[] to '<name>.' + memcpy(opt, reg->name, len); + opt[len++] = '.'; + + for (f = 0; reg->fields[f].name[0] != '\0'; f++) { + u64 shift = reg->fields[f].shift; + u64 width = reg->fields[f].width ?: 4; + u64 mask = GENMASK_ULL(shift + width - 1, shift); + bool (*filter)(u64 val); + u64 v; + + if (find_field(cmdline, opt, len, reg, f, &v)) + continue; + + /* + * If an override gets filtered out, advertise + * it by setting the value to the all-ones while + * clearing the mask... Yes, this is fragile. 
+ */ + filter = prel64_pointer(reg->fields[f].filter); + if (filter && !filter(v)) { + override->val |= mask; + override->mask &= ~mask; + continue; + } + + override->val &= ~mask; + override->val |= (v << shift) & mask; + override->mask |= mask; + + return; + } + } +} + +static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) +{ + do { + char buf[256]; + size_t len; + int i; + + cmdline = skip_spaces(cmdline); + + /* terminate on "--" appearing on the command line by itself */ + if (cmdline[0] == '-' && cmdline[1] == '-' && isspace(cmdline[2])) + return; + + for (len = 0; cmdline[len] && !isspace(cmdline[len]); len++) { + if (len >= sizeof(buf) - 1) + break; + if (cmdline[len] == '-') + buf[len] = '_'; + else + buf[len] = cmdline[len]; + } + if (!len) + return; + + buf[len] = 0; + + cmdline += len; + + match_options(buf); + + for (i = 0; parse_aliases && i < ARRAY_SIZE(aliases); i++) + if (!memcmp(buf, aliases[i].alias, len + 1)) + __parse_cmdline(aliases[i].feature, false); + } while (1); +} + +static __init const u8 *get_bootargs_cmdline(const void *fdt, int node) +{ + static char const bootargs[] __initconst = "bootargs"; + const u8 *prop; + + if (node < 0) + return NULL; + + prop = fdt_getprop(fdt, node, bootargs, NULL); + if (!prop) + return NULL; + + return strlen(prop) ? prop : NULL; +} + +static __init void parse_cmdline(const void *fdt, int chosen) +{ + static char const cmdline[] __initconst = CONFIG_CMDLINE; + const u8 *prop = get_bootargs_cmdline(fdt, chosen); + + if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) + __parse_cmdline(cmdline, true); + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) + __parse_cmdline(prop, true); +} + +void __init init_feature_override(u64 boot_status, const void *fdt, + int chosen) +{ + struct arm64_ftr_override *override; + const struct ftr_set_desc *reg; + int i; + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + reg = prel64_pointer(regs[i].reg); + override = prel64_pointer(reg->override); + + override->val = 0; + override->mask = 0; + } + + __boot_status = boot_status; + + parse_cmdline(fdt, chosen); + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + reg = prel64_pointer(regs[i].reg); + override = prel64_pointer(reg->override); + dcache_clean_inval_poc((unsigned long)override, + (unsigned long)(override + 1)); + } +} + +char * __init skip_spaces(const char *str) +{ + while (isspace(*str)) + ++str; + return (char *)str; +} diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c new file mode 100644 index 000000000000..e0e018046a46 --- /dev/null +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2022 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +// NOTE: code in this file runs *very* early, and is not permitted to use +// global variables or anything that relies on absolute addressing. 
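The override parser above only accepts a single hex digit, optionally prefixed with "0x", on the right-hand side of each <register>.<field>= option. A standalone sketch of that rule, mirroring parse_hexdigit() from idreg-override.c:

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>

/* Accept "3" or "0xc"; reject multi-digit values such as "0x1f". */
static int parse_hexdigit(const char *p, uint64_t *v)
{
	/* skip "0x" if it comes next */
	if (p[0] == '0' && tolower((unsigned char)p[1]) == 'x')
		p += 2;

	/* the RHS must be a single hex digit */
	if (!isxdigit((unsigned char)p[0]) || (p[1] && !isspace((unsigned char)p[1])))
		return -1;

	*v = tolower((unsigned char)*p) - (isdigit((unsigned char)*p) ? '0' : 'a' - 10);
	return 0;
}

int main(void)
{
	uint64_t v;

	if (!parse_hexdigit("0xc", &v))
		printf("parsed %llu\n", (unsigned long long)v);	/* prints 12 */
	if (parse_hexdigit("0x1f", &v))
		printf("multi-digit values are rejected\n");
	return 0;
}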
+ +#include <linux/libfdt.h> +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/sizes.h> +#include <linux/string.h> + +#include <asm/archrandom.h> +#include <asm/memory.h> +#include <asm/pgtable.h> + +#include "pi.h" + +static u64 __init get_kaslr_seed(void *fdt, int node) +{ + static char const seed_str[] __initconst = "kaslr-seed"; + fdt64_t *prop; + u64 ret; + int len; + + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, seed_str, &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +u64 __init kaslr_early_init(void *fdt, int chosen) +{ + u64 seed, range; + + if (kaslr_disabled_cmdline()) + return 0; + + seed = get_kaslr_seed(fdt, chosen); + if (!seed) { + if (!__early_cpu_has_rndr() || + !__arm64_rndr((unsigned long *)&seed)) + return 0; + } + + /* + * OK, so we are proceeding with KASLR enabled. Calculate a suitable + * kernel image offset from the seed. Let's place the kernel in the + * 'middle' half of the VMALLOC area, and stay clear of the lower and + * upper quarters to avoid colliding with other allocations. + */ + range = (VMALLOC_END - KIMAGE_VADDR) / 2; + return range / 2 + (((__uint128_t)range * seed) >> 64); +} diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c new file mode 100644 index 000000000000..a852264958c3 --- /dev/null +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/init.h> +#include <linux/libfdt.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/sizes.h> +#include <linux/string.h> + +#include <asm/memory.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> + +#include "pi.h" + +extern const u8 __eh_frame_start[], __eh_frame_end[]; + +extern void idmap_cpu_replace_ttbr1(phys_addr_t pgdir); + +static void __init map_segment(pgd_t *pg_dir, phys_addr_t *pgd, u64 va_offset, + void *start, void *end, pgprot_t prot, + bool may_use_cont, int root_level) +{ + map_range(pgd, ((u64)start + va_offset) & ~PAGE_OFFSET, + ((u64)end + va_offset) & ~PAGE_OFFSET, (u64)start, + prot, root_level, (pte_t *)pg_dir, may_use_cont, 0); +} + +static void __init unmap_segment(pgd_t *pg_dir, u64 va_offset, void *start, + void *end, int root_level) +{ + map_segment(pg_dir, NULL, va_offset, start, end, __pgprot(0), + false, root_level); +} + +static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) +{ + bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS); + bool twopass = IS_ENABLED(CONFIG_RELOCATABLE); + phys_addr_t pgdp = (phys_addr_t)init_pg_dir + PAGE_SIZE; + pgprot_t text_prot = PAGE_KERNEL_ROX; + pgprot_t data_prot = PAGE_KERNEL; + pgprot_t prot; + + /* + * External debuggers may need to write directly to the text mapping to + * install SW breakpoints. Allow this (only) when explicitly requested + * with rodata=off. + */ + if (arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF)) + text_prot = PAGE_KERNEL_EXEC; + + /* + * We only enable the shadow call stack dynamically if we are running + * on a system that does not implement PAC or BTI. PAC and SCS provide + * roughly the same level of protection, and BTI relies on the PACIASP + * instructions serving as landing pads, preventing us from patching + * those instructions into something else. 
+ */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && cpu_has_pac()) + enable_scs = false; + + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && cpu_has_bti()) { + enable_scs = false; + + /* + * If we have a CPU that supports BTI and a kernel built for + * BTI then mark the kernel executable text as guarded pages + * now so we don't have to rewrite the page tables later. + */ + text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP); + } + + /* Map all code read-write on the first pass if needed */ + twopass |= enable_scs; + prot = twopass ? data_prot : text_prot; + + /* + * [_stext, _text) isn't executed after boot and contains some + * non-executable, unpredictable data, so map it non-executable. + */ + map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot, + false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, + !twopass, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, + __inittext_begin, data_prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __inittext_begin, + __inittext_end, prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __initdata_begin, + __initdata_end, data_prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, _data, _end, data_prot, + true, root_level); + dsb(ishst); + + idmap_cpu_replace_ttbr1((phys_addr_t)init_pg_dir); + + if (twopass) { + if (IS_ENABLED(CONFIG_RELOCATABLE)) + relocate_kernel(kaslr_offset); + + if (enable_scs) { + scs_patch(__eh_frame_start + va_offset, + __eh_frame_end - __eh_frame_start, false); + asm("ic ialluis"); + + dynamic_scs_is_enabled = true; + } + + /* + * Unmap the text region before remapping it, to avoid + * potential TLB conflicts when creating the contiguous + * descriptors. + */ + unmap_segment(init_pg_dir, va_offset, _stext, _etext, + root_level); + dsb(ishst); + isb(); + __tlbi(vmalle1); + isb(); + + /* + * Remap these segments with different permissions + * No new page table allocations should be needed + */ + map_segment(init_pg_dir, NULL, va_offset, _stext, _etext, + text_prot, true, root_level); + map_segment(init_pg_dir, NULL, va_offset, __inittext_begin, + __inittext_end, text_prot, false, root_level); + } + + /* Copy the root page table to its final location */ + memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE); + dsb(ishst); + idmap_cpu_replace_ttbr1((phys_addr_t)swapper_pg_dir); +} + +static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(phys_addr_t ttbr) +{ + u64 sctlr = read_sysreg(sctlr_el1); + u64 tcr = read_sysreg(tcr_el1) | TCR_EL1_DS; + u64 mmfr0 = read_sysreg(id_aa64mmfr0_el1); + u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + + tcr &= ~TCR_EL1_IPS_MASK; + tcr |= parange << TCR_EL1_IPS_SHIFT; + + asm(" msr sctlr_el1, %0 ;" + " isb ;" + " msr ttbr0_el1, %1 ;" + " msr tcr_el1, %2 ;" + " isb ;" + " tlbi vmalle1 ;" + " dsb nsh ;" + " isb ;" + " msr sctlr_el1, %3 ;" + " isb ;" + :: "r"(sctlr & ~SCTLR_ELx_M), "r"(ttbr), "r"(tcr), "r"(sctlr)); +} + +static void __init remap_idmap_for_lpa2(void) +{ + /* clear the bits that change meaning once LPA2 is turned on */ + ptdesc_t mask = PTE_SHARED; + + /* + * We have to clear bits [9:8] in all block or page descriptors in the + * initial ID map, as otherwise they will be (mis)interpreted as + * physical address bits once we flick the LPA2 switch (TCR.DS). 
Since + * we cannot manipulate live descriptors in that way without creating + * potential TLB conflicts, let's create another temporary ID map in a + * LPA2 compatible fashion, and update the initial ID map while running + * from that. + */ + create_init_idmap(init_pg_dir, mask); + dsb(ishst); + set_ttbr0_for_lpa2((phys_addr_t)init_pg_dir); + + /* + * Recreate the initial ID map with the same granularity as before. + * Don't bother with the FDT, we no longer need it after this. + */ + memset(init_idmap_pg_dir, 0, + (char *)init_idmap_pg_end - (char *)init_idmap_pg_dir); + + create_init_idmap(init_idmap_pg_dir, mask); + dsb(ishst); + + /* switch back to the updated initial ID map */ + set_ttbr0_for_lpa2((phys_addr_t)init_idmap_pg_dir); + + /* wipe the temporary ID map from memory */ + memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); +} + +static void *__init map_fdt(phys_addr_t fdt) +{ + static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); + phys_addr_t efdt = fdt + MAX_FDT_SIZE; + phys_addr_t ptep = (phys_addr_t)ptes; /* We're idmapped when called */ + + /* + * Map up to MAX_FDT_SIZE bytes, but avoid overlap with + * the kernel image. + */ + map_range(&ptep, fdt, (u64)_text > fdt ? min((u64)_text, efdt) : efdt, + fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL, + (pte_t *)init_idmap_pg_dir, false, 0); + dsb(ishst); + + return (void *)fdt; +} + +/* + * PI version of the Cavium Eratum 27456 detection, which makes it + * impossible to use non-global mappings. + */ +static bool __init ng_mappings_allowed(void) +{ + static const struct midr_range cavium_erratum_27456_cpus[] __initconst = { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), + /* Cavium ThunderX, T81 pass 1.0 */ + MIDR_REV(MIDR_THUNDERX_81XX, 0, 0), + {}, + }; + + for (const struct midr_range *r = cavium_erratum_27456_cpus; r->model; r++) { + if (midr_is_cpu_model_range(read_cpuid_id(), r->model, + r->rv_min, r->rv_max)) + return false; + } + + return true; +} + +asmlinkage void __init early_map_kernel(u64 boot_status, phys_addr_t fdt) +{ + static char const chosen_str[] __initconst = "/chosen"; + u64 va_base, pa_base = (u64)&_text; + u64 kaslr_offset = pa_base % MIN_KIMG_ALIGN; + int root_level = 4 - CONFIG_PGTABLE_LEVELS; + int va_bits = VA_BITS; + int chosen; + void *fdt_mapped = map_fdt(fdt); + + /* Clear BSS and the initial page tables */ + memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); + + /* Parse the command line for CPU feature overrides */ + chosen = fdt_path_offset(fdt_mapped, chosen_str); + init_feature_override(boot_status, fdt_mapped, chosen); + + if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) { + va_bits = VA_BITS_MIN; + } else if (IS_ENABLED(CONFIG_ARM64_LPA2) && !cpu_has_lpa2()) { + va_bits = VA_BITS_MIN; + root_level++; + } + + if (va_bits > VA_BITS_MIN) + sysreg_clear_set(tcr_el1, TCR_EL1_T1SZ_MASK, TCR_T1SZ(va_bits)); + + /* + * The virtual KASLR displacement modulo 2MiB is decided by the + * physical placement of the image, as otherwise, we might not be able + * to create the early kernel mapping using 2 MiB block descriptors. So + * take the low bits of the KASLR offset from the physical address, and + * fill in the high bits from the seed. 
+ */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + u64 kaslr_seed = kaslr_early_init(fdt_mapped, chosen); + + if (kaslr_seed && kaslr_requires_kpti()) + arm64_use_ng_mappings = ng_mappings_allowed(); + + kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1); + } + + if (IS_ENABLED(CONFIG_ARM64_LPA2) && va_bits > VA_BITS_MIN) + remap_idmap_for_lpa2(); + + va_base = KIMAGE_VADDR + kaslr_offset; + map_kernel(kaslr_offset, va_base - pa_base, root_level); +} diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c new file mode 100644 index 000000000000..de52cd85c691 --- /dev/null +++ b/arch/arm64/kernel/pi/map_range.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/types.h> +#include <linux/sizes.h> + +#include <asm/memory.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> + +#include "pi.h" + +/** + * map_range - Map a contiguous range of physical pages into virtual memory + * + * @pte: Address of physical pointer to array of pages to + * allocate page tables from + * @start: Virtual address of the start of the range + * @end: Virtual address of the end of the range (exclusive) + * @pa: Physical address of the start of the range + * @prot: Access permissions of the range + * @level: Translation level for the mapping + * @tbl: The level @level page table to create the mappings in + * @may_use_cont: Whether the use of the contiguous attribute is allowed + * @va_offset: Offset between a physical page and its current mapping + * in the VA space + */ +void __init map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset) +{ + u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; + ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + int lshift = (3 - level) * PTDESC_TABLE_SHIFT; + u64 lmask = (PAGE_SIZE << lshift) - 1; + + start &= PAGE_MASK; + pa &= PAGE_MASK; + + /* Advance tbl to the entry that covers start */ + tbl += (start >> (lshift + PAGE_SHIFT)) % PTRS_PER_PTE; + + /* + * Set the right block/page bits for this level unless we are + * clearing the mapping + */ + if (protval) + protval |= (level == 2) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; + + while (start < end) { + u64 next = min((start | lmask) + 1, PAGE_ALIGN(end)); + + if (level < 2 || (level == 2 && (start | next | pa) & lmask)) { + /* + * This chunk needs a finer grained mapping. Create a + * table mapping if necessary and recurse. 
+ */ + if (pte_none(*tbl)) { + *tbl = __pte(__phys_to_pte_val(*pte) | + PMD_TYPE_TABLE | PMD_TABLE_UXN); + *pte += PTRS_PER_PTE * sizeof(pte_t); + } + map_range(pte, start, next, pa, prot, level + 1, + (pte_t *)(__pte_to_phys(*tbl) + va_offset), + may_use_cont, va_offset); + } else { + /* + * Start a contiguous range if start and pa are + * suitably aligned + */ + if (((start | pa) & cmask) == 0 && may_use_cont) + protval |= PTE_CONT; + + /* + * Clear the contiguous attribute if the remaining + * range does not cover a contiguous block + */ + if ((end & ~cmask) <= start) + protval &= ~PTE_CONT; + + /* Put down a block or page mapping */ + *tbl = __pte(__phys_to_pte_val(pa) | protval); + } + pa += next - start; + start = next; + tbl++; + } +} + +asmlinkage phys_addr_t __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) +{ + phys_addr_t ptep = (phys_addr_t)pg_dir + PAGE_SIZE; /* MMU is off */ + pgprot_t text_prot = PAGE_KERNEL_ROX; + pgprot_t data_prot = PAGE_KERNEL; + + pgprot_val(text_prot) &= ~clrmask; + pgprot_val(data_prot) &= ~clrmask; + + /* MMU is off; pointer casts to phys_addr_t are safe */ + map_range(&ptep, (u64)_stext, (u64)__initdata_begin, + (phys_addr_t)_stext, text_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); + map_range(&ptep, (u64)__initdata_begin, (u64)_end, + (phys_addr_t)__initdata_begin, data_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); + + return ptep; +} diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c new file mode 100644 index 000000000000..bbe7d30ed12b --- /dev/null +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 - Google LLC + * Author: Ard Biesheuvel <ardb@google.com> + */ + +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/types.h> + +#include <asm/scs.h> + +#include "pi.h" + +bool dynamic_scs_is_enabled; + +// +// This minimal DWARF CFI parser is partially based on the code in +// arch/arc/kernel/unwind.c, and on the document below: +// https://refspecs.linuxbase.org/LSB_4.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html +// + +#define DW_CFA_nop 0x00 +#define DW_CFA_set_loc 0x01 +#define DW_CFA_advance_loc1 0x02 +#define DW_CFA_advance_loc2 0x03 +#define DW_CFA_advance_loc4 0x04 +#define DW_CFA_offset_extended 0x05 +#define DW_CFA_restore_extended 0x06 +#define DW_CFA_undefined 0x07 +#define DW_CFA_same_value 0x08 +#define DW_CFA_register 0x09 +#define DW_CFA_remember_state 0x0a +#define DW_CFA_restore_state 0x0b +#define DW_CFA_def_cfa 0x0c +#define DW_CFA_def_cfa_register 0x0d +#define DW_CFA_def_cfa_offset 0x0e +#define DW_CFA_def_cfa_expression 0x0f +#define DW_CFA_expression 0x10 +#define DW_CFA_offset_extended_sf 0x11 +#define DW_CFA_def_cfa_sf 0x12 +#define DW_CFA_def_cfa_offset_sf 0x13 +#define DW_CFA_val_offset 0x14 +#define DW_CFA_val_offset_sf 0x15 +#define DW_CFA_val_expression 0x16 +#define DW_CFA_lo_user 0x1c +#define DW_CFA_negate_ra_state 0x2d +#define DW_CFA_GNU_args_size 0x2e +#define DW_CFA_GNU_negative_offset_extended 0x2f +#define DW_CFA_hi_user 0x3f + +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_pcrel 0x10 + +enum { + PACIASP = 0xd503233f, + AUTIASP = 0xd50323bf, + SCS_PUSH = 0xf800865e, + SCS_POP = 0xf85f8e5e, +}; + +static void __always_inline scs_patch_loc(u64 loc) +{ + u32 insn = le32_to_cpup((void *)loc); + + switch (insn) { + case PACIASP: + *(u32 *)loc = cpu_to_le32(SCS_PUSH); + break; + case AUTIASP: + *(u32 *)loc 
= cpu_to_le32(SCS_POP); + break; + default: + /* + * While the DW_CFA_negate_ra_state directive is guaranteed to + * appear right after a PACIASP/AUTIASP instruction, it may + * also appear after a DW_CFA_restore_state directive that + * restores a state that is only partially accurate, and is + * followed by DW_CFA_negate_ra_state directive to toggle the + * PAC bit again. So we permit other instructions here, and ignore + * them. + */ + return; + } + if (IS_ENABLED(CONFIG_ARM64_WORKAROUND_CLEAN_CACHE)) + asm("dc civac, %0" :: "r"(loc)); + else + asm(ALTERNATIVE("dc cvau, %0", "nop", ARM64_HAS_CACHE_IDC) + :: "r"(loc)); +} + +/* + * Skip one uleb128/sleb128 encoded quantity from the opcode stream. All bytes + * except the last one have bit #7 set. + */ +static int __always_inline skip_xleb128(const u8 **opcode, int size) +{ + u8 c; + + do { + c = *(*opcode)++; + size--; + } while (c & BIT(7)); + + return size; +} + +struct eh_frame { + /* + * The size of this frame if 0 < size < U32_MAX, 0 terminates the list. + */ + u32 size; + + /* + * The first frame is a Common Information Entry (CIE) frame, followed + * by one or more Frame Description Entry (FDE) frames. In the former + * case, this field is 0, otherwise it is the negated offset relative + * to the associated CIE frame. + */ + u32 cie_id_or_pointer; + + union { + struct { // CIE + u8 version; + u8 augmentation_string[3]; + u8 code_alignment_factor; + u8 data_alignment_factor; + u8 return_address_register; + u8 augmentation_data_size; + u8 fde_pointer_format; + }; + + struct { // FDE + s32 initial_loc; + s32 range; + u8 opcodes[]; + }; + + struct { // FDE + s64 initial_loc64; + s64 range64; + u8 opcodes64[]; + }; + }; +}; + +static int scs_handle_fde_frame(const struct eh_frame *frame, + int code_alignment_factor, + bool use_sdata8, + bool dry_run) +{ + int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; + u64 loc = (u64)offset_to_ptr(&frame->initial_loc); + const u8 *opcode = frame->opcodes; + int l; + + if (use_sdata8) { + loc = (u64)&frame->initial_loc64 + frame->initial_loc64; + opcode = frame->opcodes64; + size -= 8; + } + + // assume single byte uleb128_t for augmentation data size + if (*opcode & BIT(7)) + return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE; + + l = *opcode++; + opcode += l; + size -= l + 1; + + /* + * Starting from 'loc', apply the CFA opcodes that advance the location + * pointer, and identify the locations of the PAC instructions. + */ + while (size-- > 0) { + switch (*opcode++) { + case DW_CFA_nop: + case DW_CFA_remember_state: + case DW_CFA_restore_state: + break; + + case DW_CFA_advance_loc1: + loc += *opcode++ * code_alignment_factor; + size--; + break; + + case DW_CFA_advance_loc2: + loc += *opcode++ * code_alignment_factor; + loc += (*opcode++ << 8) * code_alignment_factor; + size -= 2; + break; + + case DW_CFA_def_cfa: + case DW_CFA_offset_extended: + size = skip_xleb128(&opcode, size); + fallthrough; + case DW_CFA_def_cfa_offset: + case DW_CFA_def_cfa_offset_sf: + case DW_CFA_def_cfa_register: + case DW_CFA_same_value: + case DW_CFA_restore_extended: + case 0x80 ... 0xbf: + size = skip_xleb128(&opcode, size); + break; + + case DW_CFA_negate_ra_state: + if (!dry_run) + scs_patch_loc(loc - 4); + break; + + case 0x40 ... 0x7f: + // advance loc + loc += (opcode[-1] & 0x3f) * code_alignment_factor; + break; + + case 0xc0 ... 
0xff: + break; + + default: + return EDYNSCS_INVALID_CFA_OPCODE; + } + } + return 0; +} + +int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run) +{ + int code_alignment_factor = 1; + bool fde_use_sdata8 = false; + const u8 *p = eh_frame; + + while (size > 4) { + const struct eh_frame *frame = (const void *)p; + int ret; + + if (frame->size == 0 || + frame->size == U32_MAX || + frame->size > size) + break; + + if (frame->cie_id_or_pointer == 0) { + /* + * Require presence of augmentation data (z) with a + * specifier for the size of the FDE initial_loc and + * range fields (R), and nothing else. + */ + if (strcmp(frame->augmentation_string, "zR")) + return EDYNSCS_INVALID_CIE_HEADER; + + /* + * The code alignment factor is a uleb128 encoded field + * but given that the only sensible values are 1 or 4, + * there is no point in decoding the whole thing. Also + * sanity check the size of the data alignment factor + * field, and the values of the return address register + * and augmentation data size fields. + */ + if ((frame->code_alignment_factor & BIT(7)) || + (frame->data_alignment_factor & BIT(7)) || + frame->return_address_register != 30 || + frame->augmentation_data_size != 1) + return EDYNSCS_INVALID_CIE_HEADER; + + code_alignment_factor = frame->code_alignment_factor; + + switch (frame->fde_pointer_format) { + case DW_EH_PE_pcrel | DW_EH_PE_sdata4: + fde_use_sdata8 = false; + break; + case DW_EH_PE_pcrel | DW_EH_PE_sdata8: + fde_use_sdata8 = true; + break; + default: + return EDYNSCS_INVALID_CIE_SDATA_SIZE; + } + } else { + ret = scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, !skip_dry_run); + if (ret) + return ret; + + if (!skip_dry_run) + scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, false); + } + + p += sizeof(frame->size) + frame->size; + size -= sizeof(frame->size) + frame->size; + } + return 0; +} diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h new file mode 100644 index 000000000000..aec3172d4003 --- /dev/null +++ b/arch/arm64/kernel/pi/pi.h @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/types.h> + +#define __prel64_initconst __section(".init.rodata.prel64") + +#define PREL64(type, name) union { type *name; prel64_t name ## _prel; } + +#define prel64_pointer(__d) (typeof(__d))prel64_to_pointer(&__d##_prel) + +typedef volatile signed long prel64_t; + +static inline void *prel64_to_pointer(const prel64_t *offset) +{ + if (!*offset) + return NULL; + return (void *)offset + *offset; +} + +extern bool dynamic_scs_is_enabled; + +extern pgd_t init_idmap_pg_dir[], init_idmap_pg_end[]; +extern pgd_t init_pg_dir[], init_pg_end[]; + +void init_feature_override(u64 boot_status, const void *fdt, int chosen); +u64 kaslr_early_init(void *fdt, int chosen); +void relocate_kernel(u64 offset); +int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run); + +void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset); + +asmlinkage void early_map_kernel(u64 boot_status, phys_addr_t fdt); + +asmlinkage phys_addr_t create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); diff --git a/arch/arm64/kernel/pi/relacheck.c b/arch/arm64/kernel/pi/relacheck.c new file mode 100644 index 000000000000..b0cd4d0d275b --- /dev/null +++ b/arch/arm64/kernel/pi/relacheck.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 - 
Google LLC + * Author: Ard Biesheuvel <ardb@google.com> + */ + +#include <elf.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define HOST_ORDER ELFDATA2LSB +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define HOST_ORDER ELFDATA2MSB +#endif + +static Elf64_Ehdr *ehdr; +static Elf64_Shdr *shdr; +static const char *strtab; +static bool swap; + +static uint64_t swab_elfxword(uint64_t val) +{ + return swap ? __builtin_bswap64(val) : val; +} + +static uint32_t swab_elfword(uint32_t val) +{ + return swap ? __builtin_bswap32(val) : val; +} + +static uint16_t swab_elfhword(uint16_t val) +{ + return swap ? __builtin_bswap16(val) : val; +} + +int main(int argc, char *argv[]) +{ + struct stat stat; + int fd, ret; + + if (argc < 3) { + fprintf(stderr, "file arguments missing\n"); + exit(EXIT_FAILURE); + } + + fd = open(argv[1], O_RDWR); + if (fd < 0) { + fprintf(stderr, "failed to open %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ret = fstat(fd, &stat); + if (ret < 0) { + fprintf(stderr, "failed to stat() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ehdr = mmap(0, stat.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ehdr == MAP_FAILED) { + fprintf(stderr, "failed to mmap() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + swap = ehdr->e_ident[EI_DATA] != HOST_ORDER; + shdr = (void *)ehdr + swab_elfxword(ehdr->e_shoff); + strtab = (void *)ehdr + + swab_elfxword(shdr[swab_elfhword(ehdr->e_shstrndx)].sh_offset); + + for (int i = 0; i < swab_elfhword(ehdr->e_shnum); i++) { + unsigned long info, flags; + bool prel64 = false; + Elf64_Rela *rela; + int numrela; + + if (swab_elfword(shdr[i].sh_type) != SHT_RELA) + continue; + + /* only consider RELA sections operating on data */ + info = swab_elfword(shdr[i].sh_info); + flags = swab_elfxword(shdr[info].sh_flags); + if ((flags & (SHF_ALLOC | SHF_EXECINSTR)) != SHF_ALLOC) + continue; + + /* + * We generally don't permit ABS64 relocations in the code that + * runs before relocation processing occurs. If statically + * initialized absolute symbol references are unavoidable, they + * may be emitted into a *.rodata.prel64 section and they will + * be converted to place-relative 64-bit references. This + * requires special handling in the referring code. 
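
/*
 * Illustrative sketch (editorial, not part of this patch): the idea behind the
 * PREL64 references that relacheck converts to. Instead of storing an absolute
 * pointer (which would need an ABS64 relocation applied at runtime), the field
 * stores the signed distance from the field itself to the target, and the
 * pointer is recovered by adding the field's own address, as prel64_to_pointer()
 * does in pi.h. All names below are demo stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

typedef int64_t prel64_demo_t;

static void *prel64_demo_to_pointer(prel64_demo_t *field)
{
	if (!*field)
		return NULL;
	return (char *)field + *field;	/* target = &field + stored offset */
}

int main(void)
{
	static int target = 42;
	prel64_demo_t ref;

	/* what the link-time ABS64 -> PREL64 conversion effectively arranges */
	ref = (intptr_t)&target - (intptr_t)&ref;

	printf("%d\n", *(int *)prel64_demo_to_pointer(&ref));	/* prints 42 */
	return 0;
}
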
+ */ + if (strstr(strtab + swab_elfword(shdr[info].sh_name), + ".rodata.prel64")) { + prel64 = true; + } + + rela = (void *)ehdr + swab_elfxword(shdr[i].sh_offset); + numrela = swab_elfxword(shdr[i].sh_size) / sizeof(*rela); + + for (int j = 0; j < numrela; j++) { + uint64_t info = swab_elfxword(rela[j].r_info); + + if (ELF64_R_TYPE(info) != R_AARCH64_ABS64) + continue; + + if (prel64) { + /* convert ABS64 into PREL64 */ + info ^= R_AARCH64_ABS64 ^ R_AARCH64_PREL64; + rela[j].r_info = swab_elfxword(info); + } else { + fprintf(stderr, + "Unexpected absolute relocations detected in %s\n", + argv[2]); + close(fd); + unlink(argv[1]); + exit(EXIT_FAILURE); + } + } + } + close(fd); + return 0; +} diff --git a/arch/arm64/kernel/pi/relocate.c b/arch/arm64/kernel/pi/relocate.c new file mode 100644 index 000000000000..2407d2696398 --- /dev/null +++ b/arch/arm64/kernel/pi/relocate.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Authors: Ard Biesheuvel <ardb@google.com> +// Peter Collingbourne <pcc@google.com> + +#include <linux/elf.h> +#include <linux/init.h> +#include <linux/types.h> + +#include "pi.h" + +extern const Elf64_Rela rela_start[], rela_end[]; +extern const u64 relr_start[], relr_end[]; + +void __init relocate_kernel(u64 offset) +{ + u64 *place = NULL; + + for (const Elf64_Rela *rela = rela_start; rela < rela_end; rela++) { + if (ELF64_R_TYPE(rela->r_info) != R_AARCH64_RELATIVE) + continue; + *(u64 *)(rela->r_offset + offset) = rela->r_addend + offset; + } + + if (!IS_ENABLED(CONFIG_RELR) || !offset) + return; + + /* + * Apply RELR relocations. + * + * RELR is a compressed format for storing relative relocations. The + * encoded sequence of entries looks like: + * [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBB1 ... ] + * + * i.e. start with an address, followed by any number of bitmaps. The + * address entry encodes 1 relocation. The subsequent bitmap entries + * encode up to 63 relocations each, at subsequent offsets following + * the last address entry. + * + * The bitmap entries must have 1 in the least significant bit. The + * assumption here is that an address cannot have 1 in lsb. Odd + * addresses are not supported. Any odd addresses are stored in the + * RELA section, which is handled above. + * + * With the exception of the least significant bit, each bit in the + * bitmap corresponds with a machine word that follows the base address + * word, and the bit value indicates whether or not a relocation needs + * to be applied to it. The second least significant bit represents the + * machine word immediately following the initial address, and each bit + * that follows represents the next word, in linear order. As such, a + * single bitmap can encode up to 63 relocations in a 64-bit object. 
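
/*
 * Illustrative sketch (editorial, not part of this patch): decoding the RELR
 * stream format described in the comment above, on a synthetic example. An
 * even entry is a base address (one relocation); an odd entry is a bitmap
 * whose bits 1..63 mark which of the next 63 machine words need relocating.
 * This demo only prints the addresses instead of applying an offset.
 */
#include <stdint.h>
#include <stdio.h>

static void relr_demo_decode(const uint64_t *relr, int count)
{
	uint64_t where = 0;

	for (int i = 0; i < count; i++) {
		if ((relr[i] & 1) == 0) {
			where = relr[i];			/* base address entry */
			printf("relocate word at 0x%llx\n", (unsigned long long)where);
			where += 8;				/* bitmap bits start here */
		} else {
			for (uint64_t r = relr[i] >> 1, off = where; r; off += 8, r >>= 1)
				if (r & 1)
					printf("relocate word at 0x%llx\n",
					       (unsigned long long)off);
			where += 63 * 8;			/* next bitmap covers the following 63 words */
		}
	}
}

int main(void)
{
	/* base 0x1000, then a bitmap marking the 1st and 3rd following words */
	const uint64_t stream[] = { 0x1000, (1ull << 1) | (1ull << 3) | 1 };

	relr_demo_decode(stream, 2);	/* 0x1000, 0x1008, 0x1018 */
	return 0;
}
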
+ */ + for (const u64 *relr = relr_start; relr < relr_end; relr++) { + if ((*relr & 1) == 0) { + place = (u64 *)(*relr + offset); + *place++ += offset; + } else { + for (u64 *p = place, r = *relr >> 1; r; p++, r >>= 1) + if (r & 1) + *p += offset; + place += 63; + } + } +} diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c index c507b584259d..2708b620b4ae 100644 --- a/arch/arm64/kernel/pointer_auth.c +++ b/arch/arm64/kernel/pointer_auth.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/compat.h> #include <linux/errno.h> #include <linux/prctl.h> #include <linux/random.h> @@ -9,7 +10,7 @@ int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg) { - struct ptrauth_keys *keys = &tsk->thread.keys_user; + struct ptrauth_keys_user *keys = &tsk->thread.keys_user; unsigned long addr_key_mask = PR_PAC_APIAKEY | PR_PAC_APIBKEY | PR_PAC_APDAKEY | PR_PAC_APDBKEY; unsigned long key_mask = addr_key_mask | PR_PAC_APGAKEY; @@ -17,9 +18,11 @@ int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg) if (!system_supports_address_auth() && !system_supports_generic_auth()) return -EINVAL; + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + if (!arg) { - ptrauth_keys_init(keys); - ptrauth_keys_switch(keys); + ptrauth_keys_init_user(keys); return 0; } @@ -40,8 +43,71 @@ int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg) get_random_bytes(&keys->apdb, sizeof(keys->apdb)); if (arg & PR_PAC_APGAKEY) get_random_bytes(&keys->apga, sizeof(keys->apga)); + ptrauth_keys_install_user(keys); + + return 0; +} + +static u64 arg_to_enxx_mask(unsigned long arg) +{ + u64 sctlr_enxx_mask = 0; + + WARN_ON(arg & ~PR_PAC_ENABLED_KEYS_MASK); + if (arg & PR_PAC_APIAKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENIA; + if (arg & PR_PAC_APIBKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENIB; + if (arg & PR_PAC_APDAKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENDA; + if (arg & PR_PAC_APDBKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENDB; + return sctlr_enxx_mask; +} + +int ptrauth_set_enabled_keys(struct task_struct *tsk, unsigned long keys, + unsigned long enabled) +{ + u64 sctlr; - ptrauth_keys_switch(keys); + if (!system_supports_address_auth()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + + if ((keys & ~PR_PAC_ENABLED_KEYS_MASK) || (enabled & ~keys)) + return -EINVAL; + + preempt_disable(); + sctlr = tsk->thread.sctlr_user; + sctlr &= ~arg_to_enxx_mask(keys); + sctlr |= arg_to_enxx_mask(enabled); + tsk->thread.sctlr_user = sctlr; + if (tsk == current) + update_sctlr_el1(sctlr); + preempt_enable(); return 0; } + +int ptrauth_get_enabled_keys(struct task_struct *tsk) +{ + int retval = 0; + + if (!system_supports_address_auth()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + + if (tsk->thread.sctlr_user & SCTLR_ELx_ENIA) + retval |= PR_PAC_APIAKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENIB) + retval |= PR_PAC_APIBKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENDA) + retval |= PR_PAC_APDAKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENDB) + retval |= PR_PAC_APDBKEY; + + return retval; +} diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 6bf6657a5a52..4137cc5ef031 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -1,16 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/probes/decode-insn.c * * Copyright (C) 2013 Linaro Limited. 
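
/*
 * Illustrative sketch (editorial, not part of this patch): the round trip
 * performed by ptrauth_set_enabled_keys()/ptrauth_get_enabled_keys() in
 * pointer_auth.c above, modelled on a plain 64-bit value instead of the
 * task's saved SCTLR. Bit positions below follow the SCTLR_EL1
 * EnIA/EnIB/EnDA/EnDB layout; the DEMO_* names are stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_PR_PAC_APIAKEY	(1UL << 0)
#define DEMO_PR_PAC_APIBKEY	(1UL << 1)
#define DEMO_PR_PAC_APDAKEY	(1UL << 2)
#define DEMO_PR_PAC_APDBKEY	(1UL << 3)

#define DEMO_SCTLR_ENIA		(1ULL << 31)
#define DEMO_SCTLR_ENIB		(1ULL << 30)
#define DEMO_SCTLR_ENDA		(1ULL << 27)
#define DEMO_SCTLR_ENDB		(1ULL << 13)

static uint64_t demo_arg_to_enxx_mask(unsigned long arg)
{
	uint64_t mask = 0;

	if (arg & DEMO_PR_PAC_APIAKEY)
		mask |= DEMO_SCTLR_ENIA;
	if (arg & DEMO_PR_PAC_APIBKEY)
		mask |= DEMO_SCTLR_ENIB;
	if (arg & DEMO_PR_PAC_APDAKEY)
		mask |= DEMO_SCTLR_ENDA;
	if (arg & DEMO_PR_PAC_APDBKEY)
		mask |= DEMO_SCTLR_ENDB;
	return mask;
}

int main(void)
{
	uint64_t sctlr = DEMO_SCTLR_ENIA | DEMO_SCTLR_ENIB;	/* current state */
	unsigned long keys = DEMO_PR_PAC_APIAKEY | DEMO_PR_PAC_APDAKEY;
	unsigned long enabled = DEMO_PR_PAC_APDAKEY;

	/* clear the bits being configured, then set only the requested subset */
	sctlr &= ~demo_arg_to_enxx_mask(keys);
	sctlr |= demo_arg_to_enxx_mask(enabled);

	printf("ENIA=%d ENIB=%d ENDA=%d ENDB=%d\n",
	       !!(sctlr & DEMO_SCTLR_ENIA), !!(sctlr & DEMO_SCTLR_ENIB),
	       !!(sctlr & DEMO_SCTLR_ENDA), !!(sctlr & DEMO_SCTLR_ENDB));
	return 0;
}
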
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/kernel.h> @@ -32,12 +24,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * currently safe. Lastly, MSR instructions can do any number of nasty * things we can't handle during single-stepping. */ - if (aarch64_get_insn_class(insn) == AARCH64_INSN_CLS_BR_SYS) { + if (aarch64_insn_is_class_branch_sys(insn)) { if (aarch64_insn_is_branch(insn) || aarch64_insn_is_msr_imm(insn) || aarch64_insn_is_msr_reg(insn) || aarch64_insn_is_exception(insn) || - aarch64_insn_is_eret(insn)) + aarch64_insn_is_eret(insn) || + aarch64_insn_is_eret_auth(insn)) return false; /* @@ -50,11 +43,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) != AARCH64_INSN_SPCLREG_DAIF; /* - * The HINT instruction is is problematic when single-stepping, - * except for the NOP case. + * The HINT instruction is steppable only if it is in whitelist + * and the rest of other such instructions are blocked for + * single stepping as they may cause exception or other + * unintended behaviour. */ if (aarch64_insn_is_hint(insn)) - return aarch64_insn_is_nop(insn); + return aarch64_insn_is_steppable_hint(insn); return true; } @@ -63,10 +58,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * Instructions which load PC relative literals are not going to work * when executed from an XOL slot. Instructions doing an exclusive * load/store are not going to complete successfully when single-step - * exception handling happens in the middle of the sequence. + * exception handling happens in the middle of the sequence. Memory + * copy/set instructions require that all three instructions be placed + * consecutively in memory. */ if (aarch64_insn_uses_literal(insn) || - aarch64_insn_is_exclusive(insn)) + aarch64_insn_is_exclusive(insn) || + aarch64_insn_is_mops(insn)) return false; return true; @@ -78,9 +76,18 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot. */ enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) { /* + * While 'nop' instruction can execute in the out-of-line slot, + * simulating them in breakpoint handling offers better performance. + */ + if (aarch64_insn_is_nop(insn)) { + api->handler = simulate_nop; + return INSN_GOOD_NO_SLOT; + } + + /* * Instructions reading or modifying the PC won't work from the XOL * slot. 
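
/*
 * Illustrative sketch (editorial, not part of this patch): why a PC-relative
 * literal load cannot simply be copied into an out-of-line (XOL) slot. The
 * effective address depends on where the instruction executes, so stepping a
 * copy would read the wrong location; that is why such instructions are
 * simulated instead. Decoding below follows the usual A64 LDR (literal,
 * 64-bit) layout: imm19 in bits [23:5], scaled by 4. Addresses are made up.
 */
#include <stdint.h>
#include <stdio.h>

static int64_t ldr_literal_disp(uint32_t insn)
{
	/* sign-extend bits [23:5] and scale by the 4-byte instruction size */
	return ((int64_t)(int32_t)(insn << 8) >> 13) * 4;
}

int main(void)
{
	uint32_t insn = 0x58000080;			/* ldr x0, #+16 */
	uint64_t orig_pc = 0xffff800080010000ULL;	/* probed address */
	uint64_t slot_pc = 0xffff800081234000ULL;	/* hypothetical XOL slot */

	printf("from original PC: 0x%llx\n",
	       (unsigned long long)(orig_pc + ldr_literal_disp(insn)));
	printf("from XOL slot:    0x%llx\n",
	       (unsigned long long)(slot_pc + ldr_literal_disp(insn)));
	return 0;
}
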
*/ @@ -101,13 +108,10 @@ arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) aarch64_insn_is_bl(insn)) { api->handler = simulate_b_bl; } else if (aarch64_insn_is_br(insn) || - aarch64_insn_is_blr(insn) || - aarch64_insn_is_ret(insn)) { - api->handler = simulate_br_blr_ret; - } else if (aarch64_insn_is_ldr_lit(insn)) { - api->handler = simulate_ldr_literal; - } else if (aarch64_insn_is_ldrsw_lit(insn)) { - api->handler = simulate_ldrsw_literal; + aarch64_insn_is_blr(insn)) { + api->handler = simulate_br_blr; + } else if (aarch64_insn_is_ret(insn)) { + api->handler = simulate_ret; } else { /* * Instruction cannot be stepped out-of-line and we don't @@ -142,9 +146,20 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { enum probe_insn decoded; - probe_opcode_t insn = le32_to_cpu(*addr); - probe_opcode_t *scan_end = NULL; + u32 insn = le32_to_cpu(*addr); + kprobe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; + struct arch_probe_insn *api = &asi->api; + + if (aarch64_insn_is_ldr_lit(insn)) { + api->handler = simulate_ldr_literal; + decoded = INSN_GOOD_NO_SLOT; + } else if (aarch64_insn_is_ldrsw_lit(insn)) { + api->handler = simulate_ldrsw_literal; + decoded = INSN_GOOD_NO_SLOT; + } else { + decoded = arm_probe_decode_insn(insn, &asi->api); + } /* * If there's a symbol defined in front of and near enough to @@ -162,7 +177,6 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) else scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; } - decoded = arm_probe_decode_insn(insn, &asi->api); if (decoded != INSN_REJECTED && scan_end) if (is_probed_address_atomic(addr - 1, scan_end)) diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index 192ab007bacb..0e4195de8206 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -1,16 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * arch/arm64/kernel/probes/decode-insn.h * * Copyright (C) 2013 Linaro Limited. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef _ARM_KERNEL_KPROBES_ARM64_H @@ -36,6 +28,6 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); #endif enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 2a5b338b2542..43a0361a8bf0 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/probes/kprobes.c * @@ -5,36 +6,33 @@ * * Copyright (C) 2013 Linaro Limited. * Author: Sandeepa Prabhu <sandeepa.prabhu@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * */ + +#define pr_fmt(fmt) "kprobes: " fmt + +#include <linux/execmem.h> +#include <linux/extable.h> #include <linux/kasan.h> #include <linux/kernel.h> #include <linux/kprobes.h> -#include <linux/extable.h> -#include <linux/slab.h> -#include <linux/stop_machine.h> #include <linux/sched/debug.h> #include <linux/set_memory.h> +#include <linux/slab.h> +#include <linux/stop_machine.h> #include <linux/stringify.h> +#include <linux/uaccess.h> #include <linux/vmalloc.h> -#include <asm/traps.h> -#include <asm/ptrace.h> + #include <asm/cacheflush.h> +#include <asm/daifflags.h> #include <asm/debug-monitors.h> -#include <asm/system_misc.h> #include <asm/insn.h> -#include <linux/uaccess.h> #include <asm/irq.h> +#include <asm/text-patching.h> +#include <asm/ptrace.h> #include <asm/sections.h> +#include <asm/system_misc.h> +#include <asm/traps.h> #include "decode-insn.h" @@ -42,39 +40,59 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); static void __kprobes -post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); +post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); -static int __kprobes patch_text(kprobe_opcode_t *addr, u32 opcode) +void *alloc_insn_page(void) { - void *addrs[1]; - u32 insns[1]; - - addrs[0] = addr; - insns[0] = opcode; - - return aarch64_insn_patch_text(addrs, insns, 1); + void *addr; + + addr = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); + if (!addr) + return NULL; + if (set_memory_rox((unsigned long)addr, 1)) { + execmem_free(addr); + return NULL; + } + return addr; } static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { - /* prepare insn slot */ - patch_text(p->ainsn.api.insn, p->opcode); + kprobe_opcode_t *addr = p->ainsn.xol_insn; - flush_icache_range((uintptr_t) (p->ainsn.api.insn), - (uintptr_t) (p->ainsn.api.insn) + - MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + /* + * Prepare insn slot, Mark Rutland points out it depends on a coupe of + * subtleties: + * + * - That the I-cache maintenance for these instructions is complete + * *before* the kprobe BRK is written (and aarch64_insn_patch_text_nosync() + * ensures this, but just omits causing a Context-Synchronization-Event + * on all CPUS). + * + * - That the kprobe BRK results in an exception (and consequently a + * Context-Synchronoization-Event), which ensures that the CPU will + * fetch thesingle-step slot instructions *after* this, ensuring that + * the new instructions are used + * + * It supposes to place ISB after patching to guarantee I-cache maintenance + * is observed on all CPUS, however, single-step slot is installed in + * the BRK exception handler, so it is unnecessary to generate + * Contex-Synchronization-Event via ISB again. + */ + aarch64_insn_patch_text_nosync(addr, le32_to_cpu(p->opcode)); + aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS); /* * Needs restoring of return address after stepping xol. */ - p->ainsn.api.restore = (unsigned long) p->addr + + p->ainsn.xol_restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol. 
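
/*
 * Illustrative sketch (editorial, not part of this patch): the shape of the
 * single-step slot built by arch_prepare_ss_slot() above - the copied
 * instruction followed by a BRK, so control comes back to the kprobe code via
 * an exception as soon as the stepped instruction completes. The BRK
 * immediate used here is a placeholder, not the value the kernel reserves.
 */
#include <stdint.h>
#include <stdio.h>

/* A64 BRK #imm16 encoding: 0xd4200000 | (imm16 << 5) */
static uint32_t aarch64_brk(uint16_t imm16)
{
	return 0xd4200000u | ((uint32_t)imm16 << 5);
}

int main(void)
{
	uint32_t slot[2];
	uint32_t probed_insn = 0x91000421;	/* e.g. add x1, x1, #1 */

	slot[0] = probed_insn;			/* instruction stepped out of line */
	slot[1] = aarch64_brk(0x123);		/* placeholder immediate */

	printf("slot[0]=0x%08x slot[1]=0x%08x\n", slot[0], slot[1]);
	printf("expected trap PC after the step: slot + %zu\n", sizeof(slot[0]));
	return 0;
}
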
No need to adjust the PC */ - p->ainsn.api.restore = 0; + p->ainsn.xol_restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) @@ -82,28 +100,23 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); if (p->ainsn.api.handler) - p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); + p->ainsn.api.handler(le32_to_cpu(p->opcode), (long)p->addr, regs); /* single step simulated, now go for post processing */ - post_kprobe_handler(kcb, regs); + post_kprobe_handler(p, kcb, regs); } int __kprobes arch_prepare_kprobe(struct kprobe *p) { unsigned long probe_addr = (unsigned long)p->addr; - extern char __start_rodata[]; - extern char __end_rodata[]; if (probe_addr & 0x3) return -EINVAL; /* copy instruction */ - p->opcode = le32_to_cpu(*p->addr); + p->opcode = *p->addr; - if (in_exception_text(probe_addr)) - return -EINVAL; - if (probe_addr >= (unsigned long) __start_rodata && - probe_addr <= (unsigned long) __end_rodata) + if (search_exception_tables(probe_addr)) return -EINVAL; /* decode instruction */ @@ -112,18 +125,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.api.insn = NULL; + p->ainsn.xol_insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.api.insn = get_insn_slot(); - if (!p->ainsn.api.insn) + p->ainsn.xol_insn = get_insn_slot(); + if (!p->ainsn.xol_insn) return -ENOMEM; break; } /* prepare the instruction */ - if (p->ainsn.api.insn) + if (p->ainsn.xol_insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -131,34 +144,29 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } -void *alloc_insn_page(void) -{ - void *page; - - page = vmalloc_exec(PAGE_SIZE); - if (page) - set_memory_ro((unsigned long)page, 1); - - return page; -} - /* arm kprobe: install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { - patch_text(p->addr, BRK64_OPCODE_KPROBES); + void *addr = p->addr; + u32 insn = BRK64_OPCODE_KPROBES; + + aarch64_insn_patch_text(&addr, &insn, 1); } /* disarm kprobe: remove breakpoint from text */ void __kprobes arch_disarm_kprobe(struct kprobe *p) { - patch_text(p->addr, p->opcode); + void *addr = p->addr; + u32 insn = le32_to_cpu(p->opcode); + + aarch64_insn_patch_text(&addr, &insn, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.api.insn) { - free_insn_slot(p->ainsn.api.insn, 0); - p->ainsn.api.insn = NULL; + if (p->ainsn.xol_insn) { + free_insn_slot(p->ainsn.xol_insn, 0); + p->ainsn.xol_insn = NULL; } } @@ -180,67 +188,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p) } /* - * When PSTATE.D is set (masked), then software step exceptions can not be - * generated. - * SPSR's D bit shows the value of PSTATE.D immediately before the - * exception was taken. PSTATE.D is set while entering into any exception - * mode, however software clears it for any normal (none-debug-exception) - * mode in the exception entry. Therefore, when we are entering into kprobe - * breakpoint handler from any normal mode then SPSR.D bit is already - * cleared, however it is set when we are entering from any debug exception - * mode. - * Since we always need to generate single step exception after a kprobe - * breakpoint exception therefore we need to clear it unconditionally, when - * we become sure that the current breakpoint exception is for kprobe. 
- */ -static void __kprobes -spsr_set_debug_flag(struct pt_regs *regs, int mask) -{ - unsigned long spsr = regs->pstate; - - if (mask) - spsr |= PSR_D_BIT; - else - spsr &= ~PSR_D_BIT; - - regs->pstate = spsr; -} - -/* - * Interrupts need to be disabled before single-step mode is set, and not - * reenabled until after single-step mode ends. - * Without disabling interrupt on local CPU, there is a chance of - * interrupt occurrence in the period of exception return and start of - * out-of-line single-step, that result in wrongly single stepping - * into the interrupt handler. + * Mask all of DAIF while executing the instruction out-of-line, to keep things + * simple and avoid nesting exceptions. Interrupts do have to be disabled since + * the kprobe state is per-CPU and doesn't get migrated. */ static void __kprobes kprobes_save_local_irqflag(struct kprobe_ctlblk *kcb, struct pt_regs *regs) { - kcb->saved_irqflag = regs->pstate; - regs->pstate |= PSR_I_BIT; + kcb->saved_irqflag = regs->pstate & DAIF_MASK; + regs->pstate |= DAIF_MASK; } static void __kprobes kprobes_restore_local_irqflag(struct kprobe_ctlblk *kcb, struct pt_regs *regs) { - if (kcb->saved_irqflag & PSR_I_BIT) - regs->pstate |= PSR_I_BIT; - else - regs->pstate &= ~PSR_I_BIT; -} - -static void __kprobes -set_ss_context(struct kprobe_ctlblk *kcb, unsigned long addr) -{ - kcb->ss_ctx.ss_pending = true; - kcb->ss_ctx.match_addr = addr + sizeof(kprobe_opcode_t); -} - -static void __kprobes clear_ss_context(struct kprobe_ctlblk *kcb) -{ - kcb->ss_ctx.ss_pending = false; - kcb->ss_ctx.match_addr = 0; + regs->pstate &= ~DAIF_MASK; + regs->pstate |= kcb->saved_irqflag; } static void __kprobes setup_singlestep(struct kprobe *p, @@ -258,17 +221,11 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.api.insn) { + if (p->ainsn.xol_insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.api.insn; - - set_ss_context(kcb, slot); /* mark pending ss */ + slot = (unsigned long)p->ainsn.xol_insn; - spsr_set_debug_flag(regs, 0); - - /* IRQs and single stepping do not mix well. */ kprobes_save_local_irqflag(kcb, regs); - kernel_enable_single_step(regs); instruction_pointer_set(regs, slot); } else { /* insn simulation */ @@ -288,7 +245,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, break; case KPROBE_HIT_SS: case KPROBE_REENTER: - pr_warn("Unrecoverable kprobe detected.\n"); + pr_warn("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); BUG(); break; @@ -301,16 +258,11 @@ static int __kprobes reenter_kprobe(struct kprobe *p, } static void __kprobes -post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) +post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs) { - struct kprobe *cur = kprobe_running(); - - if (!cur) - return; - /* return addr restore if non-branching insn */ - if (cur->ainsn.api.restore != 0) - instruction_pointer_set(regs, cur->ainsn.api.restore); + if (cur->ainsn.xol_restore != 0) + instruction_pointer_set(regs, cur->ainsn.xol_restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -319,12 +271,8 @@ post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) } /* call post handler */ kcb->kprobe_status = KPROBE_HIT_SSDONE; - if (cur->post_handler) { - /* post_handler can hit breakpoint and single step - * again, so we enable D-flag for recursive exception. 
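
/*
 * Illustrative sketch (editorial, not part of this patch): the DAIF
 * save/restore pattern used by kprobes_save_local_irqflag() and
 * kprobes_restore_local_irqflag() above, modelled on a plain pstate value.
 * PSTATE.D/A/I/F occupy bits 9:6, i.e. a 0x3c0 mask.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_DAIF_MASK	0x3c0ULL	/* PSTATE.D, .A, .I, .F */

struct demo_ctl {
	uint64_t saved_irqflag;
};

static void demo_save_and_mask(struct demo_ctl *kcb, uint64_t *pstate)
{
	kcb->saved_irqflag = *pstate & DEMO_DAIF_MASK;	/* remember caller's bits */
	*pstate |= DEMO_DAIF_MASK;			/* mask everything for the step */
}

static void demo_restore(const struct demo_ctl *kcb, uint64_t *pstate)
{
	*pstate &= ~DEMO_DAIF_MASK;
	*pstate |= kcb->saved_irqflag;
}

int main(void)
{
	struct demo_ctl kcb;
	uint64_t pstate = 0x80ULL;	/* example incoming value with only PSTATE.I set */

	demo_save_and_mask(&kcb, &pstate);
	printf("masked:   0x%llx\n", (unsigned long long)pstate);	/* 0x3c0 */
	demo_restore(&kcb, &pstate);
	printf("restored: 0x%llx\n", (unsigned long long)pstate);	/* 0x80 */
	return 0;
}
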
- */ + if (cur->post_handler) cur->post_handler(cur, regs, 0); - } reset_current_kprobe(); } @@ -345,47 +293,22 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) * normal page fault. */ instruction_pointer_set(regs, (unsigned long) cur->addr); - if (!instruction_pointer(regs)) - BUG(); - - kernel_disable_single_step(); + BUG_ON(!instruction_pointer(regs)); - if (kcb->kprobe_status == KPROBE_REENTER) + if (kcb->kprobe_status == KPROBE_REENTER) { restore_previous_kprobe(kcb); - else + } else { + kprobes_restore_local_irqflag(kcb, regs); reset_current_kprobe(); + } break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accounting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, fsr)) - return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - if (fixup_exception(regs)) - return 1; } return 0; } -static void __kprobes kprobe_handler(struct pt_regs *regs) +int __kprobes +kprobe_brk_handler(struct pt_regs *regs, unsigned long esr) { struct kprobe *p, *cur_kprobe; struct kprobe_ctlblk *kcb; @@ -395,189 +318,101 @@ static void __kprobes kprobe_handler(struct pt_regs *regs) cur_kprobe = kprobe_running(); p = get_kprobe((kprobe_opcode_t *) addr); - - if (p) { - if (cur_kprobe) { - if (reenter_kprobe(p, regs, kcb)) - return; - } else { - /* Probe hit */ - set_current_kprobe(p); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - - /* - * If we have no pre-handler or it returned 0, we - * continue with normal processing. If we have a - * pre-handler and it returned non-zero, it will - * modify the execution path and no need to single - * stepping. Let's just reset current kprobe and exit. - * - * pre_handler can hit a breakpoint and can step thru - * before return, keep PSTATE D-flag enabled until - * pre_handler return back. - */ - if (!p->pre_handler || !p->pre_handler(p, regs)) { - setup_singlestep(p, regs, kcb, 0); - } else - reset_current_kprobe(); - } + if (WARN_ON_ONCE(!p)) { + /* + * Something went wrong. This BRK used an immediate reserved + * for kprobes, but we couldn't find any corresponding probe. + */ + return DBG_HOOK_ERROR; } - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Return back to original instruction, and continue. - */ -} -static int __kprobes -kprobe_ss_hit(struct kprobe_ctlblk *kcb, unsigned long addr) -{ - if ((kcb->ss_ctx.ss_pending) - && (kcb->ss_ctx.match_addr == addr)) { - clear_ss_context(kcb); /* clear pending ss */ - return DBG_HOOK_HANDLED; + if (cur_kprobe) { + /* Hit a kprobe inside another kprobe */ + if (!reenter_kprobe(p, regs, kcb)) + return DBG_HOOK_ERROR; + } else { + /* Probe hit */ + set_current_kprobe(p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. 
If we have a + * pre-handler and it returned non-zero, it will + * modify the execution path and not need to single-step + * Let's just reset current kprobe and exit. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) + setup_singlestep(p, regs, kcb, 0); + else + reset_current_kprobe(); } - /* not ours, kprobes should ignore it */ - return DBG_HOOK_ERROR; + + return DBG_HOOK_HANDLED; } int __kprobes -kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr) +kprobe_ss_brk_handler(struct pt_regs *regs, unsigned long esr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - int retval; - - /* return error if this is not our step */ - retval = kprobe_ss_hit(kcb, instruction_pointer(regs)); + unsigned long addr = instruction_pointer(regs); + struct kprobe *cur = kprobe_running(); - if (retval == DBG_HOOK_HANDLED) { + if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) && + ((unsigned long)&cur->ainsn.xol_insn[1] == addr)) { kprobes_restore_local_irqflag(kcb, regs); - kernel_disable_single_step(); + post_kprobe_handler(cur, kcb, regs); - post_kprobe_handler(kcb, regs); + return DBG_HOOK_HANDLED; } - return retval; + /* not ours, kprobes should ignore it */ + return DBG_HOOK_ERROR; } int __kprobes -kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr) -{ - kprobe_handler(regs); - return DBG_HOOK_HANDLED; -} - -bool arch_within_kprobe_blacklist(unsigned long addr) +kretprobe_brk_handler(struct pt_regs *regs, unsigned long esr) { - if ((addr >= (unsigned long)__kprobes_text_start && - addr < (unsigned long)__kprobes_text_end) || - (addr >= (unsigned long)__entry_text_start && - addr < (unsigned long)__entry_text_end) || - (addr >= (unsigned long)__idmap_text_start && - addr < (unsigned long)__idmap_text_end) || - !!search_exception_tables(addr)) - return true; - - if (!is_kernel_in_hyp_mode()) { - if ((addr >= (unsigned long)__hyp_text_start && - addr < (unsigned long)__hyp_text_end) || - (addr >= (unsigned long)__hyp_idmap_text_start && - addr < (unsigned long)__hyp_idmap_text_end)) - return true; - } + if (regs->pc != (unsigned long)__kretprobe_trampoline) + return DBG_HOOK_ERROR; - return false; + regs->pc = kretprobe_trampoline_handler(regs, (void *)regs->regs[29]); + return DBG_HOOK_HANDLED; } -void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) +/* + * Provide a blacklist of symbols identifying ranges which cannot be kprobed. + * This blacklist is exposed to userspace via debugfs (kprobes/blacklist). + */ +int __init arch_populate_kprobe_blacklist(void) { - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address = - (unsigned long)&kretprobe_trampoline; - kprobe_opcode_t *correct_ret_addr = NULL; - - INIT_HLIST_HEAD(&empty_rp); - kretprobe_hash_lock(current, &head, &flags); - - /* - * It is possible to have multiple instances associated with a given - * task either because multiple functions in the call path have - * return probes installed on them, and/or more than one - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always pushed into the head of the list - * - when multiple return probes are registered for the same - * function, the (chronologically) first instance's ret_addr - * will be the real return address, and all the rest will - * point to kretprobe_trampoline. 
- */ - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - orig_ret_address = (unsigned long)ri->ret_addr; - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - - correct_ret_addr = ri->ret_addr; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - orig_ret_address = (unsigned long)ri->ret_addr; - if (ri->rp && ri->rp->handler) { - __this_cpu_write(current_kprobe, &ri->rp->kp); - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; - ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, NULL); - } - - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_hash_unlock(current, &flags); - - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return (void *)orig_ret_address; + int ret; + + ret = kprobe_add_area_blacklist((unsigned long)__entry_text_start, + (unsigned long)__entry_text_end); + if (ret) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); + if (ret) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__hyp_text_start, + (unsigned long)__hyp_text_end); + if (ret || is_kernel_in_hyp_mode()) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__hyp_idmap_text_start, + (unsigned long)__hyp_idmap_text_end); + return ret; } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { ri->ret_addr = (kprobe_opcode_t *)regs->regs[30]; + ri->fp = (void *)regs->regs[29]; /* replace return addr (x30) with trampoline */ - regs->regs[30] = (long)&kretprobe_trampoline; + regs->regs[30] = (long)&__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 45dce03aaeaf..b60739d3983f 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -4,79 +4,17 @@ */ #include <linux/linkage.h> -#include <asm/asm-offsets.h> +#include <asm/asm-bug.h> #include <asm/assembler.h> .text - .macro save_all_base_regs - stp x0, x1, [sp, #S_X0] - stp x2, x3, [sp, #S_X2] - stp x4, x5, [sp, #S_X4] - stp x6, x7, [sp, #S_X6] - stp x8, x9, [sp, #S_X8] - stp x10, x11, [sp, #S_X10] - stp x12, x13, [sp, #S_X12] - stp x14, x15, [sp, #S_X14] - stp x16, x17, [sp, #S_X16] - stp x18, x19, [sp, #S_X18] - stp x20, x21, [sp, #S_X20] - stp x22, x23, [sp, #S_X22] - stp x24, x25, [sp, #S_X24] - stp x26, x27, [sp, #S_X26] - stp x28, x29, [sp, #S_X28] - add x0, sp, #S_FRAME_SIZE - stp lr, x0, [sp, #S_LR] +SYM_CODE_START(__kretprobe_trampoline) /* - * Construct a useful saved PSTATE + * Trigger a breakpoint exception. The PC will be adjusted by + * kretprobe_brk_handler(), and no subsequent instructions will + * be executed from the trampoline. 
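
/*
 * Illustrative sketch (editorial, not part of this patch): the return-address
 * hijack performed by arch_prepare_kretprobe() above, modelled with a plain
 * register array. The trampoline address and register values are made up;
 * only the save-LR/save-FP/replace-LR pattern is the point.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_ri {
	uint64_t ret_addr;	/* where the probed function would have returned */
	uint64_t fp;		/* frame pointer, used to match up instances */
};

static void demo_prepare_kretprobe(struct demo_ri *ri, uint64_t regs[31],
				   uint64_t trampoline)
{
	ri->ret_addr = regs[30];	/* original x30 (LR) */
	ri->fp = regs[29];		/* x29 */
	regs[30] = trampoline;		/* the function now "returns" to the trampoline */
}

int main(void)
{
	uint64_t regs[31] = { 0 };
	struct demo_ri ri;

	regs[29] = 0xffff800080200000ULL;
	regs[30] = 0xffff800080100040ULL;	/* real return address */

	demo_prepare_kretprobe(&ri, regs, 0xffff8000801ff000ULL);
	printf("saved ret=0x%llx, lr now 0x%llx\n",
	       (unsigned long long)ri.ret_addr, (unsigned long long)regs[30]);
	return 0;
}
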
*/ - mrs x0, nzcv - mrs x1, daif - orr x0, x0, x1 - mrs x1, CurrentEL - orr x0, x0, x1 - mrs x1, SPSel - orr x0, x0, x1 - stp xzr, x0, [sp, #S_PC] - .endm - - .macro restore_all_base_regs - ldr x0, [sp, #S_PSTATE] - and x0, x0, #(PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT) - msr nzcv, x0 - ldp x0, x1, [sp, #S_X0] - ldp x2, x3, [sp, #S_X2] - ldp x4, x5, [sp, #S_X4] - ldp x6, x7, [sp, #S_X6] - ldp x8, x9, [sp, #S_X8] - ldp x10, x11, [sp, #S_X10] - ldp x12, x13, [sp, #S_X12] - ldp x14, x15, [sp, #S_X14] - ldp x16, x17, [sp, #S_X16] - ldp x18, x19, [sp, #S_X18] - ldp x20, x21, [sp, #S_X20] - ldp x22, x23, [sp, #S_X22] - ldp x24, x25, [sp, #S_X24] - ldp x26, x27, [sp, #S_X26] - ldp x28, x29, [sp, #S_X28] - .endm - -ENTRY(kretprobe_trampoline) - sub sp, sp, #S_FRAME_SIZE - - save_all_base_regs - - mov x0, sp - bl trampoline_probe_handler - /* - * Replace trampoline address in lr with actual orig_ret_addr return - * address. - */ - mov lr, x0 - - restore_all_base_regs - - add sp, sp, #S_FRAME_SIZE - ret - -ENDPROC(kretprobe_trampoline) + brk #KRETPROBES_BRK_IMM + ASM_BUG() +SYM_CODE_END(__kretprobe_trampoline) diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index be05868418ee..89fbeb32107e 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -1,16 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/probes/simulate-insn.c * * Copyright (C) 2013 Linaro Limited. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #include <linux/bitops.h> @@ -18,8 +10,10 @@ #include <linux/kprobes.h> #include <asm/ptrace.h> +#include <asm/traps.h> #include "simulate-insn.h" +#include "asm/gcs.h" #define bbl_displacement(insn) \ sign_extend32(((insn) & 0x3ffffff) << 2, 27) @@ -56,6 +50,21 @@ static inline u32 get_w_reg(struct pt_regs *regs, int reg) return lower_32_bits(pt_regs_read_reg(regs, reg)); } +static inline int update_lr(struct pt_regs *regs, long addr) +{ + int err = 0; + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + push_user_gcs(addr, &err); + if (err) { + force_sig(SIGSEGV); + return err; + } + } + procedure_link_pointer_set(regs, addr); + return err; +} + static bool __kprobes check_cbz(u32 opcode, struct pt_regs *regs) { int xn = opcode & 0x1f; @@ -114,9 +123,9 @@ simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs) { int disp = bbl_displacement(opcode); - /* Link register is x30 */ if (opcode & (1 << 31)) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; instruction_pointer_set(regs, addr + disp); } @@ -133,16 +142,34 @@ simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs) } void __kprobes -simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs) +simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) { int xn = (opcode >> 5) & 0x1f; + u64 b_target = get_x_reg(regs, xn); - /* update pc first in case we're doing a "blr lr" */ - instruction_pointer_set(regs, get_x_reg(regs, xn)); - - /* Link register is x30 */ if (((opcode >> 21) & 0x3) == 1) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; + + instruction_pointer_set(regs, b_target); +} + +void __kprobes +simulate_ret(u32 opcode, long addr, struct pt_regs *regs) +{ + u64 ret_addr; + int err = 0; + int xn = (opcode >> 5) & 0x1f; + u64 r_target = get_x_reg(regs, xn); + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + ret_addr = pop_user_gcs(&err); + if (err || ret_addr != r_target) { + force_sig(SIGSEGV); + return; + } + } + instruction_pointer_set(regs, r_target); } void __kprobes @@ -178,17 +205,15 @@ simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs) void __kprobes simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs) { - u64 *load_addr; + unsigned long load_addr; int xn = opcode & 0x1f; - int disp; - disp = ldr_displacement(opcode); - load_addr = (u64 *) (addr + disp); + load_addr = addr + ldr_displacement(opcode); if (opcode & (1 << 30)) /* x0-x30 */ - set_x_reg(regs, xn, *load_addr); + set_x_reg(regs, xn, READ_ONCE(*(u64 *)load_addr)); else /* w0-w30 */ - set_w_reg(regs, xn, *load_addr); + set_w_reg(regs, xn, READ_ONCE(*(u32 *)load_addr)); instruction_pointer_set(regs, instruction_pointer(regs) + 4); } @@ -196,14 +221,18 @@ simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs) void __kprobes simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs) { - s32 *load_addr; + unsigned long load_addr; int xn = opcode & 0x1f; - int disp; - disp = ldr_displacement(opcode); - load_addr = (s32 *) (addr + disp); + load_addr = addr + ldr_displacement(opcode); - set_x_reg(regs, xn, *load_addr); + set_x_reg(regs, xn, READ_ONCE(*(s32 *)load_addr)); instruction_pointer_set(regs, instruction_pointer(regs) + 4); } + +void __kprobes +simulate_nop(u32 opcode, long addr, struct pt_regs *regs) +{ + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); +} diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index 050bde683c2d..9e772a292d56 100644 --- 
a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -1,16 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * arch/arm64/kernel/probes/simulate-insn.h * * Copyright (C) 2013 Linaro Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef _ARM_KERNEL_KPROBES_SIMULATE_INSN_H @@ -19,10 +11,12 @@ void simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs); -void simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs); +void simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs); +void simulate_ret(u32 opcode, long addr, struct pt_regs *regs); void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs); +void simulate_nop(u32 opcode, long addr, struct pt_regs *regs); #endif /* _ARM_KERNEL_KPROBES_SIMULATE_INSN_H */ diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 636ca0119c0e..941668800aea 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -1,14 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <asm/cacheflush.h> +#include <asm/gcs.h> #include "decode-insn.h" @@ -20,12 +18,20 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *xol_page_kaddr = kmap_atomic(page); void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + /* + * Initial cache maintenance of the xol page done via set_pte_at(). + * Subsequent CMOs only needed if the xol slot changes. 
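
/*
 * Illustrative sketch (editorial, not part of this patch): the optimisation in
 * arch_uprobe_copy_ixol() just below - if the XOL slot already holds the same
 * bytes, both the copy and the comparatively expensive cache maintenance can
 * be skipped. demo_sync_icache() is a stand-in for sync_icache_aliases().
 */
#include <stdio.h>
#include <string.h>

static void demo_sync_icache(void *addr, size_t len)
{
	printf("cache maintenance on %zu bytes at %p\n", len, addr);	/* placeholder */
}

static void demo_copy_ixol(void *dst, const void *src, size_t len)
{
	if (!memcmp(dst, src, len))
		return;				/* slot unchanged: nothing to do */

	memcpy(dst, src, len);
	demo_sync_icache(dst, len);
}

int main(void)
{
	unsigned char slot[8] = { 0 };
	const unsigned char insn[8] = { 0x21, 0x04, 0x00, 0x91, 0, 0, 0, 0 };

	demo_copy_ixol(slot, insn, sizeof(insn));	/* copies and maintains */
	demo_copy_ixol(slot, insn, sizeof(insn));	/* second call is a no-op */
	return 0;
}
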
+ */ + if (!memcmp(dst, src, len)) + goto done; + /* Initialize the slot */ memcpy(dst, src, len); /* flush caches (dcache/icache) */ - sync_icache_aliases(dst, len); + sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); +done: kunmap_atomic(xol_page_kaddr); } @@ -37,15 +43,15 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - probe_opcode_t insn; + u32 insn; /* TODO: Currently we do not support AARCH32 instruction probing */ if (mm->context.flags & MMCF_AARCH32) - return -ENOTSUPP; + return -EOPNOTSUPP; else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; - insn = *(probe_opcode_t *)(&auprobe->insn[0]); + insn = le32_to_cpu(auprobe->insn); switch (arm_probe_decode_insn(insn, &auprobe->api)) { case INSN_REJECTED: @@ -105,13 +111,13 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - probe_opcode_t insn; + u32 insn; unsigned long addr; if (!auprobe->simulate) return false; - insn = *(probe_opcode_t *)(&auprobe->insn[0]); + insn = le32_to_cpu(auprobe->insn); addr = instruction_pointer(regs); if (auprobe->api.handler) @@ -125,7 +131,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; /* - * Task has received a fatal signal, so reset back to probbed + * Task has received a fatal signal, so reset back to probed * address. */ instruction_pointer_set(regs, utask->vaddr); @@ -154,11 +160,43 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { unsigned long orig_ret_vaddr; + unsigned long gcs_ret_vaddr; + int err = 0; + u64 gcspr; orig_ret_vaddr = procedure_link_pointer(regs); + + if (task_gcs_el0_enabled(current)) { + gcspr = read_sysreg_s(SYS_GCSPR_EL0); + gcs_ret_vaddr = get_user_gcs((__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + + /* + * If the LR and GCS return addr don't match, then some kind of PAC + * signing or control flow occurred since entering the probed function. + * Likely because the user is attempting to retprobe on an instruction + * that isn't a function boundary or inside a leaf function. Explicitly + * abort this retprobe because it will generate a GCS exception. 
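
/*
 * Illustrative sketch (editorial, not part of this patch): the consistency
 * check described above, modelled with an in-memory "shadow stack". If the
 * entry at the top does not match the LR, the retprobe is abandoned;
 * otherwise that entry is rewritten to the trampoline so the GCS-checked
 * return still succeeds. All addresses here are invented.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t demo_hijack(uint64_t *gcs_top, uint64_t *lr, uint64_t trampoline)
{
	uint64_t orig = *lr;

	if (*gcs_top != orig)
		return (uint64_t)-1;	/* mismatch: abort the retprobe */

	*gcs_top = trampoline;		/* keep shadow stack and LR in sync */
	*lr = trampoline;
	return orig;
}

int main(void)
{
	uint64_t shadow[4] = { 0, 0, 0, 0x400123ULL };	/* top-of-stack entry */
	uint64_t lr = 0x400123ULL;
	uint64_t ret;

	ret = demo_hijack(&shadow[3], &lr, 0x7fff0000ULL);
	printf("orig=0x%llx lr=0x%llx shadow=0x%llx\n",
	       (unsigned long long)ret, (unsigned long long)lr,
	       (unsigned long long)shadow[3]);
	return 0;
}
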
+ */ + if (gcs_ret_vaddr != orig_ret_vaddr) { + orig_ret_vaddr = -1; + goto out; + } + + put_user_gcs(trampoline_vaddr, (__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + } + /* Replace the return addr with trampoline addr */ procedure_link_pointer_set(regs, trampoline_vaddr); +out: return orig_ret_vaddr; } @@ -168,49 +206,24 @@ int arch_uprobe_exception_notify(struct notifier_block *self, return NOTIFY_DONE; } -static int uprobe_breakpoint_handler(struct pt_regs *regs, - unsigned int esr) +int uprobe_brk_handler(struct pt_regs *regs, + unsigned long esr) { - if (user_mode(regs) && uprobe_pre_sstep_notifier(regs)) + if (uprobe_pre_sstep_notifier(regs)) return DBG_HOOK_HANDLED; return DBG_HOOK_ERROR; } -static int uprobe_single_step_handler(struct pt_regs *regs, - unsigned int esr) +int uprobe_single_step_handler(struct pt_regs *regs, + unsigned long esr) { struct uprobe_task *utask = current->utask; - if (user_mode(regs)) { - WARN_ON(utask && - (instruction_pointer(regs) != utask->xol_vaddr + 4)); - - if (uprobe_post_sstep_notifier(regs)) - return DBG_HOOK_HANDLED; - } + WARN_ON(utask && (instruction_pointer(regs) != utask->xol_vaddr + 4)); + if (uprobe_post_sstep_notifier(regs)) + return DBG_HOOK_HANDLED; return DBG_HOOK_ERROR; } -/* uprobe breakpoint handler hook */ -static struct break_hook uprobes_break_hook = { - .esr_mask = BRK64_ESR_MASK, - .esr_val = BRK64_ESR_UPROBES, - .fn = uprobe_breakpoint_handler, -}; - -/* uprobe single step handler hook */ -static struct step_hook uprobes_step_hook = { - .fn = uprobe_single_step_handler, -}; - -static int __init arch_init_uprobes(void) -{ - register_break_hook(&uprobes_break_hook); - register_step_hook(&uprobes_step_hook); - - return 0; -} - -device_initcall(arch_init_uprobes); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index a0f985a6ac50..fba7ca102a8c 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -1,35 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/process.c * * Original Copyright (C) 1995 Linus Torvalds * Copyright (C) 1996-2000 Russell King - Converted to ARM. * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ - -#include <stdarg.h> - #include <linux/compat.h> #include <linux/efi.h> +#include <linux/elf.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/kernel.h> +#include <linux/mman.h> #include <linux/mm.h> +#include <linux/nospec.h> #include <linux/stddef.h> +#include <linux/sysctl.h> #include <linux/unistd.h> #include <linux/user.h> #include <linux/delay.h> @@ -49,20 +39,28 @@ #include <trace/events/power.h> #include <linux/percpu.h> #include <linux/thread_info.h> +#include <linux/prctl.h> +#include <linux/stacktrace.h> #include <asm/alternative.h> +#include <asm/arch_timer.h> #include <asm/compat.h> +#include <asm/cpufeature.h> #include <asm/cacheflush.h> #include <asm/exec.h> #include <asm/fpsimd.h> +#include <asm/gcs.h> #include <asm/mmu_context.h> +#include <asm/mte.h> #include <asm/processor.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> +#include <asm/switch_to.h> +#include <asm/system_misc.h> #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include <linux/stackprotector.h> -unsigned long __stack_chk_guard __read_mostly; +unsigned long __stack_chk_guard __ro_after_init; EXPORT_SYMBOL(__stack_chk_guard); #endif @@ -72,25 +70,8 @@ EXPORT_SYMBOL(__stack_chk_guard); void (*pm_power_off)(void); EXPORT_SYMBOL_GPL(pm_power_off); -void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); - -/* - * This is our default idle handler. - */ -void arch_cpu_idle(void) -{ - /* - * This should do all the clock switching and wait for interrupt - * tricks - */ - trace_cpu_idle_rcuidle(1, smp_processor_id()); - cpu_do_idle(); - local_irq_enable(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); -} - #ifdef CONFIG_HOTPLUG_CPU -void arch_cpu_idle_dead(void) +void __noreturn arch_cpu_idle_dead(void) { cpu_die(); } @@ -103,11 +84,11 @@ void arch_cpu_idle_dead(void) * to execute e.g. a RAM-based pin loop is not sufficient. This allows the * kexec'd kernel to use any and all RAM as it sees fit, without having to * avoid any code or data used by any SW CPU pin loop. The CPU hotplug - * functionality embodied in disable_nonboot_cpus() to achieve this. + * functionality embodied in smpt_shutdown_nonboot_cpus() to achieve this. */ void machine_shutdown(void) { - disable_nonboot_cpus(); + smp_shutdown_nonboot_cpus(reboot_cpu); } /* @@ -132,8 +113,7 @@ void machine_power_off(void) { local_irq_disable(); smp_send_stop(); - if (pm_power_off) - pm_power_off(); + do_kernel_power_off(); } /* @@ -159,10 +139,7 @@ void machine_restart(char *cmd) efi_reboot(reboot_mode, NULL); /* Now call the architecture specific reboot code. */ - if (arm_pm_restart) - arm_pm_restart(reboot_mode, cmd); - else - do_kernel_restart(cmd); + do_kernel_restart(cmd); /* * Whoops - the architecture was unable to reboot. @@ -171,12 +148,21 @@ void machine_restart(char *cmd) while (1); } +#define bstr(suffix, str) [PSR_BTYPE_ ## suffix >> PSR_BTYPE_SHIFT] = str +static const char *const btypes[] = { + bstr(NONE, "--"), + bstr( JC, "jc"), + bstr( C, "-c"), + bstr( J , "j-") +}; +#undef bstr + static void print_pstate(struct pt_regs *regs) { u64 pstate = regs->pstate; if (compat_user_mode(regs)) { - printk("pstate: %08llx (%c%c%c%c %c %s %s %c%c%c)\n", + printk("pstate: %08llx (%c%c%c%c %c %s %s %c%c%c %cDIT %cSSBS)\n", pstate, pstate & PSR_AA32_N_BIT ? 'N' : 'n', pstate & PSR_AA32_Z_BIT ? 
'Z' : 'z', @@ -187,9 +173,14 @@ static void print_pstate(struct pt_regs *regs) pstate & PSR_AA32_E_BIT ? "BE" : "LE", pstate & PSR_AA32_A_BIT ? 'A' : 'a', pstate & PSR_AA32_I_BIT ? 'I' : 'i', - pstate & PSR_AA32_F_BIT ? 'F' : 'f'); + pstate & PSR_AA32_F_BIT ? 'F' : 'f', + pstate & PSR_AA32_DIT_BIT ? '+' : '-', + pstate & PSR_AA32_SSBS_BIT ? '+' : '-'); } else { - printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO)\n", + const char *btype_str = btypes[(pstate & PSR_BTYPE_MASK) >> + PSR_BTYPE_SHIFT]; + + printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO %cTCO %cDIT %cSSBS BTYPE=%s)\n", pstate, pstate & PSR_N_BIT ? 'N' : 'n', pstate & PSR_Z_BIT ? 'Z' : 'z', @@ -200,7 +191,11 @@ static void print_pstate(struct pt_regs *regs) pstate & PSR_I_BIT ? 'I' : 'i', pstate & PSR_F_BIT ? 'F' : 'f', pstate & PSR_PAN_BIT ? '+' : '-', - pstate & PSR_UAO_BIT ? '+' : '-'); + pstate & PSR_UAO_BIT ? '+' : '-', + pstate & PSR_TCO_BIT ? '+' : '-', + pstate & PSR_DIT_BIT ? '+' : '-', + pstate & PSR_SSBS_BIT ? '+' : '-', + btype_str); } } @@ -224,7 +219,7 @@ void __show_regs(struct pt_regs *regs) if (!user_mode(regs)) { printk("pc : %pS\n", (void *)regs->pc); - printk("lr : %pS\n", (void *)lr); + printk("lr : %pS\n", (void *)ptrauth_strip_kernel_insn_pac(lr)); } else { printk("pc : %016llx\n", regs->pc); printk("lr : %016llx\n", lr); @@ -232,30 +227,32 @@ void __show_regs(struct pt_regs *regs) printk("sp : %016llx\n", sp); + if (system_uses_irq_prio_masking()) + printk("pmr: %08x\n", regs->pmr); + i = top_reg; while (i >= 0) { - printk("x%-2d: %016llx ", i, regs->regs[i]); - i--; + printk("x%-2d: %016llx", i, regs->regs[i]); - if (i % 2 == 0) { - pr_cont("x%-2d: %016llx ", i, regs->regs[i]); - i--; - } + while (i-- % 3) + pr_cont(" x%-2d: %016llx", i, regs->regs[i]); pr_cont("\n"); } } -void show_regs(struct pt_regs * regs) +void show_regs(struct pt_regs *regs) { __show_regs(regs); - dump_backtrace(regs, NULL); + dump_backtrace(regs, NULL, KERN_DEFAULT); } static void tls_thread_flush(void) { write_sysreg(0, tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(0, SYS_TPIDR2_EL0); if (is_compat_task()) { current->thread.uw.tp_value = 0; @@ -270,15 +267,75 @@ static void tls_thread_flush(void) } } +static void flush_tagged_addr_state(void) +{ + if (IS_ENABLED(CONFIG_ARM64_TAGGED_ADDR_ABI)) + clear_thread_flag(TIF_TAGGED_ADDR); +} + +static void flush_poe(void) +{ + if (!system_supports_poe()) + return; + + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + +#ifdef CONFIG_ARM64_GCS + +static void flush_gcs(void) +{ + if (!system_supports_gcs()) + return; + + current->thread.gcspr_el0 = 0; + current->thread.gcs_base = 0; + current->thread.gcs_size = 0; + current->thread.gcs_el0_mode = 0; + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); + write_sysreg_s(0, SYS_GCSPR_EL0); +} + +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + unsigned long gcs; + + if (!system_supports_gcs()) + return 0; + + p->thread.gcs_base = 0; + p->thread.gcs_size = 0; + + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; + + gcs = gcs_alloc_thread_stack(p, args); + if (IS_ERR_VALUE(gcs)) + return PTR_ERR((void *)gcs); + + return 0; +} + +#else + +static void flush_gcs(void) { } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + return 0; +} + +#endif + void flush_thread(void) { fpsimd_flush_thread(); tls_thread_flush(); flush_ptrace_hw_breakpoint(current); -} - -void 
release_thread(struct task_struct *dead_task) -{ + flush_tagged_addr_state(); + flush_poe(); + flush_gcs(); } void arch_release_task_struct(struct task_struct *tsk) @@ -286,42 +343,81 @@ void arch_release_task_struct(struct task_struct *tsk) fpsimd_release_task(tsk); } -/* - * src and dst may temporarily have aliased sve_state after task_struct - * is copied. We cannot fix this properly here, because src may have - * live SVE state and dst's thread_info may not exist yet, so tweaking - * either src's or dst's TIF_SVE is not safe. - * - * The unaliasing is done in copy_thread() instead. This works because - * dst is not schedulable or traceable until both of these functions - * have been called. - */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - if (current->mm) - fpsimd_preserve_current_state(); + /* + * The current/src task's FPSIMD state may or may not be live, and may + * have been altered by ptrace after entry to the kernel. Save the + * effective FPSIMD state so that this will be copied into dst. + */ + fpsimd_save_and_flush_current_state(); + fpsimd_sync_from_effective_state(src); + *dst = *src; + /* + * Drop stale reference to src's sve_state and convert dst to + * non-streaming FPSIMD mode. + */ + dst->thread.fp_type = FP_STATE_FPSIMD; + dst->thread.sve_state = NULL; + clear_tsk_thread_flag(dst, TIF_SVE); + task_smstop_sm(dst); + + /* + * Drop stale reference to src's sme_state and ensure dst has ZA + * disabled. + * + * When necessary, ZA will be inherited later in copy_thread_za(). + */ + dst->thread.sme_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr &= ~SVCR_ZA_MASK; + + /* clear any pending asynchronous tag fault raised by the parent */ + clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); + + return 0; +} + +static int copy_thread_za(struct task_struct *dst, struct task_struct *src) +{ + if (!thread_za_enabled(&src->thread)) + return 0; + + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + + dst->thread.sme_state = kmemdup(src->thread.sme_state, + sme_state_size(src), + GFP_KERNEL); + if (!dst->thread.sme_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + + set_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr |= SVCR_ZA_MASK; + return 0; } asmlinkage void ret_from_fork(void) asm("ret_from_fork"); -int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + u64 clone_flags = args->flags; + unsigned long stack_start = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + int ret; memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); /* - * Unalias p->thread.sve_state (if any) from the parent task - * and disable discard SVE state for p: - */ - clear_tsk_thread_flag(p, TIF_SVE); - p->thread.sve_state = NULL; - - /* * In case p was allocated the same task_struct pointer as some * other recently-exited task, make sure p is disassociated from * any cpu that may have run that now-exited task recently. 
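The copy_thread_za() helper above allocates the child's SVE and SME backing storage as a pair and frees the first buffer again if the second allocation fails, so the child never ends up with half-initialised ZA state. A minimal user-space sketch of that unwind-on-partial-failure shape, with calloc()/malloc() standing in for kzalloc()/kmemdup() and an invented za_copy struct holding the two pointers:

#include <stdlib.h>
#include <string.h>

struct za_copy {			/* stand-in for the relevant thread fields */
	void *sve_state;
	void *sme_state;
};

/* Mirrors the copy_thread_za() flow: either both buffers exist or neither. */
static int copy_za(struct za_copy *dst, const struct za_copy *src,
		   size_t sve_size, size_t sme_size)
{
	dst->sve_state = calloc(1, sve_size);		/* kzalloc() analogue */
	if (!dst->sve_state)
		return -1;

	dst->sme_state = malloc(sme_size);		/* kmemdup() analogue */
	if (!dst->sme_state) {
		free(dst->sve_state);
		dst->sve_state = NULL;			/* drop the half-built state */
		return -1;
	}
	memcpy(dst->sme_state, src->sme_state, sme_size);

	return 0;
}

int main(void)
{
	char parent_sme[64] = { 0 };
	struct za_copy parent = { .sve_state = NULL, .sme_state = parent_sme };
	struct za_copy child = { NULL, NULL };
	int ret = copy_za(&child, &parent, 128, sizeof(parent_sme));

	free(child.sve_state);
	free(child.sme_state);
	return ret ? 1 : 0;
}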
@@ -330,7 +426,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, */ fpsimd_flush_task_state(p); - if (likely(!(p->flags & PF_KTHREAD))) { + ptrauth_thread_init_kernel(p); + + if (likely(!args->fn)) { *childregs = *current_pt_regs(); childregs->regs[0] = 0; @@ -340,6 +438,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, */ *task_user_tls(p) = read_sysreg(tpidr_el0); + if (system_supports_poe()) + p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (stack_start) { if (is_compat_thread(task_thread_info(p))) childregs->compat_sp = stack_start; @@ -348,26 +449,68 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } /* - * If a TLS pointer was passed to clone (4th argument), use it - * for the new thread. + * Due to the AAPCS64 "ZA lazy saving scheme", PSTATE.ZA and + * TPIDR2 need to be manipulated as a pair, and either both + * need to be inherited or both need to be reset. + * + * Within a process, child threads must not inherit their + * parent's TPIDR2 value or they may clobber their parent's + * stack at some later point. + * + * When a process is fork()'d, the child must inherit ZA and + * TPIDR2 from its parent in case there was dormant ZA state. + * + * Use CLONE_VM to determine when the child will share the + * address space with the parent, and cannot safely inherit the + * state. + */ + if (system_supports_sme()) { + if (!(clone_flags & CLONE_VM)) { + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + ret = copy_thread_za(p, current); + if (ret) + return ret; + } else { + p->thread.tpidr2_el0 = 0; + WARN_ON_ONCE(p->thread.svcr & SVCR_ZA_MASK); + } + } + + /* + * If a TLS pointer was passed to clone, use it for the new + * thread. */ if (clone_flags & CLONE_SETTLS) - p->thread.uw.tp_value = childregs->regs[3]; + p->thread.uw.tp_value = tls; + + ret = copy_thread_gcs(p, args); + if (ret != 0) + return ret; } else { + /* + * A kthread has no context to ERET to, so ensure any buggy + * ERET is treated as an illegal exception return. + * + * When a user task is created from a kthread, childregs will + * be initialized by start_thread() or start_compat_thread(). + */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->pstate = PSR_MODE_EL1h; - if (IS_ENABLED(CONFIG_ARM64_UAO) && - cpus_have_const_cap(ARM64_HAS_UAO)) - childregs->pstate |= PSR_UAO_BIT; + childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; + childregs->stackframe.type = FRAME_META_TYPE_FINAL; - if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) - childregs->pstate |= PSR_SSBS_BIT; + p->thread.cpu_context.x19 = (unsigned long)args->fn; + p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; - p->thread.cpu_context.x19 = stack_start; - p->thread.cpu_context.x20 = stk_sz; + if (system_supports_poe()) + p->thread.por_el0 = POR_EL0_INIT; } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; + /* + * For the benefit of the unwinder, set up childregs->stackframe + * as the final frame for the new task. 
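The long ZA/TPIDR2 comment above boils down to one test: the child inherits PSTATE.ZA and TPIDR2_EL0 only when it does not share the address space with its parent. A tiny standalone sketch of that rule; the helper name is made up and only the CLONE_VM/CLONE_THREAD flags from <sched.h> are assumed:

#define _GNU_SOURCE
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical helper mirroring the decision in copy_thread() above. */
static bool child_inherits_za_and_tpidr2(unsigned long clone_flags)
{
	/* Shared-VM children (threads) must start with ZA off and TPIDR2 == 0. */
	return !(clone_flags & CLONE_VM);
}

int main(void)
{
	/* fork(): no CLONE_VM, so dormant ZA state and TPIDR2 are inherited. */
	printf("fork-like child inherits ZA: %d\n",
	       child_inherits_za_and_tpidr2(0));

	/* pthread_create() passes CLONE_VM (among others): nothing is inherited. */
	printf("thread child inherits ZA:   %d\n",
	       child_inherits_za_and_tpidr2(CLONE_VM | CLONE_THREAD | CLONE_SIGHAND));

	return 0;
}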
+ */ + p->thread.cpu_context.fp = (unsigned long)&childregs->stackframe; ptrace_hw_copy_thread(p); @@ -377,6 +520,8 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, void tls_preserve_current_state(void) { *task_user_tls(current) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2() && !is_compat_task()) + current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); } static void tls_thread_switch(struct task_struct *next) @@ -385,21 +530,35 @@ static void tls_thread_switch(struct task_struct *next) if (is_compat_thread(task_thread_info(next))) write_sysreg(next->thread.uw.tp_value, tpidrro_el0); - else if (!arm64_kernel_unmapped_at_el0()) + else write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(next->thread.tpidr2_el0, SYS_TPIDR2_EL0); } -/* Restore the UAO state depending on next's addr_limit */ -void uao_thread_switch(struct task_struct *next) +/* + * Force SSBS state on context-switch, since it may be lost after migrating + * from a CPU which treats the bit as RES0 in a heterogeneous system. + */ +static void ssbs_thread_switch(struct task_struct *next) { - if (IS_ENABLED(CONFIG_ARM64_UAO)) { - if (task_thread_info(next)->addr_limit == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO)); - } + /* + * Nothing to do for kernel threads, but 'regs' may be junk + * (e.g. idle task) so check the flags and bail early. + */ + if (unlikely(next->flags & PF_KTHREAD)) + return; + + /* + * If all CPUs implement the SSBS extension, then we just need to + * context-switch the PSTATE field. + */ + if (alternative_has_cap_unlikely(ARM64_SSBS)) + return; + + spectre_v4_enable_task_mitigation(next); } /* @@ -416,10 +575,134 @@ static void entry_task_switch(struct task_struct *next) __this_cpu_write(__entry_task, next); } +#ifdef CONFIG_ARM64_GCS + +void gcs_preserve_current_state(void) +{ + current->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); +} + +static void gcs_thread_switch(struct task_struct *next) +{ + if (!system_supports_gcs()) + return; + + /* GCSPR_EL0 is always readable */ + gcs_preserve_current_state(); + write_sysreg_s(next->thread.gcspr_el0, SYS_GCSPR_EL0); + + if (current->thread.gcs_el0_mode != next->thread.gcs_el0_mode) + gcs_set_el0_mode(next); + + /* + * Ensure that GCS memory effects of the 'prev' thread are + * ordered before other memory accesses with release semantics + * (or preceded by a DMB) on the current PE. In addition, any + * memory accesses with acquire semantics (or succeeded by a + * DMB) are ordered before GCS memory effects of the 'next' + * thread. This will ensure that the GCS memory effects are + * visible to other PEs in case of migration. + */ + if (task_gcs_el0_enabled(current) || task_gcs_el0_enabled(next)) + gcsb_dsync(); +} + +#else + +static void gcs_thread_switch(struct task_struct *next) +{ +} + +#endif + +/* + * Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of + * CNTVCT, various other errata which require trapping all CNTVCT{,_EL0} + * accesses and prctl(PR_SET_TSC). Ensure access is disabled iff a workaround is + * required or PR_TSC_SIGSEGV is set. 
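The update_cntkctl_el1()/do_set_tsc_mode() pair added in this hunk is what wires prctl(PR_SET_TSC, ...) up on arm64: with PR_TSC_SIGSEGV in force, EL0 reads of CNTVCT_EL0 trap and the task receives SIGSEGV. A small user-space demonstration, assuming an arm64 toolchain and a kernel containing these hunks; the counter is read with a raw MRS, so the example switches back to PR_TSC_ENABLE before reading it again:

#include <linux/prctl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/prctl.h>

static uint64_t read_cntvct(void)
{
	uint64_t val;

	/* Generic timer read; this is what faults once PR_TSC_SIGSEGV is set. */
	asm volatile("mrs %0, cntvct_el0" : "=r" (val));
	return val;
}

int main(void)
{
	int mode = 0;

	printf("counter: %llu\n", (unsigned long long)read_cntvct());

	/* Ask the kernel to make further counter reads raise SIGSEGV. */
	if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0))
		perror("PR_SET_TSC");

	prctl(PR_GET_TSC, &mode, 0, 0, 0);
	printf("mode: %s\n", mode == PR_TSC_SIGSEGV ? "PR_TSC_SIGSEGV" : "PR_TSC_ENABLE");

	/* Re-enable before touching the counter again. */
	prctl(PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0);
	printf("counter: %llu\n", (unsigned long long)read_cntvct());

	return 0;
}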
+ */ +static void update_cntkctl_el1(struct task_struct *next) +{ + struct thread_info *ti = task_thread_info(next); + + if (test_ti_thread_flag(ti, TIF_TSC_SIGSEGV) || + has_erratum_handler(read_cntvct_el0) || + (IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) && + this_cpu_has_cap(ARM64_WORKAROUND_1418040) && + is_compat_thread(ti))) + sysreg_clear_set(cntkctl_el1, ARCH_TIMER_USR_VCT_ACCESS_EN, 0); + else + sysreg_clear_set(cntkctl_el1, 0, ARCH_TIMER_USR_VCT_ACCESS_EN); +} + +static void cntkctl_thread_switch(struct task_struct *prev, + struct task_struct *next) +{ + if ((read_ti_thread_flags(task_thread_info(prev)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV)) != + (read_ti_thread_flags(task_thread_info(next)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV))) + update_cntkctl_el1(next); +} + +static int do_set_tsc_mode(unsigned int val) +{ + bool tsc_sigsegv; + + if (val == PR_TSC_SIGSEGV) + tsc_sigsegv = true; + else if (val == PR_TSC_ENABLE) + tsc_sigsegv = false; + else + return -EINVAL; + + preempt_disable(); + update_thread_flag(TIF_TSC_SIGSEGV, tsc_sigsegv); + update_cntkctl_el1(current); + preempt_enable(); + + return 0; +} + +static void permission_overlay_switch(struct task_struct *next) +{ + if (!system_supports_poe()) + return; + + current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (current->thread.por_el0 != next->thread.por_el0) { + write_sysreg_s(next->thread.por_el0, SYS_POR_EL0); + /* + * No ISB required as we can tolerate spurious Overlay faults - + * the fault handler will check again based on the new value + * of POR_EL0. + */ + } +} + +/* + * __switch_to() checks current->thread.sctlr_user as an optimisation. Therefore + * this function must be called with preemption disabled and the update to + * sctlr_user must be made in the same preemption disabled block so that + * __switch_to() does not see the variable update before the SCTLR_EL1 one. + */ +void update_sctlr_el1(u64 sctlr) +{ + /* + * EnIA must not be cleared while in the kernel as this is necessary for + * in-kernel PAC. It will be cleared on kernel exit if needed. + */ + sysreg_clear_set(sctlr_el1, SCTLR_USER_MASK & ~SCTLR_ELx_ENIA, sctlr); + + /* ISB required for the kernel uaccess routines when setting TCF0. */ + isb(); +} + /* * Thread switching. */ -__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, +__notrace_funcgraph __sched +struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next) { struct task_struct *last; @@ -429,75 +712,260 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); entry_task_switch(next); - uao_thread_switch(next); - ptrauth_thread_switch(next); + ssbs_thread_switch(next); + cntkctl_thread_switch(prev, next); + ptrauth_thread_switch_user(next); + permission_overlay_switch(next); + gcs_thread_switch(next); /* - * Complete any pending TLB or cache maintenance on this CPU in case - * the thread migrates to a different CPU. - * This full barrier is also required by the membarrier system - * call. + * Complete any pending TLB or cache maintenance on this CPU in case the + * thread migrates to a different CPU. This full barrier is also + * required by the membarrier system call. Additionally it makes any + * in-progress pgtable writes visible to the table walker; See + * emit_pte_barriers(). 
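permission_overlay_switch() above context-switches POR_EL0, the register that backs the generic memory-protection-key syscalls when FEAT_S1POE is present. A hedged user-space sketch using the raw pkey syscalls via <sys/syscall.h>; on CPUs or kernels without POE/pkey support, pkey_alloc() simply fails and the program exits early:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long pkey;
	char *p;

	/* Allocate a protection key; with FEAT_S1POE this maps to a POR_EL0 field. */
	pkey = syscall(SYS_pkey_alloc, 0UL, 0UL);
	if (pkey < 0) {
		perror("pkey_alloc (no pkey/POE support?)");
		return 1;
	}

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Associate the mapping with the key; access rights then follow POR_EL0. */
	if (syscall(SYS_pkey_mprotect, p, 4096UL,
		    (unsigned long)(PROT_READ | PROT_WRITE), pkey))
		perror("pkey_mprotect");

	strcpy(p, "hello");
	printf("wrote \"%s\" through a pkey-%ld mapping\n", p, pkey);

	syscall(SYS_pkey_free, pkey);
	return 0;
}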
*/ dsb(ish); + /* + * MTE thread switching must happen after the DSB above to ensure that + * any asynchronous tag check faults have been logged in the TFSR*_EL1 + * registers. + */ + mte_thread_switch(next); + /* avoid expensive SCTLR_EL1 accesses if no change */ + if (prev->thread.sctlr_user != next->thread.sctlr_user) + update_sctlr_el1(next->thread.sctlr_user); + /* the actual thread switch */ last = cpu_switch_to(prev, next); return last; } -unsigned long get_wchan(struct task_struct *p) +struct wchan_info { + unsigned long pc; + int count; +}; + +static bool get_wchan_cb(void *arg, unsigned long pc) { - struct stackframe frame; - unsigned long stack_page, ret = 0; - int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; + struct wchan_info *wchan_info = arg; + + if (!in_sched_functions(pc)) { + wchan_info->pc = pc; + return false; + } + return wchan_info->count++ < 16; +} - stack_page = (unsigned long)try_get_task_stack(p); - if (!stack_page) +unsigned long __get_wchan(struct task_struct *p) +{ + struct wchan_info wchan_info = { + .pc = 0, + .count = 0, + }; + + if (!try_get_task_stack(p)) return 0; - frame.fp = thread_saved_fp(p); - frame.pc = thread_saved_pc(p); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = 0; -#endif - do { - if (unwind_frame(p, &frame)) - goto out; - if (!in_sched_functions(frame.pc)) { - ret = frame.pc; - goto out; - } - } while (count ++ < 16); + arch_stack_walk(get_wchan_cb, &wchan_info, p, NULL); -out: put_task_stack(p); - return ret; + + return wchan_info.pc; } unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= get_random_u32_below(PAGE_SIZE); return sp & ~0xf; } -unsigned long arch_randomize_brk(struct mm_struct *mm) +#ifdef CONFIG_COMPAT +int compat_elf_check_arch(const struct elf32_hdr *hdr) { - if (is_compat_task()) - return randomize_page(mm->brk, SZ_32M); - else - return randomize_page(mm->brk, SZ_1G); + if (!system_supports_32bit_el0()) + return false; + + if ((hdr)->e_machine != EM_ARM) + return false; + + if (!((hdr)->e_flags & EF_ARM_EABI_MASK)) + return false; + + /* + * Prevent execve() of a 32-bit program from a deadline task + * if the restricted affinity mask would be inadmissible on an + * asymmetric system. + */ + return !static_branch_unlikely(&arm64_mismatched_32bit_el0) || + !dl_task_check_affinity(current, system_32bit_el0_cpumask()); } +#endif /* * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. */ void arch_setup_new_exec(void) { - current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0; + unsigned long mmflags = 0; + + if (is_compat_task()) { + mmflags = MMCF_AARCH32; + + /* + * Restrict the CPU affinity mask for a 32-bit task so that + * it contains only 32-bit-capable CPUs. + * + * From the perspective of the task, this looks similar to + * what would happen if the 64-bit-only CPUs were hot-unplugged + * at the point of execve(), although we try a bit harder to + * honour the cpuset hierarchy. 
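The __get_wchan() rework above (arch_stack_walk() plus a small termination callback) is what feeds /proc/<pid>/wchan. A quick reader for that file; a running task, including the reader itself, reports "0", while a blocked task reports the first non-scheduler function found on its stack:

#include <stdio.h>

/* Print the wait channel of a task, e.g. "./a.out 1" for init. */
int main(int argc, char **argv)
{
	char path[64], wchan[128];
	FILE *f;
	size_t n;

	snprintf(path, sizeof(path), "/proc/%s/wchan", argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}

	n = fread(wchan, 1, sizeof(wchan) - 1, f);
	wchan[n] = '\0';
	fclose(f);

	/* "self" is running while it reads this, so it always shows "0". */
	printf("%s: %s\n", path, wchan);
	return 0;
}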
+ */ + if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) + force_compatible_cpus_allowed_ptr(current); + } else if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) { + relax_compatible_cpus_allowed_ptr(current); + } + + current->mm->context.flags = mmflags; + ptrauth_thread_init_user(); + mte_thread_init_user(); + do_set_tsc_mode(PR_TSC_ENABLE); + + if (task_spec_ssb_noexec(current)) { + arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS, + PR_SPEC_ENABLE); + } +} + +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI +/* + * Control the relaxed ABI allowing tagged user addresses into the kernel. + */ +static unsigned int tagged_addr_disabled; + +long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) +{ + unsigned long valid_mask = PR_TAGGED_ADDR_ENABLE; + struct thread_info *ti = task_thread_info(task); + + if (is_compat_thread(ti)) + return -EINVAL; + + if (system_supports_mte()) { + valid_mask |= PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC \ + | PR_MTE_TAG_MASK; + + if (cpus_have_cap(ARM64_MTE_STORE_ONLY)) + valid_mask |= PR_MTE_STORE_ONLY; + } + + if (arg & ~valid_mask) + return -EINVAL; + + /* + * Do not allow the enabling of the tagged address ABI if globally + * disabled via sysctl abi.tagged_addr_disabled. + */ + if (arg & PR_TAGGED_ADDR_ENABLE && tagged_addr_disabled) + return -EINVAL; + + if (set_mte_ctrl(task, arg) != 0) + return -EINVAL; + + update_ti_thread_flag(ti, TIF_TAGGED_ADDR, arg & PR_TAGGED_ADDR_ENABLE); + + return 0; +} + +long get_tagged_addr_ctrl(struct task_struct *task) +{ + long ret = 0; + struct thread_info *ti = task_thread_info(task); + + if (is_compat_thread(ti)) + return -EINVAL; + + if (test_ti_thread_flag(ti, TIF_TAGGED_ADDR)) + ret = PR_TAGGED_ADDR_ENABLE; + + ret |= get_mte_ctrl(task); + + return ret; +} + +/* + * Global sysctl to disable the tagged user addresses support. This control + * only prevents the tagged address ABI enabling via prctl() and does not + * disable it for tasks that already opted in to the relaxed ABI. + */ + +static const struct ctl_table tagged_addr_sysctl_table[] = { + { + .procname = "tagged_addr_disabled", + .mode = 0644, + .data = &tagged_addr_disabled, + .maxlen = sizeof(int), + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init tagged_addr_init(void) +{ + if (!register_sysctl("abi", tagged_addr_sysctl_table)) + return -EINVAL; + return 0; +} + +core_initcall(tagged_addr_init); +#endif /* CONFIG_ARM64_TAGGED_ADDR_ABI */ + +#ifdef CONFIG_BINFMT_ELF +int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, + bool has_interp, bool is_interp) +{ + /* + * For dynamically linked executables the interpreter is + * responsible for setting PROT_BTI on everything except + * itself. 
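set_tagged_addr_ctrl()/get_tagged_addr_ctrl() above are the backend for the documented PR_SET_TAGGED_ADDR_CTRL prctl. A user-space sketch that opts in and then passes a top-byte-tagged pointer to a syscall; without the prctl (or while the abi.tagged_addr_disabled sysctl is set) the write() below would typically be rejected with EFAULT, although plain loads and stores through the tagged pointer always work thanks to TBI:

#define _GNU_SOURCE
#include <linux/prctl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/prctl.h>
#include <unistd.h>

int main(void)
{
	char *buf = malloc(64);
	char *tagged;

	if (!buf)
		return 1;

	/* Opt in to the relaxed ABI (refused while abi.tagged_addr_disabled=1). */
	if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0)) {
		perror("PR_SET_TAGGED_ADDR_CTRL");
		return 1;
	}

	/* Put an arbitrary tag in the top byte; loads/stores ignore it via TBI. */
	tagged = (char *)((uintptr_t)buf | (0x2aUL << 56));
	strcpy(tagged, "tagged pointer accepted by write()\n");

	/* With the prctl in force the kernel untags this address for us. */
	if (write(STDOUT_FILENO, tagged, strlen(tagged)) < 0)
		perror("write");

	free(buf);
	return 0;
}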
+ */ + if (is_interp != has_interp) + return prot; + + if (!(state->flags & ARM64_ELF_BTI)) + return prot; + + if (prot & PROT_EXEC) + prot |= PROT_BTI; + + return prot; +} +#endif + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (is_compat_task()) + return -EINVAL; + + if (test_thread_flag(TIF_TSC_SIGSEGV)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (is_compat_task()) + return -EINVAL; - ptrauth_thread_init_user(current); + return do_set_tsc_mode(val); } diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c new file mode 100644 index 000000000000..80a580e019c5 --- /dev/null +++ b/arch/arm64/kernel/proton-pack.c @@ -0,0 +1,1204 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Handle detection, reporting and mitigation of Spectre v1, v2, v3a and v4, as + * detailed at: + * + * https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability + * + * This code was originally written hastily under an awful lot of stress and so + * aspects of it are somewhat hacky. Unfortunately, changing anything in here + * instantly makes me feel ill. Thanks, Jann. Thann. + * + * Copyright (C) 2018 ARM Ltd, All Rights Reserved. + * Copyright (C) 2020 Google LLC + * + * "If there's something strange in your neighbourhood, who you gonna call?" + * + * Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org> + */ + +#include <linux/arm-smccc.h> +#include <linux/bpf.h> +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/nospec.h> +#include <linux/prctl.h> +#include <linux/sched/task_stack.h> + +#include <asm/debug-monitors.h> +#include <asm/insn.h> +#include <asm/spectre.h> +#include <asm/traps.h> +#include <asm/vectors.h> +#include <asm/virt.h> + +/* + * We try to ensure that the mitigation state can never change as the result of + * onlining a late CPU. + */ +static void update_mitigation_state(enum mitigation_state *oldp, + enum mitigation_state new) +{ + enum mitigation_state state; + + do { + state = READ_ONCE(*oldp); + if (new <= state) + break; + + /* Userspace almost certainly can't deal with this. */ + if (WARN_ON(system_capabilities_finalized())) + break; + } while (cmpxchg_relaxed(oldp, state, new) != state); +} + +/* + * Spectre v1. + * + * The kernel can't protect userspace for this one: it's each person for + * themselves. Advertise what we're doing and be done with it. + */ +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +/* + * Spectre v2. + * + * This one sucks. A CPU is either: + * + * - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2. + * - Mitigated in hardware and listed in our "safe list". + * - Mitigated in software by firmware. + * - Mitigated in software by a CPU-specific dance in the kernel and a + * firmware call at EL2. + * - Vulnerable. + * + * It's not unlikely for different CPUs in a big.LITTLE system to fall into + * different camps. 
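The cpu_show_spectre_v1() handler above, together with the other cpu_show_*() handlers in this file, is what appears under /sys/devices/system/cpu/vulnerabilities/. A small reader that dumps the three arm64-relevant entries:

#include <stdio.h>

/* Dump the Spectre reporting produced by the cpu_show_*() hooks in this file. */
int main(void)
{
	static const char *const files[] = {
		"spectre_v1", "spectre_v2", "spec_store_bypass",
	};
	char path[128], line[256];

	for (unsigned int i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/vulnerabilities/%s", files[i]);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fgets(line, sizeof(line), f))
			printf("%-17s %s", files[i], line);
		fclose(f);
	}

	return 0;
}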
+ */ +static enum mitigation_state spectre_v2_state; + +static bool __read_mostly __nospectre_v2; +static int __init parse_spectre_v2_param(char *str) +{ + __nospectre_v2 = true; + return 0; +} +early_param("nospectre_v2", parse_spectre_v2_param); + +static bool spectre_v2_mitigations_off(void) +{ + return __nospectre_v2 || cpu_mitigations_off(); +} + +static const char *get_bhb_affected_string(enum mitigation_state bhb_state) +{ + switch (bhb_state) { + case SPECTRE_UNAFFECTED: + return ""; + default: + case SPECTRE_VULNERABLE: + return ", but not BHB"; + case SPECTRE_MITIGATED: + return ", BHB"; + } +} + +static bool _unprivileged_ebpf_enabled(void) +{ +#ifdef CONFIG_BPF_SYSCALL + return !sysctl_unprivileged_bpf_disabled; +#else + return false; +#endif +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, + char *buf) +{ + enum mitigation_state bhb_state = arm64_get_spectre_bhb_state(); + const char *bhb_str = get_bhb_affected_string(bhb_state); + const char *v2_str = "Branch predictor hardening"; + + switch (spectre_v2_state) { + case SPECTRE_UNAFFECTED: + if (bhb_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "Not affected\n"); + + /* + * Platforms affected by Spectre-BHB can't report + * "Not affected" for Spectre-v2. + */ + v2_str = "CSV2"; + fallthrough; + case SPECTRE_MITIGATED: + if (bhb_state == SPECTRE_MITIGATED && _unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str); + case SPECTRE_VULNERABLE: + fallthrough; + default: + return sprintf(buf, "Vulnerable\n"); + } +} + +static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void) +{ + u64 pfr0; + static const struct midr_range spectre_v2_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + { /* sentinel */ } + }; + + /* If the CPU has CSV2 set, we're safe */ + pfr0 = read_cpuid(ID_AA64PFR0_EL1); + if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_CSV2_SHIFT)) + return SPECTRE_UNAFFECTED; + + /* Alternatively, we have a list of unaffected CPUs */ + if (is_midr_in_range_list(spectre_v2_safe_list)) + return SPECTRE_UNAFFECTED; + + return SPECTRE_VULNERABLE; +} + +static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if (spectre_v2_get_cpu_hw_mitigation_state() == SPECTRE_UNAFFECTED) + return false; + + if (spectre_v2_get_cpu_fw_mitigation_state() == SPECTRE_UNAFFECTED) + return false; + + return true; +} + +enum mitigation_state arm64_get_spectre_v2_state(void) +{ + return spectre_v2_state; +} + +DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +static void install_bp_hardening_cb(bp_hardening_cb_t fn) +{ + 
__this_cpu_write(bp_hardening_data.fn, fn); + + /* + * Vinz Clortho takes the hyp_vecs start/end "keys" at + * the door when we're a guest. Skip the hyp-vectors work. + */ + if (!is_hyp_mode_available()) + return; + + __this_cpu_write(bp_hardening_data.slot, HYP_VECTOR_SPECTRE_DIRECT); +} + +/* Called during entry so must be noinstr */ +static noinstr void call_smc_arch_workaround_1(void) +{ + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +/* Called during entry so must be noinstr */ +static noinstr void call_hvc_arch_workaround_1(void) +{ + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +/* Called during entry so must be noinstr */ +static noinstr void qcom_link_stack_sanitisation(void) +{ + u64 tmp; + + asm volatile("mov %0, x30 \n" + ".rept 16 \n" + "bl . + 4 \n" + ".endr \n" + "mov x30, %0 \n" + : "=&r" (tmp)); +} + +static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void) +{ + u32 midr = read_cpuid_id(); + if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) && + ((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1)) + return NULL; + + return qcom_link_stack_sanitisation; +} + +static enum mitigation_state spectre_v2_enable_fw_mitigation(void) +{ + bp_hardening_cb_t cb; + enum mitigation_state state; + + state = spectre_v2_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + return state; + + if (spectre_v2_mitigations_off()) + return SPECTRE_VULNERABLE; + + switch (arm_smccc_1_1_get_conduit()) { + case SMCCC_CONDUIT_HVC: + cb = call_hvc_arch_workaround_1; + break; + + case SMCCC_CONDUIT_SMC: + cb = call_smc_arch_workaround_1; + break; + + default: + return SPECTRE_VULNERABLE; + } + + /* + * Prefer a CPU-specific workaround if it exists. Note that we + * still rely on firmware for the mitigation at EL2. + */ + cb = spectre_v2_get_sw_mitigation_cb() ?: cb; + install_bp_hardening_cb(cb); + return SPECTRE_MITIGATED; +} + +void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + enum mitigation_state state; + + WARN_ON(preemptible()); + + state = spectre_v2_get_cpu_hw_mitigation_state(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v2_enable_fw_mitigation(); + + update_mitigation_state(&spectre_v2_state, state); +} + +/* + * Spectre-v3a. + * + * Phew, there's not an awful lot to do here! We just instruct EL2 to use + * an indirect trampoline for the hyp vectors so that guests can't read + * VBAR_EL2 to defeat randomisation of the hypervisor VA layout. + */ +bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope) +{ + static const struct midr_range spectre_v3a_unsafe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + {}, + }; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + return is_midr_in_range_list(spectre_v3a_unsafe_list); +} + +void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); + + if (this_cpu_has_cap(ARM64_SPECTRE_V3A)) + data->slot += HYP_VECTOR_INDIRECT; +} + +/* + * Spectre v4. + * + * If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is + * either: + * + * - Mitigated in hardware and listed in our "safe list". + * - Mitigated in hardware via PSTATE.SSBS. + * - Mitigated in software by firmware (sometimes referred to as SSBD). + * + * Wait, that doesn't sound so bad, does it? Keep reading... 
+ * + * A major source of headaches is that the software mitigation is enabled both + * on a per-task basis, but can also be forced on for the kernel, necessitating + * both context-switch *and* entry/exit hooks. To make it even worse, some CPUs + * allow EL0 to toggle SSBS directly, which can end up with the prctl() state + * being stale when re-entering the kernel. The usual big.LITTLE caveats apply, + * so you can have systems that have both firmware and SSBS mitigations. This + * means we actually have to reject late onlining of CPUs with mitigations if + * all of the currently onlined CPUs are safelisted, as the mitigation tends to + * be opt-in for userspace. Yes, really, the cure is worse than the disease. + * + * The only good part is that if the firmware mitigation is present, then it is + * present for all CPUs, meaning we don't have to worry about late onlining of a + * vulnerable CPU if one of the boot CPUs is using the firmware mitigation. + * + * Give me a VAX-11/780 any day of the week... + */ +static enum mitigation_state spectre_v4_state; + +/* This is the per-cpu state tracking whether we need to talk to firmware */ +DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); + +enum spectre_v4_policy { + SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, + SPECTRE_V4_POLICY_MITIGATION_ENABLED, + SPECTRE_V4_POLICY_MITIGATION_DISABLED, +}; + +static enum spectre_v4_policy __read_mostly __spectre_v4_policy; + +static const struct spectre_v4_param { + const char *str; + enum spectre_v4_policy policy; +} spectre_v4_params[] = { + { "force-on", SPECTRE_V4_POLICY_MITIGATION_ENABLED, }, + { "force-off", SPECTRE_V4_POLICY_MITIGATION_DISABLED, }, + { "kernel", SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, }, +}; +static int __init parse_spectre_v4_param(char *str) +{ + int i; + + if (!str || !str[0]) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(spectre_v4_params); i++) { + const struct spectre_v4_param *param = &spectre_v4_params[i]; + + if (strncmp(str, param->str, strlen(param->str))) + continue; + + __spectre_v4_policy = param->policy; + return 0; + } + + return -EINVAL; +} +early_param("ssbd", parse_spectre_v4_param); + +/* + * Because this was all written in a rush by people working in different silos, + * we've ended up with multiple command line options to control the same thing. + * Wrap these up in some helpers, which prefer disabling the mitigation if faced + * with contradictory parameters. The mitigation is always either "off", + * "dynamic" or "on". + */ +static bool spectre_v4_mitigations_off(void) +{ + return cpu_mitigations_off() || + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; +} + +/* Do we need to toggle the mitigation state on entry to/exit from the kernel? 
*/ +static bool spectre_v4_mitigations_dynamic(void) +{ + return !spectre_v4_mitigations_off() && + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC; +} + +static bool spectre_v4_mitigations_on(void) +{ + return !spectre_v4_mitigations_off() && + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED; +} + +ssize_t cpu_show_spec_store_bypass(struct device *dev, + struct device_attribute *attr, char *buf) +{ + switch (spectre_v4_state) { + case SPECTRE_UNAFFECTED: + return sprintf(buf, "Not affected\n"); + case SPECTRE_MITIGATED: + return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n"); + case SPECTRE_VULNERABLE: + fallthrough; + default: + return sprintf(buf, "Vulnerable\n"); + } +} + +enum mitigation_state arm64_get_spectre_v4_state(void) +{ + return spectre_v4_state; +} + +static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void) +{ + static const struct midr_range spectre_v4_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + { /* sentinel */ }, + }; + + if (is_midr_in_range_list(spectre_v4_safe_list)) + return SPECTRE_UNAFFECTED; + + /* CPU features are detected first */ + if (this_cpu_has_cap(ARM64_SSBS)) + return SPECTRE_MITIGATED; + + return SPECTRE_VULNERABLE; +} + +static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_2, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + fallthrough; + case SMCCC_RET_NOT_REQUIRED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope) +{ + enum mitigation_state state; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + state = spectre_v4_get_cpu_hw_mitigation_state(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v4_get_cpu_fw_mitigation_state(); + + return state != SPECTRE_UNAFFECTED; +} + +bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr) +{ + const u32 instr_mask = ~(1U << PSTATE_Imm_shift); + const u32 instr_val = 0xd500401f | PSTATE_SSBS; + + if ((instr & instr_mask) != instr_val) + return false; + + if (instr & BIT(PSTATE_Imm_shift)) + regs->pstate |= PSR_SSBS_BIT; + else + regs->pstate &= ~PSR_SSBS_BIT; + + arm64_skip_faulting_instruction(regs, 4); + return true; +} + +static enum mitigation_state spectre_v4_enable_hw_mitigation(void) +{ + enum mitigation_state state; + + /* + * If the system is mitigated but this CPU doesn't have SSBS, then + * we must be on the safelist and there's nothing more to do. + */ + state = spectre_v4_get_cpu_hw_mitigation_state(); + if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS)) + return state; + + if (spectre_v4_mitigations_off()) { + sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); + set_pstate_ssbs(1); + return SPECTRE_VULNERABLE; + } + + /* SCTLR_EL1.DSSBS was initialised to 0 during boot */ + set_pstate_ssbs(0); + + /* + * SSBS is self-synchronizing and is intended to affect subsequent + * speculative instructions, but some CPUs can speculate with a stale + * value of SSBS. 
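try_emulate_el1_ssbs() above recognises exactly one instruction shape, "MSR SSBS, #imm", where only the immediate bit is allowed to vary. A standalone sketch of the same mask/value test; the PSTATE field constants are written out by hand and the sample encodings are illustrative assumptions rather than authoritative values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Field positions in the AArch64 "MSR (immediate)" encoding (assumed here). */
#define PSTATE_Imm_shift	8U				/* CRm<0> carries the value */
#define PSTATE_SSBS		((3U << 16) | (1U << 5))	/* op1=3, op2=1 */

/* Same test as try_emulate_el1_ssbs(): is this "MSR SSBS, #imm"? */
static bool is_msr_ssbs_imm(uint32_t instr, unsigned int *imm)
{
	const uint32_t instr_mask = ~(1U << PSTATE_Imm_shift);
	const uint32_t instr_val = 0xd500401f | PSTATE_SSBS;

	if ((instr & instr_mask) != instr_val)
		return false;

	*imm = (instr >> PSTATE_Imm_shift) & 1;
	return true;
}

int main(void)
{
	/* Assumed encodings of "msr ssbs, #0", "msr ssbs, #1" and a NOP. */
	const uint32_t samples[] = { 0xd503403f, 0xd503413f, 0xd503201f };

	for (unsigned int i = 0; i < 3; i++) {
		unsigned int imm;

		if (is_msr_ssbs_imm(samples[i], &imm))
			printf("%08x: MSR SSBS, #%u\n", samples[i], imm);
		else
			printf("%08x: not an SSBS toggle\n", samples[i]);
	}

	return 0;
}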
+ * + * Mitigate this with an unconditional speculation barrier, as CPUs + * could mis-speculate branches and bypass a conditional barrier. + */ + if (IS_ENABLED(CONFIG_ARM64_ERRATUM_3194386)) + spec_bar(); + + return SPECTRE_MITIGATED; +} + +/* + * Patch a branch over the Spectre-v4 mitigation code with a NOP so that + * we fallthrough and check whether firmware needs to be called on this CPU. + */ +void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); /* Branch -> NOP */ + + if (spectre_v4_mitigations_off()) + return; + + if (cpus_have_cap(ARM64_SSBS)) + return; + + if (spectre_v4_mitigations_dynamic()) + *updptr = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* + * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction + * to call into firmware to adjust the mitigation state. + */ +void __init smccc_patch_fw_mitigation_conduit(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + u32 insn; + + BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */ + + switch (arm_smccc_1_1_get_conduit()) { + case SMCCC_CONDUIT_HVC: + insn = aarch64_insn_get_hvc_value(); + break; + case SMCCC_CONDUIT_SMC: + insn = aarch64_insn_get_smc_value(); + break; + default: + return; + } + + *updptr = cpu_to_le32(insn); +} + +static enum mitigation_state spectre_v4_enable_fw_mitigation(void) +{ + enum mitigation_state state; + + state = spectre_v4_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + return state; + + if (spectre_v4_mitigations_off()) { + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL); + return SPECTRE_VULNERABLE; + } + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL); + + if (spectre_v4_mitigations_dynamic()) + __this_cpu_write(arm64_ssbd_callback_required, 1); + + return SPECTRE_MITIGATED; +} + +void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + enum mitigation_state state; + + WARN_ON(preemptible()); + + state = spectre_v4_enable_hw_mitigation(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v4_enable_fw_mitigation(); + + update_mitigation_state(&spectre_v4_state, state); +} + +static void __update_pstate_ssbs(struct pt_regs *regs, bool state) +{ + u64 bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; + + if (state) + regs->pstate |= bit; + else + regs->pstate &= ~bit; +} + +void spectre_v4_enable_task_mitigation(struct task_struct *tsk) +{ + struct pt_regs *regs = task_pt_regs(tsk); + bool ssbs = false, kthread = tsk->flags & PF_KTHREAD; + + if (spectre_v4_mitigations_off()) + ssbs = true; + else if (spectre_v4_mitigations_dynamic() && !kthread) + ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD); + + __update_pstate_ssbs(regs, ssbs); +} + +/* + * The Spectre-v4 mitigation can be controlled via a prctl() from userspace. + * This is interesting because the "speculation disabled" behaviour can be + * configured so that it is preserved across exec(), which means that the + * prctl() may be necessary even when PSTATE.SSBS can be toggled directly + * from userspace. 
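The prctl() interface described above is the standard PR_SET_SPECULATION_CTRL/PR_GET_SPECULATION_CTRL pair used with PR_SPEC_STORE_BYPASS, routed into ssbd_prctl_set()/ssbd_prctl_get() below. A user-space example that queries the current state and then requests the mitigation for the calling task (the set may fail with EPERM if the mitigation policy is forced one way or the other):

#include <linux/prctl.h>
#include <stdio.h>
#include <sys/prctl.h>

static void show_ssb_state(const char *when)
{
	int ctrl = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0);

	printf("%s: ctrl=0x%x%s%s\n", when, ctrl,
	       (ctrl & PR_SPEC_PRCTL) ? " (per-task control available)" : "",
	       (ctrl & PR_SPEC_DISABLE) ? " (mitigation enabled)" : "");
}

int main(void)
{
	show_ssb_state("before");

	/* Ask for the SSB mitigation for this task (TIF_SSBD handling below). */
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
		  PR_SPEC_DISABLE, 0, 0))
		perror("PR_SET_SPECULATION_CTRL");

	show_ssb_state("after");
	return 0;
}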
+ */ +static void ssbd_prctl_enable_mitigation(struct task_struct *task) +{ + task_clear_spec_ssb_noexec(task); + task_set_spec_ssb_disable(task); + set_tsk_thread_flag(task, TIF_SSBD); +} + +static void ssbd_prctl_disable_mitigation(struct task_struct *task) +{ + task_clear_spec_ssb_noexec(task); + task_clear_spec_ssb_disable(task); + clear_tsk_thread_flag(task, TIF_SSBD); +} + +static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + switch (ctrl) { + case PR_SPEC_ENABLE: + /* Enable speculation: disable mitigation */ + /* + * Force disabled speculation prevents it from being + * re-enabled. + */ + if (task_spec_ssb_force_disable(task)) + return -EPERM; + + /* + * If the mitigation is forced on, then speculation is forced + * off and we again prevent it from being re-enabled. + */ + if (spectre_v4_mitigations_on()) + return -EPERM; + + ssbd_prctl_disable_mitigation(task); + break; + case PR_SPEC_FORCE_DISABLE: + /* Force disable speculation: force enable mitigation */ + /* + * If the mitigation is forced off, then speculation is forced + * on and we prevent it from being disabled. + */ + if (spectre_v4_mitigations_off()) + return -EPERM; + + task_set_spec_ssb_force_disable(task); + fallthrough; + case PR_SPEC_DISABLE: + /* Disable speculation: enable mitigation */ + /* Same as PR_SPEC_FORCE_DISABLE */ + if (spectre_v4_mitigations_off()) + return -EPERM; + + ssbd_prctl_enable_mitigation(task); + break; + case PR_SPEC_DISABLE_NOEXEC: + /* Disable speculation until execve(): enable mitigation */ + /* + * If the mitigation state is forced one way or the other, then + * we must fail now before we try to toggle it on execve(). + */ + if (task_spec_ssb_force_disable(task) || + spectre_v4_mitigations_off() || + spectre_v4_mitigations_on()) { + return -EPERM; + } + + ssbd_prctl_enable_mitigation(task); + task_set_spec_ssb_noexec(task); + break; + default: + return -ERANGE; + } + + spectre_v4_enable_task_mitigation(task); + return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssbd_prctl_set(task, ctrl); + default: + return -ENODEV; + } +} + +static int ssbd_prctl_get(struct task_struct *task) +{ + switch (spectre_v4_state) { + case SPECTRE_UNAFFECTED: + return PR_SPEC_NOT_AFFECTED; + case SPECTRE_MITIGATED: + if (spectre_v4_mitigations_on()) + return PR_SPEC_NOT_AFFECTED; + + if (spectre_v4_mitigations_dynamic()) + break; + + /* Mitigations are disabled, so we're vulnerable. */ + fallthrough; + case SPECTRE_VULNERABLE: + fallthrough; + default: + return PR_SPEC_ENABLE; + } + + /* Check the mitigation state for this task */ + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + + if (task_spec_ssb_noexec(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; + + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssbd_prctl_get(task); + default: + return -ENODEV; + } +} + +/* + * Spectre BHB. + * + * A CPU is either: + * - Mitigated by a branchy loop a CPU specific number of times, and listed + * in our "loop mitigated list". + * - Mitigated in software by the firmware Spectre v2 call. + * - Has the ClearBHB instruction to perform the mitigation. 
+ * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no + * software mitigation in the vectors is needed. + * - Has CSV2.3, so is unaffected. + */ +static enum mitigation_state spectre_bhb_state; + +enum mitigation_state arm64_get_spectre_bhb_state(void) +{ + return spectre_bhb_state; +} + +enum bhb_mitigation_bits { + BHB_LOOP, + BHB_FW, + BHB_HW, + BHB_INSN, +}; +static unsigned long system_bhb_mitigations; + +/* + * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any + * SCOPE_SYSTEM call will give the right answer. + */ +static bool is_spectre_bhb_safe(int scope) +{ + static const struct midr_range spectre_bhb_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A520), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + {}, + }; + static bool all_safe = true; + + if (scope != SCOPE_LOCAL_CPU) + return all_safe; + + if (is_midr_in_range_list(spectre_bhb_safe_list)) + return true; + + all_safe = false; + + return false; +} + +static u8 spectre_bhb_loop_affected(void) +{ + u8 k = 0; + + static const struct midr_range spectre_bhb_k132_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + {}, + }; + static const struct midr_range spectre_bhb_k38_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), + {}, + }; + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD), + MIDR_ALL_VERSIONS(MIDR_HISI_HIP09), + {}, + }; + static const struct midr_range spectre_bhb_k11_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(spectre_bhb_k132_list)) + k = 132; + else if (is_midr_in_range_list(spectre_bhb_k38_list)) + k = 38; + else if (is_midr_in_range_list(spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(spectre_bhb_k11_list)) + k = 11; + else if (is_midr_in_range_list(spectre_bhb_k8_list)) + k = 8; + + return k; +} + +static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_3, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +static bool 
has_spectre_bhb_fw_mitigation(void) +{ + enum mitigation_state fw_state; + bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE; + + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + return has_smccc && fw_state == SPECTRE_MITIGATED; +} + +static bool supports_ecbhb(int scope) +{ + u64 mmfr1; + + if (scope == SCOPE_LOCAL_CPU) + mmfr1 = read_sysreg_s(SYS_ID_AA64MMFR1_EL1); + else + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + + return cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_EL1_ECBHB_SHIFT); +} + +static u8 max_bhb_k; + +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, + int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if (supports_csv2p3(scope)) + return false; + + if (is_spectre_bhb_safe(scope)) + return false; + + /* + * At this point the core isn't known to be "safe" so we're going to + * assume it's vulnerable. We still need to update `max_bhb_k` though, + * but only if we aren't mitigating with clearbhb though. + */ + if (scope == SCOPE_LOCAL_CPU && !supports_clearbhb(SCOPE_LOCAL_CPU)) + max_bhb_k = max(max_bhb_k, spectre_bhb_loop_affected()); + + return true; +} + +u8 get_spectre_bhb_loop_value(void) +{ + return max_bhb_k; +} + +static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) +{ + const char *v = arm64_get_bp_hardening_vector(slot); + + __this_cpu_write(this_cpu_vector, v); + + /* + * When KPTI is in use, the vectors are switched when exiting to + * user-space. + */ + if (cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0)) + return; + + write_sysreg(v, vbar_el1); + isb(); +} + +bool __read_mostly __nospectre_bhb; +static int __init parse_spectre_bhb_param(char *str) +{ + __nospectre_bhb = true; + return 0; +} +early_param("nospectre_bhb", parse_spectre_bhb_param); + +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) +{ + bp_hardening_cb_t cpu_cb; + enum mitigation_state state = SPECTRE_VULNERABLE; + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); + + if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) + return; + + if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) { + /* No point mitigating Spectre-BHB alone. */ + } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { + /* Do nothing */ + } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { + state = SPECTRE_MITIGATED; + set_bit(BHB_HW, &system_bhb_mitigations); + } else if (supports_clearbhb(SCOPE_LOCAL_CPU)) { + /* + * Ensure KVM uses the indirect vector which will have ClearBHB + * added. + */ + if (!data->slot) + data->slot = HYP_VECTOR_INDIRECT; + + this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); + state = SPECTRE_MITIGATED; + set_bit(BHB_INSN, &system_bhb_mitigations); + } else if (spectre_bhb_loop_affected()) { + /* + * Ensure KVM uses the indirect vector which will have the + * branchy-loop added. A57/A72-r0 will already have selected + * the spectre-indirect vector, which is sufficient for BHB + * too. + */ + if (!data->slot) + data->slot = HYP_VECTOR_INDIRECT; + + this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); + state = SPECTRE_MITIGATED; + set_bit(BHB_LOOP, &system_bhb_mitigations); + } else if (has_spectre_bhb_fw_mitigation()) { + /* + * Ensure KVM uses one of the spectre bp_hardening + * vectors. The indirect vector doesn't include the EL3 + * call, so needs upgrading to + * HYP_VECTOR_SPECTRE_INDIRECT. 
+ */ + if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) + data->slot += 1; + + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * The WA3 call in the vectors supersedes the WA1 call + * made during context-switch. Uninstall any firmware + * bp_hardening callback. + */ + cpu_cb = spectre_v2_get_sw_mitigation_cb(); + if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + set_bit(BHB_FW, &system_bhb_mitigations); + } + + update_mitigation_state(&spectre_bhb_state, state); +} + +bool is_spectre_bhb_fw_mitigated(void) +{ + return test_bit(BHB_FW, &system_bhb_mitigations); +} + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); + + if (test_bit(BHB_LOOP, &system_bhb_mitigations)) + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); + + if (test_bit(BHB_FW, &system_bhb_mitigations)) + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* Patched to correct the immediate */ +void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + insn = aarch64_insn_gen_movewide(rd, max_bhb_k, 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr++ = cpu_to_le32(insn); +} + +/* Patched to mov WA3 when supported */ +void noinstr spectre_bhb_patch_wa3(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY) || + !test_bit(BHB_FW, &system_bhb_mitigations)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + + insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_ORR, + AARCH64_INSN_VARIANT_32BIT, + AARCH64_INSN_REG_ZR, rd, + ARM_SMCCC_ARCH_WORKAROUND_3); + if (WARN_ON_ONCE(insn == AARCH64_BREAK_FAULT)) + return; + + *updptr++ = cpu_to_le32(insn); +} + +/* Patched to NOP when not supported */ +void __init spectre_bhb_patch_clearbhb(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 2); + + if (test_bit(BHB_INSN, &system_bhb_mitigations)) + return; + + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +#ifdef CONFIG_BPF_SYSCALL +#define EBPF_WARN "Unprivileged eBPF is enabled, data leaks possible via Spectre v2 BHB attacks!\n" +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_state == SPECTRE_VULNERABLE || + spectre_bhb_state != SPECTRE_MITIGATED) + return; + + if (!new_state) + pr_err("WARNING: %s", EBPF_WARN); +} +#endif + +void spectre_print_disabled_mitigations(void) +{ + /* Keep a single copy of the common message suffix to avoid duplication. 
*/ + const char *spectre_disabled_suffix = "mitigation disabled by command-line option\n"; + + if (spectre_v2_mitigations_off()) + pr_info("spectre-v2 %s", spectre_disabled_suffix); + + if (spectre_v4_mitigations_off()) + pr_info("spectre-v4 %s", spectre_disabled_suffix); + + if (__nospectre_bhb || cpu_mitigations_off()) + pr_info("spectre-bhb %s", spectre_disabled_suffix); +} diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c index 8cdaf25e99cd..fabd732d0a2d 100644 --- a/arch/arm64/kernel/psci.c +++ b/arch/arm64/kernel/psci.c @@ -1,12 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. * * Copyright (C) 2013 ARM Limited * @@ -45,14 +38,20 @@ static int __init cpu_psci_cpu_prepare(unsigned int cpu) static int cpu_psci_cpu_boot(unsigned int cpu) { - int err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa_symbol(secondary_entry)); - if (err) + phys_addr_t pa_secondary_entry = __pa_symbol(secondary_entry); + int err = psci_ops.cpu_on(cpu_logical_map(cpu), pa_secondary_entry); + if (err && err != -EPERM) pr_err("failed to boot CPU%d (%d)\n", cpu, err); return err; } #ifdef CONFIG_HOTPLUG_CPU +static bool cpu_psci_cpu_can_disable(unsigned int cpu) +{ + return !psci_tos_resident_on(cpu); +} + static int cpu_psci_cpu_disable(unsigned int cpu) { /* Fail early if we don't have CPU_OFF support */ @@ -68,7 +67,6 @@ static int cpu_psci_cpu_disable(unsigned int cpu) static void cpu_psci_cpu_die(unsigned int cpu) { - int ret; /* * There are no known implementations of PSCI actually using the * power state field, pass a sensible default for now. @@ -76,14 +74,13 @@ static void cpu_psci_cpu_die(unsigned int cpu) u32 state = PSCI_POWER_STATE_TYPE_POWER_DOWN << PSCI_0_2_POWER_STATE_TYPE_SHIFT; - ret = psci_ops.cpu_off(state); - - pr_crit("unable to power off CPU%u (%d)\n", cpu, ret); + psci_ops.cpu_off(state); } static int cpu_psci_cpu_kill(unsigned int cpu) { - int err, i; + int err; + unsigned long start, end; if (!psci_ops.affinity_info) return 0; @@ -93,16 +90,18 @@ static int cpu_psci_cpu_kill(unsigned int cpu) * while it is dying. So, try again a few times. 
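The cpu_psci_cpu_kill() change just below replaces a fixed ten-iteration retry loop with a jiffies-based 100ms deadline polled via usleep_range(). A user-space analogue of that bounded-poll pattern, using CLOCK_MONOTONIC in place of jiffies and a stubbed-out condition standing in for the AFFINITY_INFO query:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static long elapsed_ms(const struct timespec *start)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return (now.tv_sec - start->tv_sec) * 1000L +
	       (now.tv_nsec - start->tv_nsec) / 1000000L;
}

/* Stand-in for the AFFINITY_INFO query; pretend the CPU goes off after ~30ms. */
static bool cpu_reported_off(const struct timespec *start)
{
	return elapsed_ms(start) > 30;
}

int main(void)
{
	struct timespec start;

	clock_gettime(CLOCK_MONOTONIC, &start);

	/* Poll often, but bound the wait with a 100ms deadline, as in the hunk. */
	do {
		if (cpu_reported_off(&start)) {
			printf("condition met after %ld ms\n", elapsed_ms(&start));
			return 0;
		}
		usleep(500);	/* rough usleep_range(100, 1000) analogue */
	} while (elapsed_ms(&start) < 100);

	printf("gave up after %ld ms\n", elapsed_ms(&start));
	return 1;
}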
*/ - for (i = 0; i < 10; i++) { + start = jiffies; + end = start + msecs_to_jiffies(100); + do { err = psci_ops.affinity_info(cpu_logical_map(cpu), 0); if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) { - pr_info("CPU%d killed.\n", cpu); + pr_info("CPU%d killed (polled %d ms)\n", cpu, + jiffies_to_msecs(jiffies - start)); return 0; } - msleep(10); - pr_info("Retrying again to check for CPU kill\n"); - } + usleep_range(100, 1000); + } while (time_before(jiffies, end)); pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n", cpu, err); @@ -112,14 +111,11 @@ static int cpu_psci_cpu_kill(unsigned int cpu) const struct cpu_operations cpu_psci_ops = { .name = "psci", -#ifdef CONFIG_CPU_IDLE - .cpu_init_idle = psci_cpu_init_idle, - .cpu_suspend = psci_cpu_suspend_enter, -#endif .cpu_init = cpu_psci_cpu_init, .cpu_prepare = cpu_psci_cpu_prepare, .cpu_boot = cpu_psci_cpu_boot, #ifdef CONFIG_HOTPLUG_CPU + .cpu_can_disable = cpu_psci_cpu_can_disable, .cpu_disable = cpu_psci_cpu_disable, .cpu_die = cpu_psci_cpu_die, .cpu_kill = cpu_psci_cpu_kill, diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 9dce33b0e260..b9bdd83fbbca 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/ptrace.c * @@ -5,18 +6,6 @@ * edited by Linus Torvalds * ARM modifications Copyright (C) 2000 Russell King * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/audit.h> @@ -38,14 +27,15 @@ #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/regset.h> -#include <linux/tracehook.h> #include <linux/elf.h> +#include <linux/rseq.h> #include <asm/compat.h> #include <asm/cpufeature.h> #include <asm/debug-monitors.h> #include <asm/fpsimd.h> -#include <asm/pgtable.h> +#include <asm/gcs.h> +#include <asm/mte.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> #include <asm/syscall.h> @@ -133,7 +123,7 @@ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) || - on_irq_stack(addr, NULL); + on_irq_stack(addr, sizeof(unsigned long)); } /** @@ -151,7 +141,7 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) - return *addr; + return READ_ONCE_NOCHECK(*addr); else return 0; } @@ -185,7 +175,6 @@ static void ptrace_hbptriggered(struct perf_event *bp, struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); const char *desc = "Hardware breakpoint trap (ptrace)"; -#ifdef CONFIG_COMPAT if (is_compat_task()) { int si_errno = 0; int i; @@ -203,14 +192,12 @@ static void ptrace_hbptriggered(struct perf_event *bp, break; } } - arm64_force_sig_ptrace_errno_trap(si_errno, - (void __user *)bkpt->trigger, + arm64_force_sig_ptrace_errno_trap(si_errno, bkpt->trigger, desc); + return; } -#endif - arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, - (void __user *)(bkpt->trigger), - desc); + + arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); } /* @@ -486,11 +473,10 @@ static int ptrace_hbp_set_addr(unsigned int note_type, static int hw_break_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { unsigned int note_type = regset->core_note_type; - int ret, idx = 0, offset, limit; + int ret, idx = 0; u32 info, ctrl; u64 addr; @@ -499,49 +485,21 @@ static int hw_break_get(struct task_struct *target, if (ret) return ret; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &info, 0, - sizeof(info)); - if (ret) - return ret; - - /* Pad */ - offset = offsetof(struct user_hwdebug_state, pad); - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, offset, - offset + PTRACE_HBP_PAD_SZ); - if (ret) - return ret; - + membuf_write(&to, &info, sizeof(info)); + membuf_zero(&to, sizeof(u32)); /* (address, ctrl) registers */ - offset = offsetof(struct user_hwdebug_state, dbg_regs); - limit = regset->n * regset->size; - while (count && offset < limit) { + while (to.left) { ret = ptrace_hbp_get_addr(note_type, target, idx, &addr); if (ret) return ret; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &addr, - offset, offset + PTRACE_HBP_ADDR_SZ); - if (ret) - return ret; - offset += PTRACE_HBP_ADDR_SZ; - ret = ptrace_hbp_get_ctrl(note_type, target, idx, &ctrl); if (ret) return ret; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &ctrl, - offset, offset + PTRACE_HBP_CTRL_SZ); - if (ret) - return ret; - offset += PTRACE_HBP_CTRL_SZ; - - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - offset, - offset + PTRACE_HBP_PAD_SZ); - if (ret) - return ret; - offset += PTRACE_HBP_PAD_SZ; + membuf_store(&to, addr); + membuf_store(&to, ctrl); + membuf_zero(&to, sizeof(u32)); idx++; } - return 0; } @@ -557,9 +515,7 @@ static int hw_break_set(struct task_struct *target, /* Resource info and pad */ 
offset = offsetof(struct user_hwdebug_state, dbg_regs); - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset); - if (ret) - return ret; + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset); /* (address, ctrl) registers */ limit = regset->n * regset->size; @@ -586,11 +542,8 @@ static int hw_break_set(struct task_struct *target, return ret; offset += PTRACE_HBP_CTRL_SZ; - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - offset, - offset + PTRACE_HBP_PAD_SZ); - if (ret) - return ret; + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + offset, offset + PTRACE_HBP_PAD_SZ); offset += PTRACE_HBP_PAD_SZ; idx++; } @@ -601,11 +554,10 @@ static int hw_break_set(struct task_struct *target, static int gpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct user_pt_regs *uregs = &task_pt_regs(target)->user_regs; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, 0, -1); + return membuf_write(&to, uregs, sizeof(*uregs)); } static int gpr_set(struct task_struct *target, const struct user_regset *regset, @@ -626,32 +578,39 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset, return 0; } +static int fpr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!system_supports_fpsimd()) + return -ENODEV; + return regset->n; +} + /* * TODO: update fp accessors for lazy context switching (sync/flush hwstate) */ static int __fpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, unsigned int start_pos) + struct membuf to) { struct user_fpsimd_state *uregs; - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); uregs = &target->thread.uw.fpsimd_state; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, - start_pos, start_pos + sizeof(*uregs)); + return membuf_write(&to, uregs, sizeof(*uregs)); } static int fpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { + if (!system_supports_fpsimd()) + return -EINVAL; + if (target == current) fpsimd_preserve_current_state(); - return __fpr_get(target, regset, pos, count, kbuf, ubuf, 0); + return __fpr_get(target, regset, to); } static int __fpr_set(struct task_struct *target, @@ -667,7 +626,7 @@ static int __fpr_set(struct task_struct *target, * Ensure target->thread.uw.fpsimd_state is up to date, so that a * short copyin can't resurrect stale data. 
*/ - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); newstate = target->thread.uw.fpsimd_state; @@ -687,26 +646,34 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, { int ret; + if (!system_supports_fpsimd()) + return -EINVAL; + ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, 0); if (ret) return ret; - sve_sync_from_fpsimd_zeropad(target); + fpsimd_sync_to_effective_state_zeropad(target); fpsimd_flush_task_state(target); return ret; } static int tls_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - unsigned long *tls = &target->thread.uw.tp_value; + int ret; if (target == current) tls_preserve_current_state(); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, tls, 0, -1); + ret = membuf_store(&to, target->thread.uw.tp_value); + if (system_supports_tpidr2()) + ret = membuf_store(&to, target->thread.tpidr2_el0); + else + ret = membuf_zero(&to, sizeof(u64)); + + return ret; } static int tls_set(struct task_struct *target, const struct user_regset *regset, @@ -714,25 +681,63 @@ static int tls_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { int ret; - unsigned long tls = target->thread.uw.tp_value; + unsigned long tls[2]; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tls, 0, -1); + tls[0] = target->thread.uw.tp_value; + if (system_supports_tpidr2()) + tls[1] = target->thread.tpidr2_el0; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, tls, 0, count); if (ret) return ret; - target->thread.uw.tp_value = tls; + target->thread.uw.tp_value = tls[0]; + if (system_supports_tpidr2()) + target->thread.tpidr2_el0 = tls[1]; + return ret; } +static int fpmr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_fpmr()) + return -EINVAL; + + if (target == current) + fpsimd_preserve_current_state(); + + return membuf_store(&to, target->thread.uw.fpmr); +} + +static int fpmr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + unsigned long fpmr; + + if (!system_supports_fpmr()) + return -EINVAL; + + fpmr = target->thread.uw.fpmr; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpmr, 0, count); + if (ret) + return ret; + + target->thread.uw.fpmr = fpmr; + + fpsimd_flush_task_state(target); + + return 0; +} + static int system_call_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - int syscallno = task_pt_regs(target)->syscallno; - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &syscallno, 0, -1); + return membuf_store(&to, task_pt_regs(target)->syscallno); } static int system_call_set(struct task_struct *target, @@ -754,22 +759,49 @@ static int system_call_set(struct task_struct *target, #ifdef CONFIG_ARM64_SVE static void sve_init_header_from_task(struct user_sve_header *header, - struct task_struct *target) + struct task_struct *target, + enum vec_type type) { unsigned int vq; + bool active; + enum vec_type task_type; memset(header, 0, sizeof(*header)); - header->flags = test_tsk_thread_flag(target, TIF_SVE) ? 
- SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD; - if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) - header->flags |= SVE_PT_VL_INHERIT; + /* Check if the requested registers are active for the task */ + if (thread_sm_enabled(&target->thread)) + task_type = ARM64_VEC_SME; + else + task_type = ARM64_VEC_SVE; + active = (task_type == type); + + if (active && target->thread.fp_type == FP_STATE_SVE) + header->flags = SVE_PT_REGS_SVE; + else + header->flags = SVE_PT_REGS_FPSIMD; - header->vl = target->thread.sve_vl; + switch (type) { + case ARM64_VEC_SVE: + if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + break; + case ARM64_VEC_SME: + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + break; + default: + WARN_ON_ONCE(1); + return; + } + + header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); - header->max_vl = sve_max_vl; - header->size = SVE_PT_SIZE(vq, header->flags); + header->max_vl = vec_max_vl(type); + if (active) + header->size = SVE_PT_SIZE(vq, header->flags); + else + header->size = sizeof(header); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); } @@ -779,67 +811,218 @@ static unsigned int sve_size_from_header(struct user_sve_header const *header) return ALIGN(header->size, SVE_VQ_BYTES); } -static unsigned int sve_get_size(struct task_struct *target, - const struct user_regset *regset) +static int sve_get_common(struct task_struct *target, + const struct user_regset *regset, + struct membuf to, + enum vec_type type) { struct user_sve_header header; + unsigned int vq; + unsigned long start, end; - if (!system_supports_sve()) + if (target == current) + fpsimd_preserve_current_state(); + + /* Header */ + sve_init_header_from_task(&header, target, type); + vq = sve_vq_from_vl(header.vl); + + membuf_write(&to, &header, sizeof(header)); + + BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); + BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + + /* + * When the requested vector type is not active, do not present data + * from the other mode to userspace. 
+ */ + if (header.size == sizeof(header)) return 0; - sve_init_header_from_task(&header, target); - return sve_size_from_header(&header); + switch ((header.flags & SVE_PT_REGS_MASK)) { + case SVE_PT_REGS_FPSIMD: + return __fpr_get(target, regset, to); + + case SVE_PT_REGS_SVE: + start = SVE_PT_SVE_OFFSET; + end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); + membuf_write(&to, target->thread.sve_state, end - start); + + start = end; + end = SVE_PT_SVE_FPSR_OFFSET(vq); + membuf_zero(&to, end - start); + + /* + * Copy fpsr, and fpcr which must follow contiguously in + * struct fpsimd_state: + */ + start = end; + end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; + membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, + end - start); + + start = end; + end = sve_size_from_header(&header); + return membuf_zero(&to, end - start); + + default: + BUILD_BUG(); + } } static int sve_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SVE); +} + +static int sve_set_common(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf, + enum vec_type type) { int ret; struct user_sve_header header; unsigned int vq; unsigned long start, end; + bool fpsimd; - if (!system_supports_sve()) - return -EINVAL; + fpsimd_flush_task_state(target); /* Header */ - sve_init_header_from_task(&header, target); - vq = sve_vq_from_vl(header.vl); - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &header, - 0, sizeof(header)); + if (count < sizeof(header)) + return -EINVAL; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, + 0, sizeof(header)); if (ret) return ret; - if (target == current) - fpsimd_preserve_current_state(); + /* + * Streaming SVE data is always stored and presented in SVE format. + * Require the user to provide SVE formatted data for consistency, and + * to avoid the risk that we configure the task into an invalid state. + */ + fpsimd = (header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD; + if (fpsimd && type == ARM64_VEC_SME) + return -EINVAL; + + /* + * On systems without SVE we accept FPSIMD format writes with + * a VL of 0 to allow exiting streaming mode, otherwise a VL + * is required. + */ + if (header.vl) { + /* + * If the system does not support SVE we can't + * configure a SVE VL. + */ + if (!system_supports_sve() && type == ARM64_VEC_SVE) + return -EINVAL; + + /* + * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are + * consumed by vec_set_vector_length(), which will + * also validate them for us: + */ + ret = vec_set_vector_length(target, type, header.vl, + ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); + if (ret) + return ret; + } else { + /* If the system supports SVE we require a VL. */ + if (system_supports_sve()) + return -EINVAL; + + /* + * Only FPSIMD formatted data with no flags set is + * supported. 
+		 */
+		if (header.flags != SVE_PT_REGS_FPSIMD)
+			return -EINVAL;
+	}
+
+	/* Allocate SME storage if necessary, preserving any existing ZA/ZT state */
+	if (type == ARM64_VEC_SME) {
+		sme_alloc(target, false);
+		if (!target->thread.sme_state)
+			return -ENOMEM;
+	}
+
+	/* Allocate SVE storage if necessary, zeroing any existing SVE state */
+	if (!fpsimd) {
+		sve_alloc(target, true);
+		if (!target->thread.sve_state)
+			return -ENOMEM;
+	}
+
+	/*
+	 * Actual VL set may be different from what the user asked
+	 * for, or we may have configured the _ONEXEC VL not the
+	 * current VL:
+	 */
+	vq = sve_vq_from_vl(task_get_vl(target, type));
+
+	/* Enter/exit streaming mode */
+	if (system_supports_sme()) {
+		switch (type) {
+		case ARM64_VEC_SVE:
+			target->thread.svcr &= ~SVCR_SM_MASK;
+			set_tsk_thread_flag(target, TIF_SVE);
+			break;
+		case ARM64_VEC_SME:
+			target->thread.svcr |= SVCR_SM_MASK;
+			set_tsk_thread_flag(target, TIF_SME);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			return -EINVAL;
+		}
+	}
+
+	/* Always zero V regs, FPSR, and FPCR */
+	memset(&current->thread.uw.fpsimd_state, 0,
+	       sizeof(current->thread.uw.fpsimd_state));
 
 	/* Registers: FPSIMD-only case */
 
 	BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
-	if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD)
-		return __fpr_get(target, regset, pos, count, kbuf, ubuf,
-				 SVE_PT_FPSIMD_OFFSET);
+	if (fpsimd) {
+		clear_tsk_thread_flag(target, TIF_SVE);
+		target->thread.fp_type = FP_STATE_FPSIMD;
+		ret = __fpr_set(target, regset, pos, count, kbuf, ubuf,
+				SVE_PT_FPSIMD_OFFSET);
+		return ret;
+	}
+
+	/* Otherwise: no registers or full SVE case. */
+
+	target->thread.fp_type = FP_STATE_SVE;
 
-	/* Otherwise: full SVE case */
+	/*
+	 * If setting a different VL from the requested VL and there is
+	 * register data, the data layout will be wrong: don't even
+	 * try to set the registers in this case.
+ */ + if (count && vq != sve_vq_from_vl(header.vl)) + return -EIO; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - target->thread.sve_state, - start, end); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.sve_state, + start, end); if (ret) return ret; start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); - ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - start, end); - if (ret) - return ret; + user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, start, end); /* * Copy fpsr, and fpcr which must follow contiguously in @@ -847,16 +1030,11 @@ static int sve_get(struct task_struct *target, */ start = end; end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.uw.fpsimd_state.fpsr, - start, end); - if (ret) - return ret; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.uw.fpsimd_state.fpsr, + start, end); - start = end; - end = sve_size_from_header(&header); - return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - start, end); + return ret; } static int sve_set(struct task_struct *target, @@ -864,12 +1042,99 @@ static int sve_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + if (!system_supports_sve() && !system_supports_sme()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SVE); +} + +#endif /* CONFIG_ARM64_SVE */ + +#ifdef CONFIG_ARM64_SME + +static int ssve_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SME); +} + +static int ssve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SME); +} + +static int za_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + memset(&header, 0, sizeof(header)); + + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header.flags |= ZA_PT_VL_INHERIT; + + header.vl = task_get_sme_vl(target); + vq = sve_vq_from_vl(header.vl); + header.max_vl = sme_max_vl(); + header.max_size = ZA_PT_SIZE(vq); + + /* If ZA is not active there is only the header */ + if (thread_za_enabled(&target->thread)) + header.size = ZA_PT_SIZE(vq); + else + header.size = ZA_PT_ZA_OFFSET; + + membuf_write(&to, &header, sizeof(header)); + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + end = ZA_PT_ZA_OFFSET; + + if (target == current) + fpsimd_preserve_current_state(); + + /* Any register data to include? 
*/ + if (thread_za_enabled(&target->thread)) { + start = end; + end = ZA_PT_SIZE(vq); + membuf_write(&to, target->thread.sme_state, end - start); + } + + /* Zero any trailing padding */ + start = end; + end = ALIGN(header.size, SVE_VQ_BYTES); + return membuf_zero(&to, end - start); +} + +static int za_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ int ret; - struct user_sve_header header; + struct user_za_header header; unsigned int vq; unsigned long start, end; - if (!system_supports_sve()) + if (!system_supports_sme()) return -EINVAL; /* Header */ @@ -881,87 +1146,133 @@ static int sve_set(struct task_struct *target, goto out; /* - * Apart from PT_SVE_REGS_MASK, all PT_SVE_* flags are consumed by - * sve_set_vector_length(), which will also validate them for us: + * All current ZA_PT_* flags are consumed by + * vec_set_vector_length(), which will also validate them for + * us: */ - ret = sve_set_vector_length(target, header.vl, - ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); + ret = vec_set_vector_length(target, ARM64_VEC_SME, header.vl, + ((unsigned long)header.flags) << 16); if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ - vq = sve_vq_from_vl(target->thread.sve_vl); + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC rather than + * current VL: + */ + vq = sve_vq_from_vl(task_get_sme_vl(target)); - /* Registers: FPSIMD-only case */ + /* Ensure there is some SVE storage for streaming mode */ + if (!target->thread.sve_state) { + sve_alloc(target, false); + if (!target->thread.sve_state) { + ret = -ENOMEM; + goto out; + } + } - BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); - if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) { - ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, - SVE_PT_FPSIMD_OFFSET); - clear_tsk_thread_flag(target, TIF_SVE); + /* + * Only flush the storage if PSTATE.ZA was not already set, + * otherwise preserve any existing data. + */ + sme_alloc(target, !thread_za_enabled(&target->thread)); + if (!target->thread.sme_state) + return -ENOMEM; + + /* If there is no data then disable ZA */ + if (!count) { + target->thread.svcr &= ~SVCR_ZA_MASK; goto out; } - /* Otherwise: full SVE case */ - /* * If setting a different VL from the requested VL and there is * register data, the data layout will be wrong: don't even * try to set the registers in this case. */ - if (count && vq != sve_vq_from_vl(header.vl)) { + if (vq != sve_vq_from_vl(header.vl)) { ret = -EIO; goto out; } - sve_alloc(target); - - /* - * Ensure target->thread.sve_state is up to date with target's - * FPSIMD regs, so that a short copyin leaves trailing registers - * unmodified. 
- */ - fpsimd_sync_to_sve(target); - set_tsk_thread_flag(target, TIF_SVE); - - BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); - start = SVE_PT_SVE_OFFSET; - end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + start = ZA_PT_ZA_OFFSET; + end = ZA_PT_SIZE(vq); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - target->thread.sve_state, + target->thread.sme_state, start, end); if (ret) goto out; - start = end; - end = SVE_PT_SVE_FPSR_OFFSET(vq); - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - start, end); - if (ret) - goto out; + /* Mark ZA as active and let userspace use it */ + set_tsk_thread_flag(target, TIF_SME); + target->thread.svcr |= SVCR_ZA_MASK; + +out: + fpsimd_flush_task_state(target); + return ret; +} + +static int zt_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_sme2()) + return -EINVAL; /* - * Copy fpsr, and fpcr which must follow contiguously in - * struct fpsimd_state: + * If PSTATE.ZA is not set then ZT will be zeroed when it is + * enabled so report the current register value as zero. */ - start = end; - end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; + if (thread_za_enabled(&target->thread)) + membuf_write(&to, thread_zt_state(&target->thread), + ZT_SIG_REG_BYTES); + else + membuf_zero(&to, ZT_SIG_REG_BYTES); + + return 0; +} + +static int zt_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!system_supports_sme2()) + return -EINVAL; + + /* Ensure SVE storage in case this is first use of SME */ + sve_alloc(target, false); + if (!target->thread.sve_state) + return -ENOMEM; + + if (!thread_za_enabled(&target->thread)) { + sme_alloc(target, true); + if (!target->thread.sme_state) + return -ENOMEM; + } + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.uw.fpsimd_state.fpsr, - start, end); + thread_zt_state(&target->thread), + 0, ZT_SIG_REG_BYTES); + if (ret == 0) { + target->thread.svcr |= SVCR_ZA_MASK; + set_tsk_thread_flag(target, TIF_SME); + } -out: fpsimd_flush_task_state(target); + return ret; } -#endif /* CONFIG_ARM64_SVE */ +#endif /* CONFIG_ARM64_SME */ #ifdef CONFIG_ARM64_PTR_AUTH static int pac_mask_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { /* * The PAC bits can differ across data and instruction pointers @@ -977,10 +1288,291 @@ static int pac_mask_get(struct task_struct *target, if (!system_supports_address_auth()) return -EINVAL; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &uregs, 0, -1); + return membuf_write(&to, &uregs, sizeof(uregs)); +} + +static int pac_enabled_keys_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long enabled_keys = ptrauth_get_enabled_keys(target); + + if (IS_ERR_VALUE(enabled_keys)) + return enabled_keys; + + return membuf_write(&to, &enabled_keys, sizeof(enabled_keys)); +} + +static int pac_enabled_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + long enabled_keys = ptrauth_get_enabled_keys(target); + + if (IS_ERR_VALUE(enabled_keys)) + return enabled_keys; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &enabled_keys, 0, + 
sizeof(long)); + if (ret) + return ret; + + return ptrauth_set_enabled_keys(target, PR_PAC_ENABLED_KEYS_MASK, + enabled_keys); +} + +#ifdef CONFIG_CHECKPOINT_RESTORE +static __uint128_t pac_key_to_user(const struct ptrauth_key *key) +{ + return (__uint128_t)key->hi << 64 | key->lo; +} + +static struct ptrauth_key pac_key_from_user(__uint128_t ukey) +{ + struct ptrauth_key key = { + .lo = (unsigned long)ukey, + .hi = (unsigned long)(ukey >> 64), + }; + + return key; +} + +static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys, + const struct ptrauth_keys_user *keys) +{ + ukeys->apiakey = pac_key_to_user(&keys->apia); + ukeys->apibkey = pac_key_to_user(&keys->apib); + ukeys->apdakey = pac_key_to_user(&keys->apda); + ukeys->apdbkey = pac_key_to_user(&keys->apdb); +} + +static void pac_address_keys_from_user(struct ptrauth_keys_user *keys, + const struct user_pac_address_keys *ukeys) +{ + keys->apia = pac_key_from_user(ukeys->apiakey); + keys->apib = pac_key_from_user(ukeys->apibkey); + keys->apda = pac_key_from_user(ukeys->apdakey); + keys->apdb = pac_key_from_user(ukeys->apdbkey); +} + +static int pac_address_keys_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct ptrauth_keys_user *keys = &target->thread.keys_user; + struct user_pac_address_keys user_keys; + + if (!system_supports_address_auth()) + return -EINVAL; + + pac_address_keys_to_user(&user_keys, keys); + + return membuf_write(&to, &user_keys, sizeof(user_keys)); +} + +static int pac_address_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct ptrauth_keys_user *keys = &target->thread.keys_user; + struct user_pac_address_keys user_keys; + int ret; + + if (!system_supports_address_auth()) + return -EINVAL; + + pac_address_keys_to_user(&user_keys, keys); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); + if (ret) + return ret; + pac_address_keys_from_user(keys, &user_keys); + + return 0; +} + +static void pac_generic_keys_to_user(struct user_pac_generic_keys *ukeys, + const struct ptrauth_keys_user *keys) +{ + ukeys->apgakey = pac_key_to_user(&keys->apga); +} + +static void pac_generic_keys_from_user(struct ptrauth_keys_user *keys, + const struct user_pac_generic_keys *ukeys) +{ + keys->apga = pac_key_from_user(ukeys->apgakey); +} + +static int pac_generic_keys_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct ptrauth_keys_user *keys = &target->thread.keys_user; + struct user_pac_generic_keys user_keys; + + if (!system_supports_generic_auth()) + return -EINVAL; + + pac_generic_keys_to_user(&user_keys, keys); + + return membuf_write(&to, &user_keys, sizeof(user_keys)); } + +static int pac_generic_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct ptrauth_keys_user *keys = &target->thread.keys_user; + struct user_pac_generic_keys user_keys; + int ret; + + if (!system_supports_generic_auth()) + return -EINVAL; + + pac_generic_keys_to_user(&user_keys, keys); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); + if (ret) + return ret; + pac_generic_keys_from_user(keys, &user_keys); + + return 0; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_ARM64_PTR_AUTH */ +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI +static int 
tagged_addr_ctrl_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long ctrl = get_tagged_addr_ctrl(target); + + if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) + return ctrl; + + return membuf_write(&to, &ctrl, sizeof(ctrl)); +} + +static int tagged_addr_ctrl_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + ctrl = get_tagged_addr_ctrl(target); + if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) + return ctrl; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + return set_tagged_addr_ctrl(target, ctrl); +} +#endif + +#ifdef CONFIG_ARM64_POE +static int poe_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_poe()) + return -EINVAL; + + return membuf_write(&to, &target->thread.por_el0, + sizeof(target->thread.por_el0)); +} + +static int poe_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + if (!system_supports_poe()) + return -EINVAL; + + ctrl = target->thread.por_el0; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + target->thread.por_el0 = ctrl; + + return 0; +} +#endif + +#ifdef CONFIG_ARM64_GCS +static void task_gcs_to_user(struct user_gcs *user_gcs, + const struct task_struct *target) +{ + user_gcs->features_enabled = target->thread.gcs_el0_mode; + user_gcs->features_locked = target->thread.gcs_el0_locked; + user_gcs->gcspr_el0 = target->thread.gcspr_el0; +} + +static void task_gcs_from_user(struct task_struct *target, + const struct user_gcs *user_gcs) +{ + target->thread.gcs_el0_mode = user_gcs->features_enabled; + target->thread.gcs_el0_locked = user_gcs->features_locked; + target->thread.gcspr_el0 = user_gcs->gcspr_el0; +} + +static int gcs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + if (target == current) + gcs_preserve_current_state(); + + task_gcs_to_user(&user_gcs, target); + + return membuf_write(&to, &user_gcs, sizeof(user_gcs)); +} + +static int gcs_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + task_gcs_to_user(&user_gcs, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1); + if (ret) + return ret; + + if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + task_gcs_from_user(target, &user_gcs); + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -989,26 +1581,46 @@ enum aarch64_regset { REGSET_HW_BREAK, REGSET_HW_WATCH, #endif + REGSET_FPMR, REGSET_SYSTEM_CALL, #ifdef CONFIG_ARM64_SVE REGSET_SVE, #endif +#ifdef CONFIG_ARM64_SME + REGSET_SSVE, + REGSET_ZA, + REGSET_ZT, +#endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, + REGSET_PAC_ENABLED_KEYS, +#ifdef CONFIG_CHECKPOINT_RESTORE + REGSET_PACA_KEYS, + REGSET_PACG_KEYS, +#endif +#endif +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI + REGSET_TAGGED_ADDR_CTRL, +#endif +#ifdef CONFIG_ARM64_POE + REGSET_POE, +#endif +#ifdef CONFIG_ARM64_GCS + REGSET_GCS, #endif }; static 
const struct user_regset aarch64_regsets[] = { [REGSET_GPR] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(struct user_pt_regs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), - .get = gpr_get, + .regset_get = gpr_get, .set = gpr_set }, [REGSET_FPR] = { - .core_note_type = NT_PRFPREG, + USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(struct user_fpsimd_state) / sizeof(u32), /* * We pretend we have 32-bit registers because the fpsr and @@ -1016,64 +1628,164 @@ static const struct user_regset aarch64_regsets[] = { */ .size = sizeof(u32), .align = sizeof(u32), - .get = fpr_get, + .active = fpr_active, + .regset_get = fpr_get, .set = fpr_set }, [REGSET_TLS] = { - .core_note_type = NT_ARM_TLS, - .n = 1, + USER_REGSET_NOTE_TYPE(ARM_TLS), + .n = 2, .size = sizeof(void *), .align = sizeof(void *), - .get = tls_get, + .regset_get = tls_get, .set = tls_set, }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { - .core_note_type = NT_ARM_HW_BREAK, + USER_REGSET_NOTE_TYPE(ARM_HW_BREAK), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, [REGSET_HW_WATCH] = { - .core_note_type = NT_ARM_HW_WATCH, + USER_REGSET_NOTE_TYPE(ARM_HW_WATCH), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, #endif [REGSET_SYSTEM_CALL] = { - .core_note_type = NT_ARM_SYSTEM_CALL, + USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL), .n = 1, .size = sizeof(int), .align = sizeof(int), - .get = system_call_get, + .regset_get = system_call_get, .set = system_call_set, }, + [REGSET_FPMR] = { + USER_REGSET_NOTE_TYPE(ARM_FPMR), + .n = 1, + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = fpmr_get, + .set = fpmr_set, + }, #ifdef CONFIG_ARM64_SVE [REGSET_SVE] = { /* Scalable Vector Extension */ - .core_note_type = NT_ARM_SVE, - .n = DIV_ROUND_UP(SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE), + USER_REGSET_NOTE_TYPE(ARM_SVE), + .n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX, + SVE_PT_REGS_SVE), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, .align = SVE_VQ_BYTES, - .get = sve_get, + .regset_get = sve_get, .set = sve_set, - .get_size = sve_get_size, + }, +#endif +#ifdef CONFIG_ARM64_SME + [REGSET_SSVE] = { /* Streaming mode SVE */ + USER_REGSET_NOTE_TYPE(ARM_SSVE), + .n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE), + SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = ssve_get, + .set = ssve_set, + }, + [REGSET_ZA] = { /* SME ZA */ + USER_REGSET_NOTE_TYPE(ARM_ZA), + /* + * ZA is a single register but it's variably sized and + * the ptrace core requires that the size of any data + * be an exact multiple of the configured register + * size so report as though we had SVE_VQ_BYTES + * registers. These values aren't exposed to + * userspace. 
+ */ + .n = DIV_ROUND_UP(ZA_PT_SIZE(SME_VQ_MAX), SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = za_get, + .set = za_set, + }, + [REGSET_ZT] = { /* SME ZT */ + USER_REGSET_NOTE_TYPE(ARM_ZT), + .n = 1, + .size = ZT_SIG_REG_BYTES, + .align = sizeof(u64), + .regset_get = zt_get, + .set = zt_set, }, #endif #ifdef CONFIG_ARM64_PTR_AUTH [REGSET_PAC_MASK] = { - .core_note_type = NT_ARM_PAC_MASK, + USER_REGSET_NOTE_TYPE(ARM_PAC_MASK), .n = sizeof(struct user_pac_mask) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), - .get = pac_mask_get, + .regset_get = pac_mask_get, /* this cannot be set dynamically */ }, + [REGSET_PAC_ENABLED_KEYS] = { + USER_REGSET_NOTE_TYPE(ARM_PAC_ENABLED_KEYS), + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = pac_enabled_keys_get, + .set = pac_enabled_keys_set, + }, +#ifdef CONFIG_CHECKPOINT_RESTORE + [REGSET_PACA_KEYS] = { + USER_REGSET_NOTE_TYPE(ARM_PACA_KEYS), + .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t), + .size = sizeof(__uint128_t), + .align = sizeof(__uint128_t), + .regset_get = pac_address_keys_get, + .set = pac_address_keys_set, + }, + [REGSET_PACG_KEYS] = { + USER_REGSET_NOTE_TYPE(ARM_PACG_KEYS), + .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t), + .size = sizeof(__uint128_t), + .align = sizeof(__uint128_t), + .regset_get = pac_generic_keys_get, + .set = pac_generic_keys_set, + }, +#endif +#endif +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI + [REGSET_TAGGED_ADDR_CTRL] = { + USER_REGSET_NOTE_TYPE(ARM_TAGGED_ADDR_CTRL), + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = tagged_addr_ctrl_get, + .set = tagged_addr_ctrl_set, + }, +#endif +#ifdef CONFIG_ARM64_POE + [REGSET_POE] = { + USER_REGSET_NOTE_TYPE(ARM_POE), + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = poe_get, + .set = poe_set, + }, +#endif +#ifdef CONFIG_ARM64_GCS + [REGSET_GCS] = { + USER_REGSET_NOTE_TYPE(ARM_GCS), + .n = sizeof(struct user_gcs) / sizeof(u64), + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = gcs_get, + .set = gcs_set, + }, #endif }; @@ -1082,63 +1794,36 @@ static const struct user_regset_view user_aarch64_view = { .regsets = aarch64_regsets, .n = ARRAY_SIZE(aarch64_regsets) }; -#ifdef CONFIG_COMPAT enum compat_regset { REGSET_COMPAT_GPR, REGSET_COMPAT_VFP, }; +static inline compat_ulong_t compat_get_user_reg(struct task_struct *task, int idx) +{ + struct pt_regs *regs = task_pt_regs(task); + + switch (idx) { + case 15: + return regs->pc; + case 16: + return pstate_to_compat_psr(regs->pstate); + case 17: + return regs->orig_x0; + default: + return regs->regs[idx]; + } +} + static int compat_gpr_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - int ret = 0; - unsigned int i, start, num_regs; - - /* Calculate the number of AArch32 registers contained in count */ - num_regs = count / regset->size; - - /* Convert pos into an register number */ - start = pos / regset->size; - - if (start + num_regs > regset->n) - return -EIO; - - for (i = 0; i < num_regs; ++i) { - unsigned int idx = start + i; - compat_ulong_t reg; - - switch (idx) { - case 15: - reg = task_pt_regs(target)->pc; - break; - case 16: - reg = task_pt_regs(target)->pstate; - reg = pstate_to_compat_psr(reg); - break; - case 17: - reg = task_pt_regs(target)->orig_x0; - break; - default: - reg = task_pt_regs(target)->regs[idx]; - } + int i = 0; - if (kbuf) { - 
memcpy(kbuf, &reg, sizeof(reg));
-			kbuf += sizeof(reg);
-		} else {
-			ret = copy_to_user(ubuf, &reg, sizeof(reg));
-			if (ret) {
-				ret = -EFAULT;
-				break;
-			}
-
-			ubuf += sizeof(reg);
-		}
-	}
-
-	return ret;
+	while (to.left)
+		membuf_store(&to, compat_get_user_reg(target, i++));
+	return 0;
 }
 
 static int compat_gpr_set(struct task_struct *target,
@@ -1205,12 +1890,13 @@ static int compat_gpr_set(struct task_struct *target,
 
 static int compat_vfp_get(struct task_struct *target,
 			  const struct user_regset *regset,
-			  unsigned int pos, unsigned int count,
-			  void *kbuf, void __user *ubuf)
+			  struct membuf to)
 {
 	struct user_fpsimd_state *uregs;
 	compat_ulong_t fpscr;
-	int ret, vregs_end_pos;
+
+	if (!system_supports_fpsimd())
+		return -EINVAL;
 
 	uregs = &target->thread.uw.fpsimd_state;
 
@@ -1221,19 +1907,10 @@ static int compat_vfp_get(struct task_struct *target,
 	 * The VFP registers are packed into the fpsimd_state, so they all sit
 	 * nicely together for us. We just need to create the fpscr separately.
 	 */
-	vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t);
-	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs,
-				  0, vregs_end_pos);
-
-	if (count && !ret) {
-		fpscr = (uregs->fpsr & VFP_FPSCR_STAT_MASK) |
-			(uregs->fpcr & VFP_FPSCR_CTRL_MASK);
-
-		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpscr,
-					  vregs_end_pos, VFP_STATE_SIZE);
-	}
-
-	return ret;
+	membuf_write(&to, uregs, VFP_STATE_SIZE - sizeof(compat_ulong_t));
+	fpscr = (uregs->fpsr & VFP_FPSCR_STAT_MASK) |
+		(uregs->fpcr & VFP_FPSCR_CTRL_MASK);
+	return membuf_store(&to, fpscr);
 }
 
 static int compat_vfp_set(struct task_struct *target,
@@ -1245,6 +1922,9 @@ static int compat_vfp_set(struct task_struct *target,
 	compat_ulong_t fpscr;
 	int ret, vregs_end_pos;
 
+	if (!system_supports_fpsimd())
+		return -EINVAL;
+
 	uregs = &target->thread.uw.fpsimd_state;
 
 	vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t);
@@ -1265,11 +1945,10 @@ static int compat_vfp_set(struct task_struct *target,
 }
 
 static int compat_tls_get(struct task_struct *target,
-			  const struct user_regset *regset, unsigned int pos,
-			  unsigned int count, void *kbuf, void __user *ubuf)
+			  const struct user_regset *regset,
+			  struct membuf to)
 {
-	compat_ulong_t tls = (compat_ulong_t)target->thread.uw.tp_value;
-	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &tls, 0, -1);
+	return membuf_store(&to, (compat_ulong_t)target->thread.uw.tp_value);
 }
 
 static int compat_tls_set(struct task_struct *target,
@@ -1290,19 +1969,20 @@ static int compat_tls_set(struct task_struct *target,
 
 static const struct user_regset aarch32_regsets[] = {
 	[REGSET_COMPAT_GPR] = {
-		.core_note_type = NT_PRSTATUS,
+		USER_REGSET_NOTE_TYPE(PRSTATUS),
 		.n = COMPAT_ELF_NGREG,
 		.size = sizeof(compat_elf_greg_t),
 		.align = sizeof(compat_elf_greg_t),
-		.get = compat_gpr_get,
+		.regset_get = compat_gpr_get,
 		.set = compat_gpr_set
 	},
 	[REGSET_COMPAT_VFP] = {
-		.core_note_type = NT_ARM_VFP,
+		USER_REGSET_NOTE_TYPE(ARM_VFP),
 		.n = VFP_STATE_SIZE / sizeof(compat_ulong_t),
 		.size = sizeof(compat_ulong_t),
 		.align = sizeof(compat_ulong_t),
-		.get = compat_vfp_get,
+		.active = fpr_active,
+		.regset_get = compat_vfp_get,
 		.set = compat_vfp_set
 	},
 };
 
@@ -1314,53 +1994,53 @@ static const struct user_regset_view user_aarch32_view = {
 
 static const struct user_regset aarch32_ptrace_regsets[] = {
 	[REGSET_GPR] = {
-		.core_note_type = NT_PRSTATUS,
+		USER_REGSET_NOTE_TYPE(PRSTATUS),
 		.n = COMPAT_ELF_NGREG,
 		.size = sizeof(compat_elf_greg_t),
 		.align = sizeof(compat_elf_greg_t),
-		.get = compat_gpr_get,
+		.regset_get = compat_gpr_get,
 		.set 
= compat_gpr_set }, [REGSET_FPR] = { - .core_note_type = NT_ARM_VFP, + USER_REGSET_NOTE_TYPE(ARM_VFP), .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), - .get = compat_vfp_get, + .regset_get = compat_vfp_get, .set = compat_vfp_set }, [REGSET_TLS] = { - .core_note_type = NT_ARM_TLS, + USER_REGSET_NOTE_TYPE(ARM_TLS), .n = 1, .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), - .get = compat_tls_get, + .regset_get = compat_tls_get, .set = compat_tls_set, }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { - .core_note_type = NT_ARM_HW_BREAK, + USER_REGSET_NOTE_TYPE(ARM_HW_BREAK), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, [REGSET_HW_WATCH] = { - .core_note_type = NT_ARM_HW_WATCH, + USER_REGSET_NOTE_TYPE(ARM_HW_WATCH), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = hw_break_get, + .regset_get = hw_break_get, .set = hw_break_set, }, #endif [REGSET_SYSTEM_CALL] = { - .core_note_type = NT_ARM_SYSTEM_CALL, + USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL), .n = 1, .size = sizeof(int), .align = sizeof(int), - .get = system_call_get, + .regset_get = system_call_get, .set = system_call_set, }, }; @@ -1370,6 +2050,7 @@ static const struct user_regset_view user_aarch32_ptrace_view = { .regsets = aarch32_ptrace_regsets, .n = ARRAY_SIZE(aarch32_ptrace_regsets) }; +#ifdef CONFIG_COMPAT static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t __user *ret) { @@ -1385,9 +2066,7 @@ static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, else if (off == COMPAT_PT_TEXT_END_ADDR) tmp = tsk->mm->end_code; else if (off < sizeof(compat_elf_gregset_t)) - return copy_regset_to_user(tsk, &user_aarch32_view, - REGSET_COMPAT_GPR, off, - sizeof(compat_ulong_t), ret); + tmp = compat_get_user_reg(tsk, off >> 2); else if (off >= COMPAT_USER_SZ) return -EIO; else @@ -1399,8 +2078,8 @@ static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t val) { - int ret; - mm_segment_t old_fs = get_fs(); + struct pt_regs newregs = *task_pt_regs(tsk); + unsigned int idx = off / 4; if (off & 3 || off >= COMPAT_USER_SZ) return -EIO; @@ -1408,14 +2087,25 @@ static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off, if (off >= sizeof(compat_elf_gregset_t)) return 0; - set_fs(KERNEL_DS); - ret = copy_regset_from_user(tsk, &user_aarch32_view, - REGSET_COMPAT_GPR, off, - sizeof(compat_ulong_t), - &val); - set_fs(old_fs); + switch (idx) { + case 15: + newregs.pc = val; + break; + case 16: + newregs.pstate = compat_psr_to_pstate(val); + break; + case 17: + newregs.orig_x0 = val; + break; + default: + newregs.regs[idx] = val; + } - return ret; + if (!valid_user_regs(&newregs.user_regs, tsk)) + return -EINVAL; + + *task_pt_regs(tsk) = newregs; + return 0; } #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -1622,7 +2312,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, const struct user_regset_view *task_user_regset_view(struct task_struct *task) { -#ifdef CONFIG_COMPAT /* * Core dumping of 32-bit tasks or compat ptrace requests must use the * user_aarch32_view compatible with arm32. 
Native ptrace requests on @@ -1633,13 +2322,19 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) return &user_aarch32_view; else if (is_compat_thread(task_thread_info(task))) return &user_aarch32_ptrace_view; -#endif + return &user_aarch64_view; } long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { + switch (request) { + case PTRACE_PEEKMTETAGS: + case PTRACE_POKEMTETAGS: + return mte_ptrace_copy_tags(child, request, addr, data); + } + return ptrace_request(child, request, addr, data); } @@ -1648,36 +2343,63 @@ enum ptrace_syscall_dir { PTRACE_SYSCALL_EXIT, }; -static void tracehook_report_syscall(struct pt_regs *regs, - enum ptrace_syscall_dir dir) +static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) { int regno; unsigned long saved_reg; /* - * A scratch register (ip(r12) on AArch32, x7 on AArch64) is - * used to denote syscall entry/exit: + * We have some ABI weirdness here in the way that we handle syscall + * exit stops because we indicate whether or not the stop has been + * signalled from syscall entry or syscall exit by clobbering a general + * purpose register (ip/r12 for AArch32, x7 for AArch64) in the tracee + * and restoring its old value after the stop. This means that: + * + * - Any writes by the tracer to this register during the stop are + * ignored/discarded. + * + * - The actual value of the register is not available during the stop, + * so the tracer cannot save it and restore it later. + * + * - Syscall stops behave differently to seccomp and pseudo-step traps + * (the latter do not nobble any registers). */ regno = (is_compat_task() ? 12 : 7); saved_reg = regs->regs[regno]; regs->regs[regno] = dir; - if (dir == PTRACE_SYSCALL_EXIT) - tracehook_report_syscall_exit(regs, 0); - else if (tracehook_report_syscall_entry(regs)) - forget_syscall(regs); + if (dir == PTRACE_SYSCALL_ENTER) { + if (ptrace_report_syscall_entry(regs)) + forget_syscall(regs); + regs->regs[regno] = saved_reg; + } else if (!test_thread_flag(TIF_SINGLESTEP)) { + ptrace_report_syscall_exit(regs, 0); + regs->regs[regno] = saved_reg; + } else { + regs->regs[regno] = saved_reg; - regs->regs[regno] = saved_reg; + /* + * Signal a pseudo-step exception since we are stepping but + * tracer modifications to the registers may have rewound the + * state machine. + */ + ptrace_report_syscall_exit(regs, 1); + } } int syscall_trace_enter(struct pt_regs *regs) { - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); + unsigned long flags = read_thread_flags(); + + if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) { + report_syscall(regs, PTRACE_SYSCALL_ENTER); + if (flags & _TIF_SYSCALL_EMU) + return NO_SYSCALL; + } /* Do the secure computing after ptrace; failures should be fast. 
*/ - if (secure_computing(NULL) == -1) - return -1; + if (secure_computing() == -1) + return NO_SYSCALL; if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, regs->syscallno); @@ -1690,31 +2412,34 @@ int syscall_trace_enter(struct pt_regs *regs) void syscall_trace_exit(struct pt_regs *regs) { + unsigned long flags = read_thread_flags(); + audit_syscall_exit(regs); - if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) - trace_sys_exit(regs, regs_return_value(regs)); + if (flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); + if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP)) + report_syscall(regs, PTRACE_SYSCALL_EXIT); rseq_syscall(regs); } /* - * SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487C.a - * We also take into account DIT (bit 24), which is not yet documented, and - * treat PAN and UAO as RES0 bits, as they are meaningless at EL0, and may be - * allocated an EL0 meaning in future. + * SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487D.a. + * We permit userspace to set SSBS (AArch64 bit 12, AArch32 bit 23) which is + * not described in ARM DDI 0487D.a. + * We treat PAN and UAO as RES0 bits, as they are meaningless at EL0, and may + * be allocated an EL0 meaning in future. * Userspace cannot use these until they have an architectural meaning. * Note that this follows the SPSR_ELx format, not the AArch32 PSR format. * We also reserve IL for the kernel; SS is handled dynamically. */ #define SPSR_EL1_AARCH64_RES0_BITS \ - (GENMASK_ULL(63,32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \ - GENMASK_ULL(20, 10) | GENMASK_ULL(5, 5)) + (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 26) | GENMASK_ULL(23, 22) | \ + GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5)) #define SPSR_EL1_AARCH32_RES0_BITS \ - (GENMASK_ULL(63,32) | GENMASK_ULL(23, 22) | GENMASK_ULL(20,20)) + (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20)) static int valid_compat_regs(struct user_pt_regs *regs) { @@ -1772,8 +2497,8 @@ static int valid_native_regs(struct user_pt_regs *regs) */ int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task) { - if (!test_tsk_thread_flag(task, TIF_SINGLESTEP)) - regs->pstate &= ~DBG_SPSR_SS; + /* https://lore.kernel.org/lkml/20191118131525.GA4180@willie-the-truck */ + user_regs_reset_single_step(regs, task); if (is_compat_thread(task_thread_info(task))) return valid_compat_regs(regs); diff --git a/arch/arm64/kernel/reloc_test_core.c b/arch/arm64/kernel/reloc_test_core.c index 5915ce5759cc..5b0891146054 100644 --- a/arch/arm64/kernel/reloc_test_core.c +++ b/arch/arm64/kernel/reloc_test_core.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * */ #include <linux/module.h> @@ -52,7 +48,7 @@ static struct { { "R_AARCH64_PREL16", relative_data16, (u64)&sym64_rel }, }; -static int reloc_test_init(void) +static int __init reloc_test_init(void) { int i; @@ -71,11 +67,12 @@ static int reloc_test_init(void) return 0; } -static void reloc_test_exit(void) +static void __exit reloc_test_exit(void) { } module_init(reloc_test_init); module_exit(reloc_test_exit); +MODULE_DESCRIPTION("Relocation testing module"); MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/kernel/reloc_test_syms.S b/arch/arm64/kernel/reloc_test_syms.S index 2b8d9cb8b078..c50f45fa29fa 100644 --- a/arch/arm64/kernel/reloc_test_syms.S +++ b/arch/arm64/kernel/reloc_test_syms.S @@ -1,89 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * */ #include <linux/linkage.h> -ENTRY(absolute_data64) +SYM_FUNC_START(absolute_data64) ldr x0, 0f ret 0: .quad sym64_abs -ENDPROC(absolute_data64) +SYM_FUNC_END(absolute_data64) -ENTRY(absolute_data32) +SYM_FUNC_START(absolute_data32) ldr w0, 0f ret 0: .long sym32_abs -ENDPROC(absolute_data32) +SYM_FUNC_END(absolute_data32) -ENTRY(absolute_data16) +SYM_FUNC_START(absolute_data16) adr x0, 0f ldrh w0, [x0] ret 0: .short sym16_abs, 0 -ENDPROC(absolute_data16) +SYM_FUNC_END(absolute_data16) -ENTRY(signed_movw) +SYM_FUNC_START(signed_movw) movz x0, #:abs_g2_s:sym64_abs movk x0, #:abs_g1_nc:sym64_abs movk x0, #:abs_g0_nc:sym64_abs ret -ENDPROC(signed_movw) +SYM_FUNC_END(signed_movw) -ENTRY(unsigned_movw) +SYM_FUNC_START(unsigned_movw) movz x0, #:abs_g3:sym64_abs movk x0, #:abs_g2_nc:sym64_abs movk x0, #:abs_g1_nc:sym64_abs movk x0, #:abs_g0_nc:sym64_abs ret -ENDPROC(unsigned_movw) +SYM_FUNC_END(unsigned_movw) .align 12 .space 0xff8 -ENTRY(relative_adrp) +SYM_FUNC_START(relative_adrp) adrp x0, sym64_rel add x0, x0, #:lo12:sym64_rel ret -ENDPROC(relative_adrp) +SYM_FUNC_END(relative_adrp) .align 12 .space 0xffc -ENTRY(relative_adrp_far) +SYM_FUNC_START(relative_adrp_far) adrp x0, memstart_addr add x0, x0, #:lo12:memstart_addr ret -ENDPROC(relative_adrp_far) +SYM_FUNC_END(relative_adrp_far) -ENTRY(relative_adr) +SYM_FUNC_START(relative_adr) adr x0, sym64_rel ret -ENDPROC(relative_adr) +SYM_FUNC_END(relative_adr) -ENTRY(relative_data64) +SYM_FUNC_START(relative_data64) adr x1, 0f ldr x0, [x1] add x0, x0, x1 ret 0: .quad sym64_rel - . -ENDPROC(relative_data64) +SYM_FUNC_END(relative_data64) -ENTRY(relative_data32) +SYM_FUNC_START(relative_data32) adr x1, 0f ldr w0, [x1] add x0, x0, x1 ret 0: .long sym64_rel - . -ENDPROC(relative_data32) +SYM_FUNC_END(relative_data32) -ENTRY(relative_data16) +SYM_FUNC_START(relative_data16) adr x1, 0f ldrsh w0, [x1] add x0, x0, x1 ret 0: .short sym64_rel - ., 0 -ENDPROC(relative_data16) +SYM_FUNC_END(relative_data16) diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 95fd94209aae..413f899e4ac6 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -1,12 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * kexec for arm64 * * Copyright (C) Linaro. * Copyright (C) Huawei Futurewei Technologies. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * Copyright (C) 2021, Microsoft Corporation. + * Pasha Tatashin <pasha.tatashin@soleen.com> */ #include <linux/kexec.h> @@ -16,117 +15,86 @@ #include <asm/kexec.h> #include <asm/page.h> #include <asm/sysreg.h> +#include <asm/virt.h> + +.macro turn_off_mmu tmp1, tmp2 + mov_q \tmp1, INIT_SCTLR_EL1_MMU_OFF + pre_disable_mmu_workaround + msr sctlr_el1, \tmp1 + isb +.endm +.section ".kexec_relocate.text", "ax" /* * arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it. * - * The memory that the old kernel occupies may be overwritten when coping the + * The memory that the old kernel occupies may be overwritten when copying the * new image to its final location. To assure that the * arm64_relocate_new_kernel routine which does that copy is not overwritten, * all code and data needed by arm64_relocate_new_kernel must be between the * symbols arm64_relocate_new_kernel and arm64_relocate_new_kernel_end. The * machine_kexec() routine will copy arm64_relocate_new_kernel to the kexec - * control_code_page, a special page which has been set up to be preserved - * during the copy operation. + * safe memory that has been set up to be preserved during the copy operation. */ -ENTRY(arm64_relocate_new_kernel) +SYM_CODE_START(arm64_relocate_new_kernel) + /* + * The kimage structure isn't allocated specially and may be clobbered + * during relocation. We must load any values we need from it prior to + * any relocation occurring. + */ + ldr x28, [x0, #KIMAGE_START] + ldr x27, [x0, #KIMAGE_ARCH_EL2_VECTORS] + ldr x26, [x0, #KIMAGE_ARCH_DTB_MEM] /* Setup the list loop variables. */ - mov x18, x2 /* x18 = dtb address */ - mov x17, x1 /* x17 = kimage_start */ - mov x16, x0 /* x16 = kimage_head */ - raw_dcache_line_size x15, x0 /* x15 = dcache line size */ - mov x14, xzr /* x14 = entry ptr */ - mov x13, xzr /* x13 = copy dest */ - - /* Clear the sctlr_el2 flags. */ - mrs x0, CurrentEL - cmp x0, #CurrentEL_EL2 - b.ne 1f - mrs x0, sctlr_el2 - ldr x1, =SCTLR_ELx_FLAGS - bic x0, x0, x1 - pre_disable_mmu_workaround - msr sctlr_el2, x0 - isb -1: - - /* Check if the new image needs relocation. */ - tbnz x16, IND_DONE_BIT, .Ldone - + ldr x18, [x0, #KIMAGE_ARCH_ZERO_PAGE] /* x18 = zero page for BBM */ + ldr x17, [x0, #KIMAGE_ARCH_TTBR1] /* x17 = linear map copy */ + ldr x16, [x0, #KIMAGE_HEAD] /* x16 = kimage_head */ + ldr x22, [x0, #KIMAGE_ARCH_PHYS_OFFSET] /* x22 phys_offset */ + raw_dcache_line_size x15, x1 /* x15 = dcache line size */ + break_before_make_ttbr_switch x18, x17, x1, x2 /* set linear map */ .Lloop: and x12, x16, PAGE_MASK /* x12 = addr */ - + sub x12, x12, x22 /* Convert x12 to virt */ /* Test the entry flags. */ .Ltest_source: tbz x16, IND_SOURCE_BIT, .Ltest_indirection /* Invalidate dest page to PoC. 
*/ - mov x0, x13 - add x20, x0, #PAGE_SIZE - sub x1, x15, #1 - bic x0, x0, x1 -2: dc ivac, x0 - add x0, x0, x15 - cmp x0, x20 - b.lo 2b - dsb sy - - mov x20, x13 - mov x21, x12 - copy_page x20, x21, x0, x1, x2, x3, x4, x5, x6, x7 - - /* dest += PAGE_SIZE */ - add x13, x13, PAGE_SIZE + mov x19, x13 + copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8 + add x1, x19, #PAGE_SIZE + dcache_by_myline_op civac, sy, x19, x1, x15, x20 b .Lnext - .Ltest_indirection: tbz x16, IND_INDIRECTION_BIT, .Ltest_destination - - /* ptr = addr */ - mov x14, x12 + mov x14, x12 /* ptr = addr */ b .Lnext - .Ltest_destination: tbz x16, IND_DESTINATION_BIT, .Lnext - - /* dest = addr */ - mov x13, x12 - + mov x13, x12 /* dest = addr */ .Lnext: - /* entry = *ptr++ */ - ldr x16, [x14], #8 - - /* while (!(entry & DONE)) */ - tbz x16, IND_DONE_BIT, .Lloop - -.Ldone: + ldr x16, [x14], #8 /* entry = *ptr++ */ + tbz x16, IND_DONE_BIT, .Lloop /* while (!(entry & DONE)) */ /* wait for writes from copy_page to finish */ dsb nsh ic iallu dsb nsh isb + turn_off_mmu x12, x13 /* Start new image. */ - mov x0, x18 + cbz x27, .Lel1 + mov x1, x28 /* kernel entry point */ + mov x2, x26 /* dtb address */ + mov x3, xzr + mov x4, xzr + mov x0, #HVC_SOFT_RESTART + hvc #0 /* Jumps from el2 */ +.Lel1: + mov x0, x26 /* dtb address */ mov x1, xzr mov x2, xzr mov x3, xzr - br x17 - -ENDPROC(arm64_relocate_new_kernel) - -.ltorg - -.align 3 /* To keep the 64-bit values below naturally aligned. */ - -.Lcopy_end: -.org KEXEC_CONTROL_PAGE_SIZE - -/* - * arm64_relocate_new_kernel_size - Number of bytes to copy to the - * control_code_page. - */ -.globl arm64_relocate_new_kernel_size -arm64_relocate_new_kernel_size: - .quad .Lcopy_end - arm64_relocate_new_kernel + br x28 /* Jumps from el1 */ +SYM_CODE_END(arm64_relocate_new_kernel) diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 53c40196b607..68330017d04f 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -1,53 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/return_address.c * * Copyright (C) 2013 Linaro Limited * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
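The return_address() rework in the hunk that follows replaces the old walk_stackframe() usage with arch_stack_walk(). As an illustrative aside (not part of the patch): return_address(0) yields the return address of the function that called it, because the "level + 2" skips the unwinder entries for return_address() itself and for its caller's frame before recording the next PC. A minimal, hypothetical usage sketch under that reading:

#include <linux/ftrace.h>
#include <linux/printk.h>

/* demo_caller() is hypothetical and only illustrates the semantics. */
static noinline void demo_caller(void)
{
	void *ra = return_address(0);	/* a PC inside demo_caller()'s caller */

	pr_debug("demo_caller() invoked from %pS\n", ra);
}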
*/ #include <linux/export.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> +#include <linux/stacktrace.h> #include <asm/stack_pointer.h> -#include <asm/stacktrace.h> struct return_address_data { unsigned int level; void *addr; }; -static int save_return_addr(struct stackframe *frame, void *d) +static bool save_return_addr(void *d, unsigned long pc) { struct return_address_data *data = d; if (!data->level) { - data->addr = (void *)frame->pc; - return 1; + data->addr = (void *)pc; + return false; } else { --data->level; - return 0; + return true; } } +NOKPROBE_SYMBOL(save_return_addr); void *return_address(unsigned int level) { struct return_address_data data; - struct stackframe frame; data.level = level + 2; data.addr = NULL; - frame.fp = (unsigned long)__builtin_frame_address(0); - frame.pc = (unsigned long)return_address; /* dummy */ -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = 0; -#endif - - walk_stackframe(current, &frame, save_return_addr, &data); + arch_stack_walk(save_return_addr, &data, current, NULL); if (!data.level) return data.addr; @@ -55,3 +47,4 @@ void *return_address(unsigned int level) return NULL; } EXPORT_SYMBOL_GPL(return_address); +NOKPROBE_SYMBOL(return_address); diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c new file mode 100644 index 000000000000..c64a06f58c0b --- /dev/null +++ b/arch/arm64/kernel/rsi.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include <linux/jump_label.h> +#include <linux/memblock.h> +#include <linux/psci.h> +#include <linux/swiotlb.h> +#include <linux/cc_platform.h> +#include <linux/platform_device.h> + +#include <asm/io.h> +#include <asm/mem_encrypt.h> +#include <asm/rsi.h> + +static struct realm_config config; + +unsigned long prot_ns_shared; +EXPORT_SYMBOL(prot_ns_shared); + +DEFINE_STATIC_KEY_FALSE_RO(rsi_present); +EXPORT_SYMBOL(rsi_present); + +bool cc_platform_has(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return is_realm_world(); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(cc_platform_has); + +static bool rsi_version_matches(void) +{ + unsigned long ver_lower, ver_higher; + unsigned long ret = rsi_request_version(RSI_ABI_VERSION, + &ver_lower, + &ver_higher); + + if (ret == SMCCC_RET_NOT_SUPPORTED) + return false; + + if (ret != RSI_SUCCESS) { + pr_err("RME: RMM doesn't support RSI version %lu.%lu. Supported range: %lu.%lu-%lu.%lu\n", + RSI_ABI_VERSION_MAJOR, RSI_ABI_VERSION_MINOR, + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower), + RSI_ABI_VERSION_GET_MAJOR(ver_higher), + RSI_ABI_VERSION_GET_MINOR(ver_higher)); + return false; + } + + pr_info("RME: Using RSI version %lu.%lu\n", + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower)); + + return true; +} + +static void __init arm64_rsi_setup_memory(void) +{ + u64 i; + phys_addr_t start, end; + + /* + * Iterate over the available memory ranges and convert the state to + * protected memory. We should take extra care to ensure that we DO NOT + * permit any "DESTROYED" pages to be converted to "RAM". + * + * panic() is used because if the attempt to switch the memory to + * protected has failed here, then future accesses to the memory are + * simply going to be reflected as a SEA (Synchronous External Abort) + * which we can't handle. Bailing out early prevents the guest limping + * on and dying later. 
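For the cc_platform_has() hook added in rsi.c above, a brief usage sketch (illustrative only; demo_needs_shared_dma_buffer() is hypothetical and not part of the patch). Generic code keys memory-encryption decisions off CC_ATTR_MEM_ENCRYPT, which this hook maps to "running in a realm":

#include <linux/cc_platform.h>
#include <linux/types.h>

/* Hypothetical consumer: only ask for a shared (decrypted) alias of a
 * DMA buffer when the platform reports memory encryption, i.e. when we
 * are in a realm on this architecture. */
static bool demo_needs_shared_dma_buffer(void)
{
	return cc_platform_has(CC_ATTR_MEM_ENCRYPT);
}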
+ */ + for_each_mem_range(i, &start, &end) { + if (rsi_set_memory_range_protected_safe(start, end)) { + panic("Failed to set memory range to protected: %pa-%pa", + &start, &end); + } + } +} + +/* + * Check if a given PA range is Trusted (e.g., Protected memory, a Trusted Device + * mapping, or an MMIO emulated in the Realm world). + * + * We can rely on the RIPAS value of the region to detect if a given region is + * protected. + * + * RIPAS_DEV - A trusted device memory or a trusted emulated MMIO (in the Realm + * world + * RIPAS_RAM - Memory (RAM), protected by the RMM guarantees. (e.g., Firmware + * reserved regions for data sharing). + * + * RIPAS_DESTROYED is a special case of one of the above, where the host did + * something without our permission and as such we can't do anything about it. + * + * The only case where something is emulated by the untrusted hypervisor or is + * backed by shared memory is indicated by RSI_RIPAS_EMPTY. + */ +bool arm64_rsi_is_protected(phys_addr_t base, size_t size) +{ + enum ripas ripas; + phys_addr_t end, top; + + /* Overflow ? */ + if (WARN_ON(base + size <= base)) + return false; + + end = ALIGN(base + size, RSI_GRANULE_SIZE); + base = ALIGN_DOWN(base, RSI_GRANULE_SIZE); + + while (base < end) { + if (WARN_ON(rsi_ipa_state_get(base, end, &ripas, &top))) + break; + if (WARN_ON(top <= base)) + break; + if (ripas == RSI_RIPAS_EMPTY) + break; + base = top; + } + + return base >= end; +} +EXPORT_SYMBOL(arm64_rsi_is_protected); + +static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) +{ + if (arm64_rsi_is_protected(phys, size)) + *prot = pgprot_encrypted(*prot); + else + *prot = pgprot_decrypted(*prot); + + return 0; +} + +void __init arm64_rsi_init(void) +{ + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) + return; + if (!rsi_version_matches()) + return; + if (WARN_ON(rsi_get_realm_config(&config))) + return; + prot_ns_shared = BIT(config.ipa_bits - 1); + + if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) + return; + + if (realm_register_memory_enc_ops()) + return; + + arm64_rsi_setup_memory(); + + static_branch_enable(&rsi_present); +} + +static struct platform_device rsi_dev = { + .name = RSI_PDEV_NAME, + .id = PLATFORM_DEVID_NONE +}; + +static int __init arm64_create_dummy_rsi_dev(void) +{ + if (is_realm_world() && + platform_device_register(&rsi_dev)) + pr_err("failed to register rsi platform device\n"); + return 0; +} + +arch_initcall(arm64_create_dummy_rsi_dev) diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 5ba4465e44f0..778f2a1faac8 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -2,13 +2,16 @@ // Copyright (C) 2017 Arm Ltd. 
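Before the SDEI changes, one worked note on arm64_rsi_init() above (illustrative; the 48-bit IPA width is an assumption, not taken from the patch):

/*
 * Example: if rsi_get_realm_config() reports ipa_bits == 48, then
 * prot_ns_shared == BIT(47). Setting that bit in an output address, as
 * the pgprot_decrypted() path arranged by the ioremap/encryption hooks
 * above does, selects the shared, untrusted alias of a page, while
 * leaving it clear addresses the protected alias.
 */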
#define pr_fmt(fmt) "sdei: " fmt +#include <linux/arm-smccc.h> #include <linux/arm_sdei.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/sched/task_stack.h> +#include <linux/scs.h> #include <linux/uaccess.h> #include <asm/alternative.h> +#include <asm/exception.h> #include <asm/kprobes.h> #include <asm/mmu.h> #include <asm/ptrace.h> @@ -31,11 +34,20 @@ unsigned long sdei_exit_mode; DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); -#ifdef CONFIG_VMAP_STACK DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); + +DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); +DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); + +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); +DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); #endif +DEFINE_PER_CPU(struct sdei_registered_event *, sdei_active_normal_event); +DEFINE_PER_CPU(struct sdei_registered_event *, sdei_active_critical_event); + static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu) { unsigned long *p; @@ -89,52 +101,60 @@ static int init_sdei_stacks(void) return err; } -static bool on_sdei_normal_stack(unsigned long sp, struct stack_info *info) +static void _free_sdei_scs(unsigned long * __percpu *ptr, int cpu) { - unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); - unsigned long high = low + SDEI_STACK_SIZE; + void *s; - if (sp < low || sp >= high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_SDEI_NORMAL; + s = per_cpu(*ptr, cpu); + if (s) { + per_cpu(*ptr, cpu) = NULL; + scs_free(s); } - - return true; } -static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info) +static void free_sdei_scs(void) { - unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr); - unsigned long high = low + SDEI_STACK_SIZE; - - if (sp < low || sp >= high) - return false; + int cpu; - if (info) { - info->low = low; - info->high = high; - info->type = STACK_TYPE_SDEI_CRITICAL; + for_each_possible_cpu(cpu) { + _free_sdei_scs(&sdei_shadow_call_stack_normal_ptr, cpu); + _free_sdei_scs(&sdei_shadow_call_stack_critical_ptr, cpu); } +} + +static int _init_sdei_scs(unsigned long * __percpu *ptr, int cpu) +{ + void *s; - return true; + s = scs_alloc(cpu_to_node(cpu)); + if (!s) + return -ENOMEM; + per_cpu(*ptr, cpu) = s; + + return 0; } -bool _on_sdei_stack(unsigned long sp, struct stack_info *info) +static int init_sdei_scs(void) { - if (!IS_ENABLED(CONFIG_VMAP_STACK)) - return false; + int cpu; + int err = 0; - if (on_sdei_critical_stack(sp, info)) - return true; + if (!scs_is_enabled()) + return 0; - if (on_sdei_normal_stack(sp, info)) - return true; + for_each_possible_cpu(cpu) { + err = _init_sdei_scs(&sdei_shadow_call_stack_normal_ptr, cpu); + if (err) + break; + err = _init_sdei_scs(&sdei_shadow_call_stack_critical_ptr, cpu); + if (err) + break; + } + + if (err) + free_sdei_scs(); - return false; + return err; } unsigned long sdei_arch_get_entry_point(int conduit) @@ -145,17 +165,18 @@ unsigned long sdei_arch_get_entry_point(int conduit) * dropped to EL1 because we don't support VHE, then we can't support * SDEI. 
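The conduit check that follows replaces an open-coded EL2 test with is_hyp_nvhe(). An illustrative sketch of the equivalence being relied on, based on the helper's definition in <asm/virt.h> (which is not part of this diff):

#include <asm/virt.h>

/* Not part of the patch: the old predicate, spelled out. It reads as
 * "EL2 was available at boot but the kernel runs at EL1 (nVHE)", which
 * is the configuration SDEI cannot support here. */
static inline bool demo_old_style_nvhe_check(void)
{
	return is_hyp_mode_available() && !is_kernel_in_hyp_mode();
}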
*/ - if (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) { + if (is_hyp_nvhe()) { pr_err("Not supported on this hardware/boot configuration\n"); - return 0; + goto out_err; } - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - if (init_sdei_stacks()) - return 0; - } + if (init_sdei_stacks()) + goto out_err; - sdei_exit_mode = (conduit == CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC; + if (init_sdei_scs()) + goto out_err_free_stacks; + + sdei_exit_mode = (conduit == SMCCC_CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC; #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 if (arm64_kernel_unmapped_at_el0()) { @@ -168,16 +189,20 @@ unsigned long sdei_arch_get_entry_point(int conduit) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ return (unsigned long)__sdei_asm_handler; +out_err_free_stacks: + free_sdei_stacks(); +out_err: + return 0; } /* - * __sdei_handler() returns one of: + * do_sdei_event() returns one of: * SDEI_EV_HANDLED - success, return to the interrupted context. - * SDEI_EV_FAILED - failure, return this error code to firmare. + * SDEI_EV_FAILED - failure, return this error code to firmware. * virtual-address - success, return to this address. */ -static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, - struct sdei_registered_event *arg) +unsigned long __kprobes do_sdei_event(struct pt_regs *regs, + struct sdei_registered_event *arg) { u32 mode; int i, err = 0; @@ -195,12 +220,6 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, sdei_api_event_context(i, ®s->regs[i]); } - /* - * We didn't take an exception to get here, set PAN. UAO will be cleared - * by sdei_event_handler()s set_fs(USER_DS) call. - */ - __uaccess_enable_hw_pan(); - err = sdei_event_handler(regs, arg); if (err) return SDEI_EV_FAILED; @@ -220,7 +239,7 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, * If we interrupted the kernel with interrupts masked, we always go * back to wherever we came from. */ - if (mode == kernel_mode && !interrupts_enabled(regs)) + if (mode == kernel_mode && regs_irqs_disabled(regs)) return SDEI_EV_HANDLED; /* @@ -238,28 +257,3 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, return vbar + 0x480; } - - -asmlinkage __kprobes notrace unsigned long -__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) -{ - unsigned long ret; - bool do_nmi_exit = false; - - /* - * nmi_enter() deals with printk() re-entrance and use of RCU when - * RCU believed this CPU was idle. Because critical events can - * interrupt normal events, we may already be in_nmi(). - */ - if (!in_nmi()) { - nmi_enter(); - do_nmi_exit = true; - } - - ret = _sdei_handler(regs, arg); - - if (do_nmi_exit) - nmi_exit(); - - return ret; -} diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 4b0e1231625c..23c05dc7a8f2 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/setup.c * * Copyright (C) 1995-2001 Russell King * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/acpi.h> @@ -34,12 +23,14 @@ #include <linux/interrupt.h> #include <linux/smp.h> #include <linux/fs.h> +#include <linux/panic_notifier.h> #include <linux/proc_fs.h> #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/efi.h> #include <linux/psci.h> #include <linux/sched/task.h> +#include <linux/scs.h> #include <linux/mm.h> #include <asm/acpi.h> @@ -52,13 +43,14 @@ #include <asm/cpu_ops.h> #include <asm/kasan.h> #include <asm/numa.h> +#include <asm/rsi.h> +#include <asm/scs.h> #include <asm/sections.h> #include <asm/setup.h> #include <asm/smp_plat.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/traps.h> -#include <asm/memblock.h> #include <asm/efi.h> #include <asm/xen/hypervisor.h> #include <asm/mmu_context.h> @@ -67,6 +59,7 @@ static int num_standard_resources; static struct resource *standard_resources; phys_addr_t __fdt_pointer __initdata; +u64 mmu_enabled_at_boot __initdata; /* * Standard memory resources @@ -97,14 +90,8 @@ u64 __cacheline_aligned boot_args[4]; void __init smp_setup_processor_id(void) { u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; - cpu_logical_map(0) = mpidr; + set_cpu_logical_map(0, mpidr); - /* - * clear __my_cpu_offset on boot CPU to avoid hang caused by - * using percpu variable early, for example, lockdep will - * access percpu variable inside lock_release - */ - set_my_cpu_offset(0); pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n", (unsigned long)mpidr, read_cpuid_id()); } @@ -182,20 +169,36 @@ static void __init smp_build_mpidr_hash(void) static void __init setup_machine_fdt(phys_addr_t dt_phys) { - void *dt_virt = fixmap_remap_fdt(dt_phys); + int size = 0; + void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); const char *name; - if (!dt_virt || !early_init_dt_scan(dt_virt)) { + if (dt_virt) + memblock_reserve(dt_phys, size); + + /* + * dt_virt is a fixmap address, hence __pa(dt_virt) can't be used. + * Pass dt_phys directly. + */ + if (!early_init_dt_scan(dt_virt, dt_phys)) { pr_crit("\n" - "Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n" - "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n" - "\nPlease check your bootloader.", - &dt_phys, dt_virt); + "Error: invalid device tree blob: PA=%pa, VA=%px, size=%d bytes\n" + "The dtb must be 8-byte aligned and must not exceed 2 MB in size.\n" + "\nPlease check your bootloader.\n", + &dt_phys, dt_virt, size); + /* + * Note that in this _really_ early stage we cannot even BUG() + * or oops, so the least terrible thing to do is cpu_relax(), + * or else we could end-up printing non-initialized data, etc. 
+ */ while (true) cpu_relax(); } + /* Early fixups are done, map the FDT as read-only now */ + fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO); + name = of_flat_dt_get_machine_name(); if (!name) return; @@ -209,43 +212,34 @@ static void __init request_standard_resources(void) struct memblock_region *region; struct resource *res; unsigned long i = 0; + size_t res_size; kernel_code.start = __pa_symbol(_text); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); + insert_resource(&iomem_resource, &kernel_code); + insert_resource(&iomem_resource, &kernel_data); num_standard_resources = memblock.memory.cnt; - standard_resources = memblock_alloc_low(num_standard_resources * - sizeof(*standard_resources), - SMP_CACHE_BYTES); + res_size = num_standard_resources * sizeof(*standard_resources); + standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); - for_each_memblock(memory, region) { + for_each_mem_region(region) { res = &standard_resources[i++]; if (memblock_is_nomap(region)) { res->name = "reserved"; res->flags = IORESOURCE_MEM; + res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region)); + res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1; } else { res->name = "System RAM"; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); + res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; } - res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); - res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; - - request_resource(&iomem_resource, res); - - if (kernel_code.start >= res->start && - kernel_code.end <= res->end) - request_resource(res, &kernel_code); - if (kernel_data.start >= res->start && - kernel_data.end <= res->end) - request_resource(res, &kernel_data); -#ifdef CONFIG_KEXEC_CORE - /* Userspace will find "Crash kernel" region in /proc/iomem. */ - if (crashk_res.end && crashk_res.start >= res->start && - crashk_res.end <= res->end) - request_resource(res, &crashk_res); -#endif + + insert_resource(&iomem_resource, res); } } @@ -260,7 +254,7 @@ static int __init reserve_memblock_reserved_regions(void) if (!memblock_is_region_reserved(mem->start, mem_size)) continue; - for_each_reserved_mem_region(j, &r_start, &r_end) { + for_each_reserved_mem_range(j, &r_start, &r_end) { resource_size_t start, end; start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start); @@ -279,26 +273,43 @@ arch_initcall(reserve_memblock_reserved_regions); u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; -void __init setup_arch(char **cmdline_p) +u64 cpu_logical_map(unsigned int cpu) +{ + return __cpu_logical_map[cpu]; +} + +void __init __no_sanitize_address setup_arch(char **cmdline_p) { - init_mm.start_code = (unsigned long) _text; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = (unsigned long) _end; + setup_initial_init_mm(_text, _etext, _edata, _end); *cmdline_p = boot_command_line; + kaslr_init(); + early_fixmap_init(); early_ioremap_init(); setup_machine_fdt(__fdt_pointer); + /* + * Initialise the static keys early as they may be enabled by the + * cpufeature code and early parameters. + */ + jump_label_init(); parse_early_param(); + dynamic_scs_init(); + /* - * Unmask asynchronous aborts and fiq after bringing up possible - * earlycon. 
(Report possible System Errors once we can report this - * occurred). + * The primary CPU enters the kernel with all DAIF exceptions masked. + * + * We must unmask Debug and SError before preemption or scheduling is + * possible to ensure that these are consistently unmasked across + * threads, and we want to unmask SError as soon as possible after + * initializing earlycon so that we can report any SErrors immediately. + * + * IRQ and FIQ will be unmasked after the root irqchip has been + * detected and initialized. */ local_daif_restore(DAIF_PROCCTX_NOIRQ); @@ -310,10 +321,17 @@ void __init setup_arch(char **cmdline_p) xen_early_init(); efi_init(); + + if (!efi_enabled(EFI_BOOT)) { + if ((u64)_text % MIN_KIMG_ALIGN) + pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!"); + WARN_TAINT(mmu_enabled_at_boot, TAINT_FIRMWARE_WORKAROUND, + FW_BUG "Booted with MMU enabled!"); + } + arm64_memblock_init(); paging_init(); - efi_apply_persistent_mem_reservations(); acpi_table_upgrade(); @@ -336,7 +354,9 @@ void __init setup_arch(char **cmdline_p) else psci_acpi_init(); - cpu_read_bootcpu_ops(); + arm64_rsi_init(); + + init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); @@ -346,12 +366,9 @@ void __init setup_arch(char **cmdline_p) * faults in case uaccess_enable() is inadvertently called by the init * thread. */ - init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page); + init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); #endif -#ifdef CONFIG_VT - conswitchp = &dummy_con; -#endif if (boot_args[1] || boot_args[2] || boot_args[3]) { pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n" "\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n" @@ -360,28 +377,23 @@ void __init setup_arch(char **cmdline_p) } } -static int __init topology_init(void) +static inline bool cpu_can_disable(unsigned int cpu) { - int i; +#ifdef CONFIG_HOTPLUG_CPU + const struct cpu_operations *ops = get_cpu_ops(cpu); - for_each_online_node(i) - register_one_node(i); - - for_each_possible_cpu(i) { - struct cpu *cpu = &per_cpu(cpu_data.cpu, i); - cpu->hotpluggable = 1; - register_cpu(cpu, i); - } + if (ops && ops->cpu_can_disable) + return ops->cpu_can_disable(cpu); +#endif + return false; +} - return 0; +bool arch_cpu_is_hotpluggable(int num) +{ + return cpu_can_disable(num); } -subsys_initcall(topology_init); -/* - * Dump out kernel offset information on panic. 
- */ -static int dump_kernel_offset(struct notifier_block *self, unsigned long v, - void *p) +static void dump_kernel_offset(void) { const unsigned long offset = kaslr_offset(); @@ -392,17 +404,33 @@ static int dump_kernel_offset(struct notifier_block *self, unsigned long v, } else { pr_emerg("Kernel Offset: disabled\n"); } +} + +static int arm64_panic_block_dump(struct notifier_block *self, + unsigned long v, void *p) +{ + dump_kernel_offset(); + dump_cpu_features(); + dump_mem_limit(); return 0; } -static struct notifier_block kernel_offset_notifier = { - .notifier_call = dump_kernel_offset +static struct notifier_block arm64_panic_block = { + .notifier_call = arm64_panic_block_dump }; -static int __init register_kernel_offset_dumper(void) +static int __init register_arm64_panic_block(void) { atomic_notifier_chain_register(&panic_notifier_list, - &kernel_offset_notifier); + &arm64_panic_block); + return 0; +} +device_initcall(register_arm64_panic_block); + +static int __init check_mmu_enabled_at_boot(void) +{ + if (!efi_enabled(EFI_BOOT) && mmu_enabled_at_boot) + panic("Non-EFI boot detected with MMU and caches enabled"); return 0; } -__initcall(register_kernel_offset_dumper); +device_initcall_sync(check_mmu_enabled_at_boot); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 867a7cea70e5..1110eeb21f57 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -1,49 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/signal.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/cache.h> #include <linux/compat.h> #include <linux/errno.h> +#include <linux/irq-entry-common.h> #include <linux/kernel.h> #include <linux/signal.h> -#include <linux/personality.h> #include <linux/freezer.h> #include <linux/stddef.h> #include <linux/uaccess.h> #include <linux/sizes.h> #include <linux/string.h> -#include <linux/tracehook.h> #include <linux/ratelimit.h> +#include <linux/rseq.h> #include <linux/syscalls.h> +#include <linux/pkeys.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/elf.h> +#include <asm/exception.h> #include <asm/cacheflush.h> +#include <asm/gcs.h> #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> #include <asm/ptrace.h> +#include <asm/syscall.h> #include <asm/signal32.h> #include <asm/traps.h> #include <asm/vdso.h> +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. 
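One cross-reference worth spelling out for the GCS_SIGNAL_CAP() macro added above (an illustrative summary of code appearing later in this file, not new behaviour):

/*
 * On delivery, gcs_signal_entry() pushes the sigtramp address at
 * GCSPR_EL0 - 16 and a cap equal to GCS_SIGNAL_CAP(GCSPR_EL0 - 8) at
 * GCSPR_EL0 - 8, then moves GCSPR_EL0 down by 16. On sigreturn,
 * gcs_restore_signal() re-reads the word at GCSPR_EL0, only proceeds if
 * it equals GCS_SIGNAL_CAP(GCSPR_EL0), zeroes it so the token cannot be
 * reused, and pops it by adding 8 to GCSPR_EL0.
 */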
*/ @@ -52,11 +47,6 @@ struct rt_sigframe { struct ucontext uc; }; -struct frame_record { - u64 fp; - u64 lr; -}; - struct rt_sigframe_user_layout { struct rt_sigframe __user *sigframe; struct frame_record __user *next_frame; @@ -66,15 +56,76 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; + unsigned long gcs_offset; unsigned long sve_offset; + unsigned long tpidr2_offset; + unsigned long za_offset; + unsigned long zt_offset; + unsigned long fpmr_offset; + unsigned long poe_offset; unsigned long extra_offset; unsigned long end_offset; }; -#define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16) +/* + * Holds any EL0-controlled state that influences unprivileged memory accesses. + * This includes both accesses done in userspace and uaccess done in the kernel. + * + * This state needs to be carefully managed to ensure that it doesn't cause + * uaccess to fail when setting up the signal frame, and the signal handler + * itself also expects a well-defined state when entered. + */ +struct user_access_state { + u64 por_el0; +}; + #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) +/* + * Save the user access state into ua_state and reset it to disable any + * restrictions. + */ +static void save_reset_user_access_state(struct user_access_state *ua_state) +{ + if (system_supports_poe()) { + u64 por_enable_all = 0; + + for (int pkey = 0; pkey < arch_max_pkey(); pkey++) + por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RWX); + + ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); + write_sysreg_s(por_enable_all, SYS_POR_EL0); + /* + * No ISB required as we can tolerate spurious Overlay faults - + * the fault handler will check again based on the new value + * of POR_EL0. + */ + } +} + +/* + * Set the user access state for invoking the signal handler. + * + * No uaccess should be done after that function is called. + */ +static void set_handler_user_access_state(void) +{ + if (system_supports_poe()) + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + +/* + * Restore the user access state to the values saved in ua_state. + * + * No uaccess should be done after that function is called. + */ +static void restore_user_access_state(const struct user_access_state *ua_state) +{ + if (system_supports_poe()) + write_sysreg_s(ua_state->por_el0, SYS_POR_EL0); +} + static void init_user_layout(struct rt_sigframe_user_layout *user) { const size_t reserved_size = @@ -101,7 +152,7 @@ static size_t sigframe_size(struct rt_sigframe_user_layout const *user) * not taken into account. This limit is not a guarantee and is * NOT ABI. 
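To make the size-limit comment above concrete, here is a minimal sketch of the record-allocation pattern used for the signal frame (demo_sigframe_alloc() is hypothetical and simplified; the real helpers are __sigframe_alloc()/sigframe_alloc() in this file). Each record is rounded up to 16 bytes and appended, and allocation fails once the running total would exceed the cap, which the hunk below raises to SZ_256K:

#include <linux/errno.h>
#include <linux/types.h>

/* Hypothetical, simplified record allocator for illustration only. */
static int demo_sigframe_alloc(size_t *frame_size, unsigned long *offset,
			       size_t record_size, size_t limit)
{
	size_t padded = (record_size + 15) & ~(size_t)15;	/* 16-byte align */

	if (*frame_size + padded > limit)
		return -ENOMEM;

	*offset = *frame_size;		/* record starts at the current end */
	*frame_size += padded;
	return 0;
}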
*/ -#define SIGFRAME_MAXSZ SZ_64K +#define SIGFRAME_MAXSZ SZ_256K static int __sigframe_alloc(struct rt_sigframe_user_layout *user, unsigned long *offset, size_t size, bool extend) @@ -178,12 +229,33 @@ static void __user *apply_user_offset( return base + offset; } +struct user_ctxs { + struct fpsimd_context __user *fpsimd; + u32 fpsimd_size; + struct sve_context __user *sve; + u32 sve_size; + struct tpidr2_context __user *tpidr2; + u32 tpidr2_size; + struct za_context __user *za; + u32 za_size; + struct zt_context __user *zt; + u32 zt_size; + struct fpmr_context __user *fpmr; + u32 fpmr_size; + struct poe_context __user *poe; + u32 poe_size; + struct gcs_context __user *gcs; + u32 gcs_size; +}; + static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) { struct user_fpsimd_state const *fpsimd = ¤t->thread.uw.fpsimd_state; int err; + fpsimd_sync_from_effective_state(current); + /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); __put_user_error(fpsimd->fpsr, &ctx->fpsr, err); @@ -196,40 +268,95 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) return err ? -EFAULT : 0; } -static int restore_fpsimd_context(struct fpsimd_context __user *ctx) +static int read_fpsimd_context(struct user_fpsimd_state *fpsimd, + struct user_ctxs *user) { - struct user_fpsimd_state fpsimd; - __u32 magic, size; - int err = 0; + int err; - /* check the magic/size information */ - __get_user_error(magic, &ctx->head.magic, err); - __get_user_error(size, &ctx->head.size, err); - if (err) - return -EFAULT; - if (magic != FPSIMD_MAGIC || size != sizeof(struct fpsimd_context)) + /* check the size information */ + if (user->fpsimd_size != sizeof(struct fpsimd_context)) return -EINVAL; /* copy the FP and status/control registers */ - err = __copy_from_user(fpsimd.vregs, ctx->vregs, - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &ctx->fpsr, err); - __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); + err = __copy_from_user(fpsimd->vregs, &(user->fpsimd->vregs), + sizeof(fpsimd->vregs)); + __get_user_error(fpsimd->fpsr, &(user->fpsimd->fpsr), err); + __get_user_error(fpsimd->fpcr, &(user->fpsimd->fpcr), err); + + return err ? -EFAULT : 0; +} + +static int restore_fpsimd_context(struct user_ctxs *user) +{ + struct user_fpsimd_state fpsimd; + int err; + + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; clear_thread_flag(TIF_SVE); + current->thread.svcr &= ~SVCR_SM_MASK; + current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ + fpsimd_update_current_state(&fpsimd); + return 0; +} + +static int preserve_fpmr_context(struct fpmr_context __user *ctx) +{ + int err = 0; + + __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); + + return err; +} + +static int restore_fpmr_context(struct user_ctxs *user) +{ + u64 fpmr; + int err = 0; + + if (user->fpmr_size != sizeof(*user->fpmr)) + return -EINVAL; + + __get_user_error(fpmr, &user->fpmr->fpmr, err); if (!err) - fpsimd_update_current_state(&fpsimd); + current->thread.uw.fpmr = fpmr; - return err ? 
-EFAULT : 0; + return err; } +static int preserve_poe_context(struct poe_context __user *ctx, + const struct user_access_state *ua_state) +{ + int err = 0; -struct user_ctxs { - struct fpsimd_context __user *fpsimd; - struct sve_context __user *sve; -}; + __put_user_error(POE_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(ua_state->por_el0, &ctx->por_el0, err); + + return err; +} + +static int restore_poe_context(struct user_ctxs *user, + struct user_access_state *ua_state) +{ + u64 por_el0; + int err = 0; + + if (user->poe_size != sizeof(*user->poe)) + return -EINVAL; + + __get_user_error(por_el0, &(user->poe->por_el0), err); + if (!err) + ua_state->por_el0 = por_el0; + + return err; +} #ifdef CONFIG_ARM64_SVE @@ -237,11 +364,17 @@ static int preserve_sve_context(struct sve_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; - unsigned int vl = current->thread.sve_vl; + u16 flags = 0; + unsigned int vl = task_get_sve_vl(current); unsigned int vq = 0; - if (test_thread_flag(TIF_SVE)) + if (thread_sm_enabled(¤t->thread)) { + vl = task_get_sme_vl(current); + vq = sve_vq_from_vl(vl); + flags |= SVE_SIG_FLAG_SM; + } else if (current->thread.fp_type == FP_STATE_SVE) { vq = sve_vq_from_vl(vl); + } memset(reserved, 0, sizeof(reserved)); @@ -249,14 +382,11 @@ static int preserve_sve_context(struct sve_context __user *ctx) __put_user_error(round_up(SVE_SIG_CONTEXT_SIZE(vq), 16), &ctx->head.size, err); __put_user_error(vl, &ctx->vl, err); + __put_user_error(flags, &ctx->flags, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the SVE state has already been saved to - * the task struct by calling preserve_fpsimd_context(). - */ err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, SVE_SIG_REGS_SIZE(vq)); @@ -267,43 +397,64 @@ static int preserve_sve_context(struct sve_context __user *ctx) static int restore_sve_fpsimd_context(struct user_ctxs *user) { - int err; - unsigned int vq; + int err = 0; + unsigned int vl, vq; struct user_fpsimd_state fpsimd; - struct sve_context sve; + u16 user_vl, flags; + bool sm; - if (__copy_from_user(&sve, user->sve, sizeof(sve))) - return -EFAULT; - - if (sve.vl != current->thread.sve_vl) + if (user->sve_size < sizeof(*user->sve)) return -EINVAL; - if (sve.head.size <= sizeof(*user->sve)) { - clear_thread_flag(TIF_SVE); - goto fpsimd_only; - } + __get_user_error(user_vl, &(user->sve->vl), err); + __get_user_error(flags, &(user->sve->flags), err); + if (err) + return err; - vq = sve_vq_from_vl(sve.vl); + sm = flags & SVE_SIG_FLAG_SM; + if (sm) { + if (!system_supports_sme()) + return -EINVAL; - if (sve.head.size < SVE_SIG_CONTEXT_SIZE(vq)) + vl = task_get_sme_vl(current); + } else { + /* + * A SME only system use SVE for streaming mode so can + * have a SVE formatted context with a zero VL and no + * payload data. + */ + if (!system_supports_sve() && !system_supports_sme()) + return -EINVAL; + + vl = task_get_sve_vl(current); + } + + if (user_vl != vl) return -EINVAL; /* - * Careful: we are about __copy_from_user() directly into - * thread.sve_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. 
+ * Non-streaming SVE state may be preserved without an SVE payload, in + * which case the SVE context only has a header with VL==0, and all + * state can be restored from the FPSIMD context. + * + * Streaming SVE state is always preserved with an SVE payload. For + * consistency and robustness, reject restoring streaming SVE state + * without an SVE payload. */ + if (!sm && user->sve_size == sizeof(*user->sve)) + return restore_fpsimd_context(user); - fpsimd_flush_task_state(current); - barrier(); - /* From now, fpsimd_thread_switch() won't clear TIF_FOREIGN_FPSTATE */ + vq = sve_vq_from_vl(vl); - set_thread_flag(TIF_FOREIGN_FPSTATE); - barrier(); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ + if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq)) + return -EINVAL; + + sve_alloc(current, true); + if (!current->thread.sve_state) { + clear_thread_flag(TIF_SVE); + return -ENOMEM; + } - sve_alloc(current); err = __copy_from_user(current->thread.sve_state, (char __user const *)user->sve + SVE_SIG_REGS_OFFSET, @@ -311,31 +462,279 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (err) return -EFAULT; - set_thread_flag(TIF_SVE); + if (flags & SVE_SIG_FLAG_SM) + current->thread.svcr |= SVCR_SM_MASK; + else + set_thread_flag(TIF_SVE); + current->thread.fp_type = FP_STATE_SVE; -fpsimd_only: - /* copy the FP and status/control registers */ - /* restore_sigframe() already checked that user->fpsimd != NULL. */ - err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs, - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err); - __get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err); + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; - /* load the hardware registers from the fpsimd_state structure */ - if (!err) - fpsimd_update_current_state(&fpsimd); + /* Merge the FPSIMD registers into the SVE state */ + fpsimd_update_current_state(&fpsimd); - return err ? -EFAULT : 0; + return 0; } #else /* ! CONFIG_ARM64_SVE */ -/* Turn any non-optimised out attempts to use these into a link error: */ +static int restore_sve_fpsimd_context(struct user_ctxs *user) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} + +/* Turn any non-optimised out attempts to use this into a link error: */ extern int preserve_sve_context(void __user *ctx); -extern int restore_sve_fpsimd_context(struct user_ctxs *user); #endif /* ! 
CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int preserve_tpidr2_context(struct tpidr2_context __user *ctx) +{ + u64 tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + int err = 0; + + __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(tpidr2_el0, &ctx->tpidr2, err); + + return err; +} + +static int restore_tpidr2_context(struct user_ctxs *user) +{ + u64 tpidr2_el0; + int err = 0; + + if (user->tpidr2_size != sizeof(*user->tpidr2)) + return -EINVAL; + + __get_user_error(tpidr2_el0, &user->tpidr2->tpidr2, err); + if (!err) + write_sysreg_s(tpidr2_el0, SYS_TPIDR2_EL0); + + return err; +} + +static int preserve_za_context(struct za_context __user *ctx) +{ + int err = 0; + u16 reserved[ARRAY_SIZE(ctx->__reserved)]; + unsigned int vl = task_get_sme_vl(current); + unsigned int vq; + + if (thread_za_enabled(¤t->thread)) + vq = sve_vq_from_vl(vl); + else + vq = 0; + + memset(reserved, 0, sizeof(reserved)); + + __put_user_error(ZA_MAGIC, &ctx->head.magic, err); + __put_user_error(round_up(ZA_SIG_CONTEXT_SIZE(vq), 16), + &ctx->head.size, err); + __put_user_error(vl, &ctx->vl, err); + BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); + err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); + + if (vq) { + err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, + current->thread.sme_state, + ZA_SIG_REGS_SIZE(vq)); + } + + return err ? -EFAULT : 0; +} + +static int restore_za_context(struct user_ctxs *user) +{ + int err = 0; + unsigned int vq; + u16 user_vl; + + if (user->za_size < sizeof(*user->za)) + return -EINVAL; + + __get_user_error(user_vl, &(user->za->vl), err); + if (err) + return err; + + if (user_vl != task_get_sme_vl(current)) + return -EINVAL; + + if (user->za_size == sizeof(*user->za)) { + current->thread.svcr &= ~SVCR_ZA_MASK; + return 0; + } + + vq = sve_vq_from_vl(user_vl); + + if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq)) + return -EINVAL; + + sme_alloc(current, true); + if (!current->thread.sme_state) { + current->thread.svcr &= ~SVCR_ZA_MASK; + clear_thread_flag(TIF_SME); + return -ENOMEM; + } + + err = __copy_from_user(current->thread.sme_state, + (char __user const *)user->za + + ZA_SIG_REGS_OFFSET, + ZA_SIG_REGS_SIZE(vq)); + if (err) + return -EFAULT; + + set_thread_flag(TIF_SME); + current->thread.svcr |= SVCR_ZA_MASK; + + return 0; +} + +static int preserve_zt_context(struct zt_context __user *ctx) +{ + int err = 0; + u16 reserved[ARRAY_SIZE(ctx->__reserved)]; + + if (WARN_ON(!thread_za_enabled(¤t->thread))) + return -EINVAL; + + memset(reserved, 0, sizeof(reserved)); + + __put_user_error(ZT_MAGIC, &ctx->head.magic, err); + __put_user_error(round_up(ZT_SIG_CONTEXT_SIZE(1), 16), + &ctx->head.size, err); + __put_user_error(1, &ctx->nregs, err); + BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); + err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); + + err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET, + thread_zt_state(¤t->thread), + ZT_SIG_REGS_SIZE(1)); + + return err ? 
-EFAULT : 0; +} + +static int restore_zt_context(struct user_ctxs *user) +{ + int err; + u16 nregs; + + /* ZA must be restored first for this check to be valid */ + if (!thread_za_enabled(¤t->thread)) + return -EINVAL; + + if (user->zt_size != ZT_SIG_CONTEXT_SIZE(1)) + return -EINVAL; + + if (__copy_from_user(&nregs, &(user->zt->nregs), sizeof(nregs))) + return -EFAULT; + + if (nregs != 1) + return -EINVAL; + + err = __copy_from_user(thread_zt_state(¤t->thread), + (char __user const *)user->zt + + ZT_SIG_REGS_OFFSET, + ZT_SIG_REGS_SIZE(1)); + if (err) + return -EFAULT; + + return 0; +} + +#else /* ! CONFIG_ARM64_SME */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_tpidr2_context(void __user *ctx); +extern int restore_tpidr2_context(struct user_ctxs *user); +extern int preserve_za_context(void __user *ctx); +extern int restore_za_context(struct user_ctxs *user); +extern int preserve_zt_context(void __user *ctx); +extern int restore_zt_context(struct user_ctxs *user); + +#endif /* ! CONFIG_ARM64_SME */ + +#ifdef CONFIG_ARM64_GCS + +static int preserve_gcs_context(struct gcs_context __user *ctx) +{ + int err = 0; + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * If GCS is enabled we will add a cap token to the frame, + * include it in the GCSPR_EL0 we report to support stack + * switching via sigreturn if GCS is enabled. We do not allow + * enabling via sigreturn so the token is only relevant for + * threads with GCS enabled. + */ + if (task_gcs_el0_enabled(current)) + gcspr -= 8; + + __put_user_error(GCS_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(gcspr, &ctx->gcspr, err); + __put_user_error(0, &ctx->reserved, err); + __put_user_error(current->thread.gcs_el0_mode, + &ctx->features_enabled, err); + + return err; +} + +static int restore_gcs_context(struct user_ctxs *user) +{ + u64 gcspr, enabled; + int err = 0; + + if (user->gcs_size != sizeof(*user->gcs)) + return -EINVAL; + + __get_user_error(gcspr, &user->gcs->gcspr, err); + __get_user_error(enabled, &user->gcs->features_enabled, err); + if (err) + return err; + + /* Don't allow unknown modes */ + if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + err = gcs_check_locked(current, enabled); + if (err != 0) + return err; + + /* Don't allow enabling */ + if (!task_gcs_el0_enabled(current) && + (enabled & PR_SHADOW_STACK_ENABLE)) + return -EINVAL; + + /* If we are disabling disable everything */ + if (!(enabled & PR_SHADOW_STACK_ENABLE)) + enabled = 0; + + current->thread.gcs_el0_mode = enabled; + + /* + * We let userspace set GCSPR_EL0 to anything here, we will + * validate later in gcs_restore_signal(). + */ + write_sysreg_s(gcspr, SYS_GCSPR_EL0); + + return 0; +} + +#else /* ! CONFIG_ARM64_GCS */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_gcs_context(void __user *ctx); +extern int restore_gcs_context(struct user_ctxs *user); + +#endif /* ! 
CONFIG_ARM64_GCS */ static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) @@ -350,6 +749,12 @@ static int parse_user_sigframe(struct user_ctxs *user, user->fpsimd = NULL; user->sve = NULL; + user->tpidr2 = NULL; + user->za = NULL; + user->zt = NULL; + user->fpmr = NULL; + user->poe = NULL; + user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -387,30 +792,94 @@ static int parse_user_sigframe(struct user_ctxs *user, goto done; case FPSIMD_MAGIC: - if (user->fpsimd) + if (!system_supports_fpsimd()) goto invalid; - - if (size < sizeof(*user->fpsimd)) + if (user->fpsimd) goto invalid; user->fpsimd = (struct fpsimd_context __user *)head; + user->fpsimd_size = size; break; case ESR_MAGIC: /* ignore */ break; + case POE_MAGIC: + if (!system_supports_poe()) + goto invalid; + + if (user->poe) + goto invalid; + + user->poe = (struct poe_context __user *)head; + user->poe_size = size; + break; + case SVE_MAGIC: - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) goto invalid; if (user->sve) goto invalid; - if (size < sizeof(*user->sve)) + user->sve = (struct sve_context __user *)head; + user->sve_size = size; + break; + + case TPIDR2_MAGIC: + if (!system_supports_tpidr2()) goto invalid; - user->sve = (struct sve_context __user *)head; + if (user->tpidr2) + goto invalid; + + user->tpidr2 = (struct tpidr2_context __user *)head; + user->tpidr2_size = size; + break; + + case ZA_MAGIC: + if (!system_supports_sme()) + goto invalid; + + if (user->za) + goto invalid; + + user->za = (struct za_context __user *)head; + user->za_size = size; + break; + + case ZT_MAGIC: + if (!system_supports_sme2()) + goto invalid; + + if (user->zt) + goto invalid; + + user->zt = (struct zt_context __user *)head; + user->zt_size = size; + break; + + case FPMR_MAGIC: + if (!system_supports_fpmr()) + goto invalid; + + if (user->fpmr) + goto invalid; + + user->fpmr = (struct fpmr_context __user *)head; + user->fpmr_size = size; + break; + + case GCS_MAGIC: + if (!system_supports_gcs()) + goto invalid; + + if (user->gcs) + goto invalid; + + user->gcs = (struct gcs_context __user *)head; + user->gcs_size = size; break; case EXTRA_MAGIC: @@ -496,7 +965,8 @@ invalid: } static int restore_sigframe(struct pt_regs *regs, - struct rt_sigframe __user *sf) + struct rt_sigframe __user *sf, + struct user_access_state *ua_state) { sigset_t set; int i, err; @@ -518,31 +988,100 @@ static int restore_sigframe(struct pt_regs *regs, */ forget_syscall(regs); + fpsimd_save_and_flush_current_state(); + err |= !valid_user_regs(®s->user_regs, current); if (err == 0) err = parse_user_sigframe(&user, sf); - if (err == 0) { + if (err == 0 && system_supports_fpsimd()) { if (!user.fpsimd) return -EINVAL; - if (user.sve) { - if (!system_supports_sve()) - return -EINVAL; - + if (user.sve) err = restore_sve_fpsimd_context(&user); - } else { - err = restore_fpsimd_context(user.fpsimd); - } + else + err = restore_fpsimd_context(&user); } + if (err == 0 && system_supports_gcs() && user.gcs) + err = restore_gcs_context(&user); + + if (err == 0 && system_supports_tpidr2() && user.tpidr2) + err = restore_tpidr2_context(&user); + + if (err == 0 && system_supports_fpmr() && user.fpmr) + err = restore_fpmr_context(&user); + + if (err == 0 && system_supports_sme() && user.za) + err = restore_za_context(&user); + + if (err == 0 && system_supports_sme2() && user.zt) + err = restore_zt_context(&user); + + if (err == 0 && system_supports_poe() && user.poe) + err = 
restore_poe_context(&user, ua_state); + return err; } +#ifdef CONFIG_ARM64_GCS +static int gcs_restore_signal(void) +{ + u64 gcspr_el0, cap; + int ret; + + if (!system_supports_gcs()) + return 0; + + if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) + return 0; + + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Ensure that any changes to the GCS done via GCS operations + * are visible to the normal reads we do to validate the + * token. + */ + gcsb_dsync(); + + /* + * GCSPR_EL0 should be pointing at a capped GCS, read the cap. + * We don't enforce that this is in a GCS page, if it is not + * then faults will be generated on GCS operations - the main + * concern is to protect GCS pages. + */ + ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0, + sizeof(cap)); + if (ret) + return -EFAULT; + + /* + * Check that the cap is the actual GCS before replacing it. + */ + if (cap != GCS_SIGNAL_CAP(gcspr_el0)) + return -EINVAL; + + /* Invalidate the token to prevent reuse */ + put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret); + if (ret != 0) + return -EFAULT; + + write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0); + + return 0; +} + +#else +static int gcs_restore_signal(void) { return 0; } +#endif + SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; + struct user_access_state ua_state; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; @@ -559,12 +1098,17 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(frame, sizeof (*frame))) goto badframe; - if (restore_sigframe(regs, frame)) + if (restore_sigframe(regs, frame, &ua_state)) + goto badframe; + + if (gcs_restore_signal()) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; + restore_user_access_state(&ua_state); + return regs->regs[0]; badframe: @@ -584,10 +1128,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, { int err; - err = sigframe_alloc(user, &user->fpsimd_offset, - sizeof(struct fpsimd_context)); - if (err) - return err; + if (system_supports_fpsimd()) { + err = sigframe_alloc(user, &user->fpsimd_offset, + sizeof(struct fpsimd_context)); + if (err) + return err; + } /* fault information, if valid */ if (add_all || current->thread.fault_code) { @@ -597,14 +1143,24 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } - if (system_supports_sve()) { +#ifdef CONFIG_ARM64_GCS + if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) { + err = sigframe_alloc(user, &user->gcs_offset, + sizeof(struct gcs_context)); + if (err) + return err; + } +#endif + + if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; - if (add_all || test_thread_flag(TIF_SVE)) { - int vl = sve_max_vl; + if (add_all || current->thread.fp_type == FP_STATE_SVE || + thread_sm_enabled(¤t->thread)) { + int vl = max(sve_max_vl(), sme_max_vl()); if (!add_all) - vl = current->thread.sve_vl; + vl = thread_get_cur_vl(¤t->thread); vq = sve_vq_from_vl(vl); } @@ -615,11 +1171,60 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } + if (system_supports_tpidr2()) { + err = sigframe_alloc(user, &user->tpidr2_offset, + sizeof(struct tpidr2_context)); + if (err) + return err; + } + + if (system_supports_sme()) { + unsigned int vl; + unsigned int vq = 0; + + if (add_all) + vl = sme_max_vl(); + else + vl = task_get_sme_vl(current); + + if (thread_za_enabled(¤t->thread)) + vq = sve_vq_from_vl(vl); 
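For the ZA record sized just below, a worked example (illustrative; per the SME UAPI, ZA is an SVL-by-SVL byte array):

/*
 * Example: with a streaming vector length of 256 bits, vl = 32 bytes
 * and vq = 2, so the ZA payload is (vq * 16) * (vq * 16) = 1024 bytes,
 * and ZA_SIG_CONTEXT_SIZE(vq) adds the context header on top. With ZA
 * disabled, vq stays 0 and only the header is written.
 */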
+ + err = sigframe_alloc(user, &user->za_offset, + ZA_SIG_CONTEXT_SIZE(vq)); + if (err) + return err; + } + + if (system_supports_sme2()) { + if (add_all || thread_za_enabled(¤t->thread)) { + err = sigframe_alloc(user, &user->zt_offset, + ZT_SIG_CONTEXT_SIZE(1)); + if (err) + return err; + } + } + + if (system_supports_fpmr()) { + err = sigframe_alloc(user, &user->fpmr_offset, + sizeof(struct fpmr_context)); + if (err) + return err; + } + + if (system_supports_poe()) { + err = sigframe_alloc(user, &user->poe_offset, + sizeof(struct poe_context)); + if (err) + return err; + } + return sigframe_alloc_end(user); } static int setup_sigframe(struct rt_sigframe_user_layout *user, - struct pt_regs *regs, sigset_t *set) + struct pt_regs *regs, sigset_t *set, + const struct user_access_state *ua_state) { int i, err = 0; struct rt_sigframe __user *sf = user->sigframe; @@ -639,7 +1244,7 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); - if (err == 0) { + if (err == 0 && system_supports_fpsimd()) { struct fpsimd_context __user *fpsimd_ctx = apply_user_offset(user, user->fpsimd_offset); err |= preserve_fpsimd_context(fpsimd_ctx); @@ -655,13 +1260,55 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } - /* Scalable Vector Extension state, if present */ - if (system_supports_sve() && err == 0 && user->sve_offset) { + if (system_supports_gcs() && err == 0 && user->gcs_offset) { + struct gcs_context __user *gcs_ctx = + apply_user_offset(user, user->gcs_offset); + err |= preserve_gcs_context(gcs_ctx); + } + + /* Scalable Vector Extension state (including streaming), if present */ + if ((system_supports_sve() || system_supports_sme()) && + err == 0 && user->sve_offset) { struct sve_context __user *sve_ctx = apply_user_offset(user, user->sve_offset); err |= preserve_sve_context(sve_ctx); } + /* TPIDR2 if supported */ + if (system_supports_tpidr2() && err == 0) { + struct tpidr2_context __user *tpidr2_ctx = + apply_user_offset(user, user->tpidr2_offset); + err |= preserve_tpidr2_context(tpidr2_ctx); + } + + /* FPMR if supported */ + if (system_supports_fpmr() && err == 0) { + struct fpmr_context __user *fpmr_ctx = + apply_user_offset(user, user->fpmr_offset); + err |= preserve_fpmr_context(fpmr_ctx); + } + + if (system_supports_poe() && err == 0) { + struct poe_context __user *poe_ctx = + apply_user_offset(user, user->poe_offset); + + err |= preserve_poe_context(poe_ctx, ua_state); + } + + /* ZA state if present */ + if (system_supports_sme() && err == 0 && user->za_offset) { + struct za_context __user *za_ctx = + apply_user_offset(user, user->za_offset); + err |= preserve_za_context(za_ctx); + } + + /* ZT state if present */ + if (system_supports_sme2() && err == 0 && user->zt_offset) { + struct zt_context __user *zt_ctx = + apply_user_offset(user, user->zt_offset); + err |= preserve_zt_context(zt_ctx); + } + if (err == 0 && user->extra_offset) { char __user *sfp = (char __user *)user->sigframe; char __user *userp = @@ -736,22 +1383,109 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +#ifdef CONFIG_ARM64_GCS + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + u64 gcspr_el0; + int ret = 0; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(current)) + return 0; + + /* + * We are entering a signal 
handler, current register state is + * active. + */ + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Push a cap and the GCS entry for the trampoline onto the GCS. + */ + put_user_gcs((unsigned long)sigtramp, + (unsigned long __user *)(gcspr_el0 - 16), &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 8), + (unsigned long __user *)(gcspr_el0 - 8), &ret); + if (ret != 0) + return ret; + + gcspr_el0 -= 16; + write_sysreg_s(gcspr_el0, SYS_GCSPR_EL0); + + return 0; +} +#else + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + return 0; +} + +#endif + +static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; + int err; + + if (ksig->ka.sa.sa_flags & SA_RESTORER) + sigtramp = ksig->ka.sa.sa_restorer; + else + sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); + + err = gcs_signal_entry(sigtramp, ksig); + if (err) + return err; + + /* + * We must not fail from this point onwards. We are going to update + * registers, including SP, in order to invoke the signal handler. If + * we failed and attempted to deliver a nested SIGSEGV to a handler + * after that point, the subsequent sigreturn would end up restoring + * the (partial) state for the original signal handler. + */ regs->regs[0] = usig; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + regs->regs[1] = (unsigned long)&user->sigframe->info; + regs->regs[2] = (unsigned long)&user->sigframe->uc; + } regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; - regs->pc = (unsigned long)ka->sa.sa_handler; + regs->regs[30] = (unsigned long)sigtramp; + regs->pc = (unsigned long)ksig->ka.sa.sa_handler; - if (ka->sa.sa_flags & SA_RESTORER) - sigtramp = ka->sa.sa_restorer; - else - sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); + /* + * Signal delivery is a (wacky) indirect function call in + * userspace, so simulate the same setting of BTYPE as a BLR + * <register containing the signal handler entry point>. + * Signal delivery to a location in a PROT_BTI guarded page + * that is not a function entry point will now trigger a + * SIGILL in userspace. + * + * If the signal handler entry point is not in a PROT_BTI + * guarded page, this is harmless. 
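+	 *
+	 * For reference only: a handler built with -mbranch-protection=bti
+	 * begins with a landing pad that accepts the BTYPE set below,
+	 * roughly:
+	 *
+	 *	my_handler:			// hypothetical handler
+	 *		bti	c		// valid target for a BLR-style entry
+	 *		...			// handler body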
+ */ + if (system_supports_bti()) { + regs->pstate &= ~PSR_BTYPE_MASK; + regs->pstate |= PSR_BTYPE_C; + } - regs->regs[30] = (unsigned long)sigtramp; + /* TCO (Tag Check Override) always cleared for signal handlers */ + regs->pstate &= ~PSR_TCO_BIT; + + /* Signal handlers are invoked with ZA and streaming mode disabled */ + if (system_supports_sme()) { + task_smstop_sm(current); + current->thread.svcr &= ~SVCR_ZA_MASK; + write_sysreg_s(0, SYS_TPIDR2_EL0); + } + + return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -759,28 +1493,37 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, { struct rt_sigframe_user_layout user; struct rt_sigframe __user *frame; + struct user_access_state ua_state; int err = 0; - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); if (get_sigframe(&user, ksig, regs)) return 1; + save_reset_user_access_state(&ua_state); frame = user.sigframe; __put_user_error(0, &frame->uc.uc_flags, err); __put_user_error(NULL, &frame->uc.uc_link, err); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); - err |= setup_sigframe(&user, regs, set); - if (err == 0) { - setup_return(regs, &ksig->ka, &user, usig); - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - regs->regs[1] = (unsigned long)&frame->info; - regs->regs[2] = (unsigned long)&frame->uc; - } - } + err |= setup_sigframe(&user, regs, set, &ua_state); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + err |= copy_siginfo_to_user(&frame->info, &ksig->info); + + if (err == 0) + err = setup_return(regs, ksig, &user, usig); + + /* + * We must not fail if setup_return() succeeded - see comment at the + * beginning of setup_return(). + */ + + if (err == 0) + set_handler_user_access_state(); + else + restore_user_access_state(&ua_state); return err; } @@ -798,7 +1541,6 @@ static void setup_restart_syscall(struct pt_regs *regs) */ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { - struct task_struct *tsk = current; sigset_t *oldset = sigmask_to_save(); int usig = ksig->sig; int ret; @@ -822,14 +1564,8 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) */ ret |= !valid_user_regs(®s->user_regs, current); - /* - * Fast forward the stepping logic so we step into the signal - * handler. - */ - if (!ret) - user_fastforward_single_step(tsk); - - signal_setup_done(ret, ksig, 0); + /* Step into the signal handler if we are stepping */ + signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP)); } /* @@ -841,7 +1577,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -static void do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; @@ -891,7 +1627,7 @@ static void do_signal(struct pt_regs *regs) retval == -ERESTART_RESTARTBLOCK || (retval == -ERESTARTSYS && !(ksig.ka.sa.sa_flags & SA_RESTART)))) { - regs->regs[0] = -EINTR; + syscall_set_return_value(current, regs, -EINTR, 0); regs->pc = continue_addr; } @@ -912,49 +1648,6 @@ static void do_signal(struct pt_regs *regs) restore_saved_sigmask(); } -asmlinkage void do_notify_resume(struct pt_regs *regs, - unsigned long thread_flags) -{ - /* - * The assembly code enters us with IRQs off, but it hasn't - * informed the tracing code of that for efficiency reasons. 
- * Update the trace code with the current status. - */ - trace_hardirqs_off(); - - do { - /* Check valid user FS if needed */ - addr_limit_user_check(); - - if (thread_flags & _TIF_NEED_RESCHED) { - /* Unmask Debug and SError for the next task */ - local_daif_restore(DAIF_PROCCTX_NOIRQ); - - schedule(); - } else { - local_daif_restore(DAIF_PROCCTX); - - if (thread_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (thread_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (thread_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } - - if (thread_flags & _TIF_FOREIGN_FPSTATE) - fpsimd_restore_current_state(); - } - - local_daif_mask(); - thread_flags = READ_ONCE(current_thread_info()->flags); - } while (thread_flags & _TIF_WORK_MASK); -} - unsigned long __ro_after_init signal_minsigstksz; /* @@ -979,3 +1672,43 @@ void __init minsigstksz_setup(void) round_up(sizeof(struct frame_record), 16) + 16; /* max alignment padding */ } + +/* + * Compile-time assertions for siginfo_t offsets. Check NSIG* as well, as + * changes likely come with new fields that should be added below. + */ +static_assert(NSIGILL == 11); +static_assert(NSIGFPE == 15); +static_assert(NSIGSEGV == 10); +static_assert(NSIGBUS == 5); +static_assert(NSIGTRAP == 6); +static_assert(NSIGCHLD == 6); +static_assert(NSIGSYS == 2); +static_assert(sizeof(siginfo_t) == 128); +static_assert(__alignof__(siginfo_t) == 8); +static_assert(offsetof(siginfo_t, si_signo) == 0x00); +static_assert(offsetof(siginfo_t, si_errno) == 0x04); +static_assert(offsetof(siginfo_t, si_code) == 0x08); +static_assert(offsetof(siginfo_t, si_pid) == 0x10); +static_assert(offsetof(siginfo_t, si_uid) == 0x14); +static_assert(offsetof(siginfo_t, si_tid) == 0x10); +static_assert(offsetof(siginfo_t, si_overrun) == 0x14); +static_assert(offsetof(siginfo_t, si_status) == 0x18); +static_assert(offsetof(siginfo_t, si_utime) == 0x20); +static_assert(offsetof(siginfo_t, si_stime) == 0x28); +static_assert(offsetof(siginfo_t, si_value) == 0x18); +static_assert(offsetof(siginfo_t, si_int) == 0x18); +static_assert(offsetof(siginfo_t, si_ptr) == 0x18); +static_assert(offsetof(siginfo_t, si_addr) == 0x10); +static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18); +static_assert(offsetof(siginfo_t, si_lower) == 0x20); +static_assert(offsetof(siginfo_t, si_upper) == 0x28); +static_assert(offsetof(siginfo_t, si_pkey) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); +static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); +static_assert(offsetof(siginfo_t, si_band) == 0x10); +static_assert(offsetof(siginfo_t, si_fd) == 0x18); +static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); +static_assert(offsetof(siginfo_t, si_syscall) == 0x18); +static_assert(offsetof(siginfo_t, si_arch) == 0x1c); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index cb7800acd19f..bb3b526ff43f 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -1,21 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/signal.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. * Modified by Will Deacon <will.deacon@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/compat.h> @@ -28,43 +17,8 @@ #include <asm/signal32.h> #include <asm/traps.h> #include <linux/uaccess.h> -#include <asm/unistd.h> - -struct compat_sigcontext { - /* We always set these two fields to 0 */ - compat_ulong_t trap_no; - compat_ulong_t error_code; - - compat_ulong_t oldmask; - compat_ulong_t arm_r0; - compat_ulong_t arm_r1; - compat_ulong_t arm_r2; - compat_ulong_t arm_r3; - compat_ulong_t arm_r4; - compat_ulong_t arm_r5; - compat_ulong_t arm_r6; - compat_ulong_t arm_r7; - compat_ulong_t arm_r8; - compat_ulong_t arm_r9; - compat_ulong_t arm_r10; - compat_ulong_t arm_fp; - compat_ulong_t arm_ip; - compat_ulong_t arm_sp; - compat_ulong_t arm_lr; - compat_ulong_t arm_pc; - compat_ulong_t arm_cpsr; - compat_ulong_t fault_address; -}; - -struct compat_ucontext { - compat_ulong_t uc_flags; - compat_uptr_t uc_link; - compat_stack_t uc_stack; - struct compat_sigcontext uc_mcontext; - compat_sigset_t uc_sigmask; - int __unused[32 - (sizeof (compat_sigset_t) / sizeof (int))]; - compat_ulong_t uc_regspace[128] __attribute__((__aligned__(8))); -}; +#include <asm/unistd_compat_32.h> +#include <asm/vdso.h> struct compat_vfp_sigframe { compat_ulong_t magic; @@ -92,18 +46,6 @@ struct compat_aux_sigframe { unsigned long end_magic; } __attribute__((__aligned__(8))); -struct compat_sigframe { - struct compat_ucontext uc; - compat_ulong_t retcode[2]; -}; - -struct compat_rt_sigframe { - struct compat_siginfo info; - struct compat_sigframe sig; -}; - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set) { compat_sigset_t cset; @@ -161,7 +103,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) * Note that this also saves V16-31, which aren't visible * in AArch32. */ - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); /* Place structure header on the stack */ __put_user_error(magic, &frame->magic, err); @@ -227,14 +169,17 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) fpsimd.fpsr = fpscr & VFP_FPSCR_STAT_MASK; fpsimd.fpcr = fpscr & VFP_FPSCR_CTRL_MASK; + if (err) + return -EFAULT; + /* * We don't need to touch the exception register, so * reload the hardware state. */ - if (!err) - fpsimd_update_current_state(&fpsimd); + fpsimd_save_and_flush_current_state(); + current->thread.uw.fpsimd_state = fpsimd; - return err ? 
-EFAULT : 0; + return 0; } static int compat_restore_sigframe(struct pt_regs *regs, @@ -246,10 +191,8 @@ static int compat_restore_sigframe(struct pt_regs *regs, unsigned long psr; err = get_sigset_t(&set, &sf->uc.uc_sigmask); - if (err == 0) { - sigdelsetmask(&set, ~_BLOCKABLE); + if (err == 0) set_current_blocked(&set); - } __get_user_error(regs->regs[0], &sf->uc.uc_mcontext.arm_r0, err); __get_user_error(regs->regs[1], &sf->uc.uc_mcontext.arm_r1, err); @@ -279,7 +222,7 @@ static int compat_restore_sigframe(struct pt_regs *regs, err |= !valid_user_regs(®s->user_regs, current); aux = (struct compat_aux_sigframe __user *) sf->uc.uc_regspace; - if (err == 0) + if (err == 0 && system_supports_fpsimd()) err |= compat_restore_vfp_context(&aux->vfp); return err; @@ -403,8 +346,7 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_SIGINFO) idx += 3; - retcode = AARCH32_VECTORS_BASE + - AARCH32_KERN_SIGRET_CODE_OFFSET + + retcode = (unsigned long)current->mm->context.sigpage + (idx << 2) + thumb; } @@ -451,7 +393,7 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf, aux = (struct compat_aux_sigframe __user *) sf->uc.uc_regspace; - if (err == 0) + if (err == 0 && system_supports_fpsimd()) err |= compat_preserve_vfp_context(&aux->vfp); __put_user_error(0, &aux->end_magic, err); @@ -512,5 +454,45 @@ int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, void compat_setup_restart_syscall(struct pt_regs *regs) { - regs->regs[7] = __NR_compat_restart_syscall; + regs->regs[7] = __NR_compat32_restart_syscall; } + +/* + * Compile-time assertions for siginfo_t offsets. Check NSIG* as well, as + * changes likely come with new fields that should be added below. + */ +static_assert(NSIGILL == 11); +static_assert(NSIGFPE == 15); +static_assert(NSIGSEGV == 10); +static_assert(NSIGBUS == 5); +static_assert(NSIGTRAP == 6); +static_assert(NSIGCHLD == 6); +static_assert(NSIGSYS == 2); +static_assert(sizeof(compat_siginfo_t) == 128); +static_assert(__alignof__(compat_siginfo_t) == 4); +static_assert(offsetof(compat_siginfo_t, si_signo) == 0x00); +static_assert(offsetof(compat_siginfo_t, si_errno) == 0x04); +static_assert(offsetof(compat_siginfo_t, si_code) == 0x08); +static_assert(offsetof(compat_siginfo_t, si_pid) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_uid) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_tid) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_overrun) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_status) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_utime) == 0x18); +static_assert(offsetof(compat_siginfo_t, si_stime) == 0x1c); +static_assert(offsetof(compat_siginfo_t, si_value) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_int) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_ptr) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_addr) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_addr_lsb) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_lower) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_upper) == 0x18); +static_assert(offsetof(compat_siginfo_t, si_pkey) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_data) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_flags) == 0x18); +static_assert(offsetof(compat_siginfo_t, si_band) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_fd) == 0x10); +static_assert(offsetof(compat_siginfo_t, 
si_call_addr) == 0x0c); +static_assert(offsetof(compat_siginfo_t, si_syscall) == 0x10); +static_assert(offsetof(compat_siginfo_t, si_arch) == 0x14); diff --git a/arch/arm64/kernel/sigreturn32.S b/arch/arm64/kernel/sigreturn32.S new file mode 100644 index 000000000000..6f486b95b413 --- /dev/null +++ b/arch/arm64/kernel/sigreturn32.S @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AArch32 sigreturn code. + * Based on the kuser helpers in arch/arm/kernel/entry-armv.S. + * + * Copyright (C) 2005-2011 Nicolas Pitre <nico@fluxnic.net> + * Copyright (C) 2012-2018 ARM Ltd. + * + * For ARM syscalls, the syscall number has to be loaded into r7. + * We do not support an OABI userspace. + * + * For Thumb syscalls, we also pass the syscall number via r7. We therefore + * need two 16-bit instructions. + */ + +#include <asm/unistd_compat_32.h> + + .section .rodata + .globl __aarch32_sigret_code_start +__aarch32_sigret_code_start: + + /* + * ARM Code + */ + .byte __NR_compat32_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat32_sigreturn + .byte __NR_compat32_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat32_sigreturn + + /* + * Thumb code + */ + .byte __NR_compat32_sigreturn, 0x27 // svc #__NR_compat32_sigreturn + .byte __NR_compat32_sigreturn, 0xdf // mov r7, #__NR_compat32_sigreturn + + /* + * ARM code + */ + .byte __NR_compat32_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat32_rt_sigreturn + .byte __NR_compat32_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat32_rt_sigreturn + + /* + * Thumb code + */ + .byte __NR_compat32_rt_sigreturn, 0x27 // svc #__NR_compat32_rt_sigreturn + .byte __NR_compat32_rt_sigreturn, 0xdf // mov r7, #__NR_compat32_rt_sigreturn + + .globl __aarch32_sigret_code_end +__aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 3e53ffa07994..f093cdf71be1 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -3,6 +3,7 @@ #include <linux/linkage.h> #include <asm/asm-offsets.h> #include <asm/assembler.h> +#include <asm/smp.h> .text /* @@ -27,7 +28,7 @@ * aff0 = mpidr_masked & 0xff; * aff1 = mpidr_masked & 0xff00; * aff2 = mpidr_masked & 0xff0000; - * aff2 = mpidr_masked & 0xff00000000; + * aff3 = mpidr_masked & 0xff00000000; * dst = (aff0 >> rs0 | aff1 >> rs1 | aff2 >> rs2 | aff3 >> rs3); *} * Input registers: rs0, rs1, rs2, rs3, mpidr, mask @@ -61,7 +62,7 @@ * * x0 = struct sleep_stack_data area */ -ENTRY(__cpu_suspend_enter) +SYM_FUNC_START(__cpu_suspend_enter) stp x29, lr, [x0, #SLEEP_STACK_DATA_CALLEE_REGS] stp x19, x20, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+16] stp x21, x22, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+32] @@ -94,22 +95,28 @@ ENTRY(__cpu_suspend_enter) ldp x29, lr, [sp], #16 mov x0, #1 ret -ENDPROC(__cpu_suspend_enter) +SYM_FUNC_END(__cpu_suspend_enter) - .pushsection ".idmap.text", "awx" -ENTRY(cpu_resume) - bl el2_setup // if in EL2 drop to EL1 cleanly + .pushsection ".idmap.text", "a" +SYM_CODE_START(cpu_resume) + mov x0, xzr + bl init_kernel_el + mov x19, x0 // preserve boot mode bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir + adrp x2, idmap_pg_dir bl __enable_mmu ldr x8, =_cpu_resume br x8 -ENDPROC(cpu_resume) +SYM_CODE_END(cpu_resume) .ltorg .popsection -ENTRY(_cpu_resume) +SYM_FUNC_START(_cpu_resume) + mov x0, x19 + bl finalise_el2 + mrs x1, mpidr_el1 adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address @@ -132,7 +139,7 @@ ENTRY(_cpu_resume) */ bl cpu_do_resume -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && 
defined(CONFIG_KASAN_STACK) mov x0, sp bl kasan_unpoison_task_stack_below #endif @@ -145,4 +152,4 @@ ENTRY(_cpu_resume) ldp x29, lr, [x29] mov x0, #0 ret -ENDPROC(_cpu_resume) +SYM_FUNC_END(_cpu_resume) diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S index 184332286a81..2def9d0dd3dd 100644 --- a/arch/arm64/kernel/smccc-call.S +++ b/arch/arm64/kernel/smccc-call.S @@ -1,15 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2015, Linaro Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License Version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include <linux/linkage.h> #include <linux/arm-smccc.h> @@ -18,7 +9,6 @@ #include <asm/assembler.h> .macro SMCCC instr - .cfi_startproc \instr #0 ldr x4, [sp] stp x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS] @@ -30,7 +20,6 @@ b.ne 1f str x6, [x4, ARM_SMCCC_QUIRK_STATE_OFFS] 1: ret - .cfi_endproc .endm /* @@ -39,9 +28,9 @@ * unsigned long a6, unsigned long a7, struct arm_smccc_res *res, * struct arm_smccc_quirk *quirk) */ -ENTRY(__arm_smccc_smc) +SYM_FUNC_START(__arm_smccc_smc) SMCCC smc -ENDPROC(__arm_smccc_smc) +SYM_FUNC_END(__arm_smccc_smc) EXPORT_SYMBOL(__arm_smccc_smc) /* @@ -50,7 +39,64 @@ EXPORT_SYMBOL(__arm_smccc_smc) * unsigned long a6, unsigned long a7, struct arm_smccc_res *res, * struct arm_smccc_quirk *quirk) */ -ENTRY(__arm_smccc_hvc) +SYM_FUNC_START(__arm_smccc_hvc) SMCCC hvc -ENDPROC(__arm_smccc_hvc) +SYM_FUNC_END(__arm_smccc_hvc) EXPORT_SYMBOL(__arm_smccc_hvc) + + .macro SMCCC_1_2 instr + /* Save `res` and free a GPR that won't be clobbered */ + stp x1, x19, [sp, #-16]! 
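+
+	/*
+	 * For reference, a caller-side sketch (the function ID is a
+	 * placeholder, not a real service):
+	 *
+	 *	struct arm_smccc_1_2_regs args = { .a0 = MY_FUNC_ID }, res;
+	 *
+	 *	arm_smccc_1_2_smc(&args, &res);
+	 *	if (res.a0 != SMCCC_RET_SUCCESS)
+	 *		return -EIO;
+	 *
+	 * SMCCC v1.2 lets x0-x17 carry both arguments and results, hence
+	 * the full load/store of the regs structure below.
+	 */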
+ + /* Ensure `args` won't be clobbered while loading regs in next step */ + mov x19, x0 + + /* Load the registers x0 - x17 from the struct arm_smccc_1_2_regs */ + ldp x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS] + ldp x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS] + ldp x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS] + ldp x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS] + ldp x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS] + ldp x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS] + ldp x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS] + ldp x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS] + ldp x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS] + + \instr #0 + + /* Load the `res` from the stack */ + ldr x19, [sp] + + /* Store the registers x0 - x17 into the result structure */ + stp x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS] + stp x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS] + stp x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS] + stp x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS] + stp x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS] + stp x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS] + stp x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS] + stp x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS] + stp x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS] + + /* Restore original x19 */ + ldp xzr, x19, [sp], #16 + ret +.endm + +/* + * void arm_smccc_1_2_hvc(const struct arm_smccc_1_2_regs *args, + * struct arm_smccc_1_2_regs *res); + */ +SYM_FUNC_START(arm_smccc_1_2_hvc) + SMCCC_1_2 hvc +SYM_FUNC_END(arm_smccc_1_2_hvc) +EXPORT_SYMBOL(arm_smccc_1_2_hvc) + +/* + * void arm_smccc_1_2_smc(const struct arm_smccc_1_2_regs *args, + * struct arm_smccc_1_2_regs *res); + */ +SYM_FUNC_START(arm_smccc_1_2_smc) + SMCCC_1_2 smc +SYM_FUNC_END(arm_smccc_1_2_smc) +EXPORT_SYMBOL(arm_smccc_1_2_smc) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 1598d6f7200a..1aa324104afb 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * SMP initialisation and IPI support * Based on arch/arm/kernel/smp.c * * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/acpi.h> @@ -35,12 +24,17 @@ #include <linux/smp.h> #include <linux/seq_file.h> #include <linux/irq.h> +#include <linux/irqchip/arm-gic-v3.h> #include <linux/percpu.h> #include <linux/clockchips.h> #include <linux/completion.h> #include <linux/of.h> #include <linux/irq_work.h> +#include <linux/kernel_stat.h> #include <linux/kexec.h> +#include <linux/kgdb.h> +#include <linux/kvm_host.h> +#include <linux/nmi.h> #include <asm/alternative.h> #include <asm/atomic.h> @@ -49,10 +43,9 @@ #include <asm/cputype.h> #include <asm/cpu_ops.h> #include <asm/daifflags.h> +#include <asm/kvm_mmu.h> #include <asm/mmu_context.h> #include <asm/numa.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/processor.h> #include <asm/smp_plat.h> #include <asm/sections.h> @@ -60,12 +53,8 @@ #include <asm/ptrace.h> #include <asm/virt.h> -#define CREATE_TRACE_POINTS #include <trace/events/ipi.h> -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); - /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -73,19 +62,27 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); */ struct secondary_data secondary_data; /* Number of CPUs which aren't online, but looping in kernel text. */ -int cpus_stuck_in_kernel; - -enum ipi_msg_type { - IPI_RESCHEDULE, - IPI_CALL_FUNC, - IPI_CPU_STOP, - IPI_CPU_CRASH_STOP, - IPI_TIMER, - IPI_IRQ_WORK, - IPI_WAKEUP +static int cpus_stuck_in_kernel; + +static int ipi_irq_base __ro_after_init; +static int nr_ipi __ro_after_init = NR_IPI; + +struct ipi_descs { + struct irq_desc *descs[MAX_IPI]; }; +static DEFINE_PER_CPU_READ_MOSTLY(struct ipi_descs, pcpu_ipi_desc); + +#define get_ipi_desc(__cpu, __ipi) (per_cpu_ptr(&pcpu_ipi_desc, __cpu)->descs[__ipi]) + +static bool percpu_ipi_descs __ro_after_init; + +static bool crash_stop; + +static void ipi_setup(int cpu); + #ifdef CONFIG_HOTPLUG_CPU +static void ipi_teardown(int cpu); static int op_cpu_kill(unsigned int cpu); #else static inline int op_cpu_kill(unsigned int cpu) @@ -101,8 +98,10 @@ static inline int op_cpu_kill(unsigned int cpu) */ static int boot_secondary(unsigned int cpu, struct task_struct *idle) { - if (cpu_ops[cpu]->cpu_boot) - return cpu_ops[cpu]->cpu_boot(cpu); + const struct cpu_operations *ops = get_cpu_ops(cpu); + + if (ops->cpu_boot) + return ops->cpu_boot(cpu); return -EOPNOTSUPP; } @@ -119,65 +118,74 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * page tables. */ secondary_data.task = idle; - secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); - __flush_dcache_area(&secondary_data, sizeof(secondary_data)); - /* - * Now bring the CPU into our world. - */ + /* Now bring the CPU into our world */ ret = boot_secondary(cpu, idle); - if (ret == 0) { - /* - * CPU was successfully started, wait for it to come online or - * time out. - */ - wait_for_completion_timeout(&cpu_running, - msecs_to_jiffies(1000)); - - if (!cpu_online(cpu)) { - pr_crit("CPU%u: failed to come online\n", cpu); - ret = -EIO; - } - } else { - pr_err("CPU%u: failed to boot: %d\n", cpu, ret); + if (ret) { + if (ret != -EPERM) + pr_err("CPU%u: failed to boot: %d\n", cpu, ret); return ret; } + /* + * CPU was successfully started, wait for it to come online or + * time out. 
+ */ + wait_for_completion_timeout(&cpu_running, + msecs_to_jiffies(5000)); + if (cpu_online(cpu)) + return 0; + + pr_crit("CPU%u: failed to come online\n", cpu); secondary_data.task = NULL; - secondary_data.stack = NULL; status = READ_ONCE(secondary_data.status); - if (ret && status) { - - if (status == CPU_MMU_OFF) - status = READ_ONCE(__early_cpu_boot_status); + if (status == CPU_MMU_OFF) + status = READ_ONCE(__early_cpu_boot_status); - switch (status & CPU_BOOT_STATUS_MASK) { - default: - pr_err("CPU%u: failed in unknown state : 0x%lx\n", - cpu, status); - break; - case CPU_KILL_ME: - if (!op_cpu_kill(cpu)) { - pr_crit("CPU%u: died during early boot\n", cpu); - break; - } - /* Fall through */ - pr_crit("CPU%u: may not have shut down cleanly\n", cpu); - case CPU_STUCK_IN_KERNEL: - pr_crit("CPU%u: is stuck in kernel\n", cpu); - if (status & CPU_STUCK_REASON_52_BIT_VA) - pr_crit("CPU%u: does not support 52-bit VAs\n", cpu); - if (status & CPU_STUCK_REASON_NO_GRAN) - pr_crit("CPU%u: does not support %luK granule \n", cpu, PAGE_SIZE / SZ_1K); - cpus_stuck_in_kernel++; + switch (status & CPU_BOOT_STATUS_MASK) { + default: + pr_err("CPU%u: failed in unknown state : 0x%lx\n", + cpu, status); + cpus_stuck_in_kernel++; + break; + case CPU_KILL_ME: + if (!op_cpu_kill(cpu)) { + pr_crit("CPU%u: died during early boot\n", cpu); break; - case CPU_PANIC_KERNEL: - panic("CPU%u detected unsupported configuration\n", cpu); } + pr_crit("CPU%u: may not have shut down cleanly\n", cpu); + fallthrough; + case CPU_STUCK_IN_KERNEL: + pr_crit("CPU%u: is stuck in kernel\n", cpu); + if (status & CPU_STUCK_REASON_52_BIT_VA) + pr_crit("CPU%u: does not support 52-bit VAs\n", cpu); + if (status & CPU_STUCK_REASON_NO_GRAN) { + pr_crit("CPU%u: does not support %luK granule\n", + cpu, PAGE_SIZE / SZ_1K); + } + cpus_stuck_in_kernel++; + break; + case CPU_PANIC_KERNEL: + panic("CPU%u detected unsupported configuration\n", cpu); } - return ret; + return -EIO; +} + +static void init_gic_priority_masking(void) +{ + u32 cpuflags; + + if (WARN_ON(!gic_enable_sre())) + return; + + cpuflags = read_sysreg(daif); + + WARN_ON(!(cpuflags & PSR_I_BIT)); + WARN_ON(!(cpuflags & PSR_F_BIT)); + + gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); } /* @@ -188,10 +196,8 @@ asmlinkage notrace void secondary_start_kernel(void) { u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; struct mm_struct *mm = &init_mm; - unsigned int cpu; - - cpu = task_cpu(current); - set_my_cpu_offset(per_cpu_offset(cpu)); + const struct cpu_operations *ops; + unsigned int cpu = smp_processor_id(); /* * All kernel threads share the same mm context; grab a @@ -206,7 +212,10 @@ asmlinkage notrace void secondary_start_kernel(void) */ cpu_uninstall_idmap(); - preempt_disable(); + if (system_uses_irq_prio_masking()) + init_gic_priority_masking(); + + rcutree_report_cpu_starting(cpu); trace_hardirqs_off(); /* @@ -216,20 +225,23 @@ asmlinkage notrace void secondary_start_kernel(void) */ check_local_cpu_capabilities(); - if (cpu_ops[cpu]->cpu_postboot) - cpu_ops[cpu]->cpu_postboot(); + ops = get_cpu_ops(cpu); + if (ops->cpu_postboot) + ops->cpu_postboot(); /* * Log the CPU info before it is marked online and might get read. */ cpuinfo_store_cpu(); + store_cpu_topology(cpu); /* * Enable GIC and timers. 
*/ notify_cpu_starting(cpu); - store_cpu_topology(cpu); + ipi_setup(cpu); + numa_add_cpu(cpu); /* @@ -244,6 +256,13 @@ asmlinkage notrace void secondary_start_kernel(void) set_cpu_online(cpu, true); complete(&cpu_running); + /* + * Secondary CPUs enter the kernel with all DAIF exceptions masked. + * + * As with setup_arch() we must unmask Debug and SError exceptions, and + * as the root irqchip has already been detected and initialized we can + * unmask IRQ and FIQ at the same time. + */ local_daif_restore(DAIF_PROCCTX); /* @@ -255,19 +274,21 @@ asmlinkage notrace void secondary_start_kernel(void) #ifdef CONFIG_HOTPLUG_CPU static int op_cpu_disable(unsigned int cpu) { + const struct cpu_operations *ops = get_cpu_ops(cpu); + /* * If we don't have a cpu_die method, abort before we reach the point * of no return. CPU0 may not have an cpu_ops, so test for it. */ - if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_die) + if (!ops || !ops->cpu_die) return -EOPNOTSUPP; /* * We may need to abort a hot unplug for some other mechanism-specific * reason. */ - if (cpu_ops[cpu]->cpu_disable) - return cpu_ops[cpu]->cpu_disable(cpu); + if (ops->cpu_disable) + return ops->cpu_disable(cpu); return 0; } @@ -292,6 +313,7 @@ int __cpu_disable(void) * and we must not schedule until we're ready to give up the cpu. */ set_cpu_online(cpu, false); + ipi_teardown(cpu); /* * OK - migrate IRQs away from this CPU @@ -303,74 +325,82 @@ int __cpu_disable(void) static int op_cpu_kill(unsigned int cpu) { + const struct cpu_operations *ops = get_cpu_ops(cpu); + /* * If we have no means of synchronising with the dying CPU, then assume * that it is really dead. We can only wait for an arbitrary length of * time and hope that it's dead, so let's skip the wait and just hope. */ - if (!cpu_ops[cpu]->cpu_kill) + if (!ops->cpu_kill) return 0; - return cpu_ops[cpu]->cpu_kill(cpu); + return ops->cpu_kill(cpu); } /* - * called on the thread which is asking for a CPU to be shutdown - - * waits until shutdown has completed, or it is timed out. + * Called on the thread which is asking for a CPU to be shutdown after the + * shutdown completed. */ -void __cpu_die(unsigned int cpu) +void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { int err; - if (!cpu_wait_death(cpu, 5)) { - pr_crit("CPU%u: cpu didn't die\n", cpu); - return; - } - pr_notice("CPU%u: shutdown\n", cpu); + pr_debug("CPU%u: shutdown\n", cpu); /* * Now that the dying CPU is beyond the point of no return w.r.t. - * in-kernel synchronisation, try to get the firwmare to help us to + * in-kernel synchronisation, try to get the firmware to help us to * verify that it has really left the kernel before we consider * clobbering anything it might still be using. */ err = op_cpu_kill(cpu); if (err) - pr_warn("CPU%d may not have shut down cleanly: %d\n", - cpu, err); + pr_warn("CPU%d may not have shut down cleanly: %d\n", cpu, err); } /* * Called from the idle thread for the CPU which has been shutdown. * */ -void cpu_die(void) +void __noreturn cpu_die(void) { unsigned int cpu = smp_processor_id(); + const struct cpu_operations *ops = get_cpu_ops(cpu); idle_task_exit(); local_daif_mask(); - /* Tell __cpu_die() that this CPU is now safe to dispose of */ - (void)cpu_report_death(); + /* Tell cpuhp_bp_sync_dead() that this CPU is now safe to dispose of */ + cpuhp_ap_report_dead(); /* * Actually shutdown the CPU. This must never fail. The specific hotplug * mechanism must perform all required cache maintenance to ensure that * no dirty lines are lost in the process of shutting down the CPU. 
*/ - cpu_ops[cpu]->cpu_die(cpu); + ops->cpu_die(cpu); BUG(); } #endif +static void __cpu_try_die(int cpu) +{ +#ifdef CONFIG_HOTPLUG_CPU + const struct cpu_operations *ops = get_cpu_ops(cpu); + + if (ops && ops->cpu_die) + ops->cpu_die(cpu); +#endif +} + /* * Kill the calling secondary CPU, early in bringup before it is turned * online. */ -void cpu_die_early(void) +void __noreturn cpu_die_early(void) { int cpu = smp_processor_id(); @@ -378,13 +408,13 @@ void cpu_die_early(void) /* Mark this CPU absent */ set_cpu_present(cpu, 0); + rcutree_report_cpu_dead(); + + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) { + update_cpu_boot_status(CPU_KILL_ME); + __cpu_try_die(cpu); + } -#ifdef CONFIG_HOTPLUG_CPU - update_cpu_boot_status(CPU_KILL_ME); - /* Check if we can park ourselves */ - if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_die) - cpu_ops[cpu]->cpu_die(cpu); -#endif update_cpu_boot_status(CPU_STUCK_IN_KERNEL); cpu_park_loop(); @@ -399,53 +429,40 @@ static void __init hyp_mode_check(void) "CPU: CPUs started in inconsistent modes"); else pr_info("CPU: All CPU(s) started at EL1\n"); + if (IS_ENABLED(CONFIG_KVM) && !is_kernel_in_hyp_mode()) { + kvm_compute_layout(); + kvm_apply_hyp_relocations(); + } } void __init smp_cpus_done(unsigned int max_cpus) { pr_info("SMP: Total of %d processors activated.\n", num_online_cpus()); - setup_cpu_features(); hyp_mode_check(); - apply_alternatives_all(); + setup_system_features(); + setup_user_features(); mark_linear_text_alias_ro(); } void __init smp_prepare_boot_cpu(void) { - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); /* - * Initialise the static keys early as they may be enabled by the - * cpufeature code. + * The runtime per-cpu areas have been allocated by + * setup_per_cpu_areas(), and CPU0's boot time per-cpu area will be + * freed shortly, so we must move over to the runtime per-cpu area. */ - jump_label_init(); - cpuinfo_store_boot_cpu(); -} + set_my_cpu_offset(per_cpu_offset(smp_processor_id())); -static u64 __init of_get_cpu_mpidr(struct device_node *dn) -{ - const __be32 *cell; - u64 hwid; + cpuinfo_store_boot_cpu(); + setup_boot_cpu_features(); - /* - * A cpu node with missing "reg" property is - * considered invalid to build a cpu_logical_map - * entry. - */ - cell = of_get_property(dn, "reg", NULL); - if (!cell) { - pr_err("%pOF: missing reg property\n", dn); - return INVALID_HWID; - } + /* Conditionally switch to GIC PMR for interrupt masking */ + if (system_uses_irq_prio_masking()) + init_gic_priority_masking(); - hwid = of_read_number(cell, of_n_addr_cells(dn)); - /* - * Non affinity bits must be set to 0 in the DT - */ - if (hwid & ~MPIDR_HWID_BITMASK) { - pr_err("%pOF: invalid reg property\n", dn); - return INVALID_HWID; - } - return hwid; + kasan_init_hw_tags(); + /* Init percpu seeds for random tags after cpus are set up. 
*/ + kasan_init_sw_tags(); } /* @@ -470,10 +487,13 @@ static bool __init is_mpidr_duplicate(unsigned int cpu, u64 hwid) */ static int __init smp_cpu_setup(int cpu) { - if (cpu_read_ops(cpu)) + const struct cpu_operations *ops; + + if (init_cpu_ops(cpu)) return -ENODEV; - if (cpu_ops[cpu]->cpu_init(cpu)) + ops = get_cpu_ops(cpu); + if (ops->cpu_init(cpu)) return -ENODEV; set_cpu_possible(cpu, true); @@ -484,6 +504,59 @@ static int __init smp_cpu_setup(int cpu) static bool bootcpu_valid __initdata; static unsigned int cpu_count = 1; +int arch_register_cpu(int cpu) +{ + acpi_handle acpi_handle = acpi_get_processor_handle(cpu); + struct cpu *c = &per_cpu(cpu_devices, cpu); + + if (!acpi_disabled && !acpi_handle && + IS_ENABLED(CONFIG_ACPI_HOTPLUG_CPU)) + return -EPROBE_DEFER; + +#ifdef CONFIG_ACPI_HOTPLUG_CPU + /* For now block anything that looks like physical CPU Hotplug */ + if (invalid_logical_cpuid(cpu) || !cpu_present(cpu)) { + pr_err_once("Changing CPU present bit is not supported\n"); + return -ENODEV; + } +#endif + + /* + * Availability of the acpi handle is sufficient to establish + * that _STA has already been checked. No need to recheck here. + */ + c->hotpluggable = arch_cpu_is_hotpluggable(cpu); + + return register_cpu(c, cpu); +} + +#ifdef CONFIG_ACPI_HOTPLUG_CPU +void arch_unregister_cpu(int cpu) +{ + acpi_handle acpi_handle = acpi_get_processor_handle(cpu); + struct cpu *c = &per_cpu(cpu_devices, cpu); + acpi_status status; + unsigned long long sta; + + if (!acpi_handle) { + pr_err_once("Removing a CPU without associated ACPI handle\n"); + return; + } + + status = acpi_evaluate_integer(acpi_handle, "_STA", NULL, &sta); + if (ACPI_FAILURE(status)) + return; + + /* For now do not allow anything that looks like physical CPU HP */ + if (cpu_present(cpu) && !(sta & ACPI_STA_DEVICE_PRESENT)) { + pr_err_once("Changing CPU present bit is not supported\n"); + return; + } + + unregister_cpu(c); +} +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + #ifdef CONFIG_ACPI static struct acpi_madt_generic_interrupt cpu_madt_gicc[NR_CPUS]; @@ -491,6 +564,7 @@ struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu) { return &cpu_madt_gicc[cpu]; } +EXPORT_SYMBOL_GPL(acpi_cpu_get_madt_gicc); /* * acpi_map_gic_cpu_interface - parse processor MADT entry @@ -503,7 +577,8 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) { u64 hwid = processor->arm_mpidr; - if (!(processor->flags & ACPI_MADT_ENABLED)) { + if (!(processor->flags & + (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE))) { pr_debug("skipping disabled CPU entry with 0x%llx MPIDR\n", hwid); return; } @@ -534,7 +609,7 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) return; /* map the logical cpu id to cpu MPIDR */ - cpu_logical_map(cpu_count) = hwid; + set_cpu_logical_map(cpu_count, hwid); cpu_madt_gicc[cpu_count] = *processor; @@ -553,7 +628,7 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) } static int __init -acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header, +acpi_parse_gic_cpu_interface(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *processor; @@ -562,7 +637,7 @@ acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header, if (BAD_MADT_GICC_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); acpi_map_gic_cpu_interface(processor); @@ -608,9 +683,9 @@ static void __init of_parse_and_init_cpus(void) struct 
device_node *dn; for_each_of_cpu_node(dn) { - u64 hwid = of_get_cpu_mpidr(dn); + u64 hwid = of_get_cpu_hwid(dn, 0); - if (hwid == INVALID_HWID) + if (hwid & ~MPIDR_HWID_BITMASK) goto next; if (is_mpidr_duplicate(cpu_count, hwid)) { @@ -648,7 +723,7 @@ static void __init of_parse_and_init_cpus(void) goto next; pr_debug("cpu logical map 0x%llx\n", hwid); - cpu_logical_map(cpu_count) = hwid; + set_cpu_logical_map(cpu_count, hwid); early_map_cpu_to_node(cpu_count, of_node_to_nid(dn)); next: @@ -689,13 +764,14 @@ void __init smp_init_cpus(void) for (i = 1; i < nr_cpu_ids; i++) { if (cpu_logical_map(i) != INVALID_HWID) { if (smp_cpu_setup(i)) - cpu_logical_map(i) = INVALID_HWID; + set_cpu_logical_map(i, INVALID_HWID); } } } void __init smp_prepare_cpus(unsigned int max_cpus) { + const struct cpu_operations *ops; int err; unsigned int cpu; unsigned int this_cpu; @@ -721,15 +797,14 @@ void __init smp_prepare_cpus(unsigned int max_cpus) */ for_each_possible_cpu(cpu) { - per_cpu(cpu_number, cpu) = cpu; - if (cpu == smp_processor_id()) continue; - if (!cpu_ops[cpu]) + ops = get_cpu_ops(cpu); + if (!ops) continue; - err = cpu_ops[cpu]->cpu_prepare(cpu); + err = ops->cpu_prepare(cpu); if (err) continue; @@ -738,53 +813,35 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } } -void (*__smp_cross_call)(const struct cpumask *, unsigned int); - -void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int)) -{ - __smp_cross_call = fn; -} - -static const char *ipi_types[NR_IPI] __tracepoint_string = { -#define S(x,s) [x] = s - S(IPI_RESCHEDULE, "Rescheduling interrupts"), - S(IPI_CALL_FUNC, "Function call interrupts"), - S(IPI_CPU_STOP, "CPU stop interrupts"), - S(IPI_CPU_CRASH_STOP, "CPU stop (for crash dump) interrupts"), - S(IPI_TIMER, "Timer broadcast interrupts"), - S(IPI_IRQ_WORK, "IRQ work interrupts"), - S(IPI_WAKEUP, "CPU wake-up interrupts"), +static const char *ipi_types[MAX_IPI] __tracepoint_string = { + [IPI_RESCHEDULE] = "Rescheduling interrupts", + [IPI_CALL_FUNC] = "Function call interrupts", + [IPI_CPU_STOP] = "CPU stop interrupts", + [IPI_CPU_STOP_NMI] = "CPU stop NMIs", + [IPI_TIMER] = "Timer broadcast interrupts", + [IPI_IRQ_WORK] = "IRQ work interrupts", + [IPI_CPU_BACKTRACE] = "CPU backtrace interrupts", + [IPI_KGDB_ROUNDUP] = "KGDB roundup interrupts", }; -static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) -{ - trace_ipi_raise(target, ipi_types[ipinr]); - __smp_cross_call(target, ipinr); -} +static void smp_cross_call(const struct cpumask *target, unsigned int ipinr); -void show_ipi_list(struct seq_file *p, int prec) +unsigned long irq_err_count; + +int arch_show_interrupts(struct seq_file *p, int prec) { unsigned int cpu, i; - for (i = 0; i < NR_IPI; i++) { + for (i = 0; i < MAX_IPI; i++) { seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i, prec >= 4 ? 
" " : ""); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", - __get_irq_stat(cpu, ipi_irqs[i])); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(get_ipi_desc(cpu, i), cpu)); seq_printf(p, " %s\n", ipi_types[i]); } -} -u64 smp_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = 0; - int i; - - for (i = 0; i < NR_IPI; i++) - sum += __get_irq_stat(cpu, ipi_irqs[i]); - - return sum; + seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count); + return 0; } void arch_send_call_function_ipi_mask(const struct cpumask *mask) @@ -797,71 +854,113 @@ void arch_send_call_function_single_ipi(int cpu) smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC); } -#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL -void arch_send_wakeup_ipi_mask(const struct cpumask *mask) -{ - smp_cross_call(mask, IPI_WAKEUP); -} -#endif - #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { - if (__smp_cross_call) - smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); + smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); } #endif -/* - * ipi_cpu_stop - handle IPI from smp_send_stop() - */ -static void ipi_cpu_stop(unsigned int cpu) +static void __noreturn local_cpu_stop(unsigned int cpu) { set_cpu_online(cpu, false); local_daif_mask(); sdei_mask_local_cpu(); - - while (1) - cpu_relax(); + cpu_park_loop(); } -#ifdef CONFIG_KEXEC_CORE -static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0); -#endif +/* + * We need to implement panic_smp_self_stop() for parallel panic() calls, so + * that cpu_online_mask gets correctly updated and smp_send_stop() can skip + * CPUs that have already stopped themselves. + */ +void __noreturn panic_smp_self_stop(void) +{ + local_cpu_stop(smp_processor_id()); +} -static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) +static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) { #ifdef CONFIG_KEXEC_CORE + /* + * Use local_daif_mask() instead of local_irq_disable() to make sure + * that pseudo-NMIs are disabled. The "crash stop" code starts with + * an IRQ and falls back to NMI (which might be pseudo). If the IRQ + * finally goes through right as we're timing out then the NMI could + * interrupt us. It's better to prevent the NMI and let the IRQ + * finish since the pt_regs will be better. + */ + local_daif_mask(); + crash_save_cpu(regs, cpu); - atomic_dec(&waiting_for_crash_ipi); + set_cpu_online(cpu, false); - local_irq_disable(); sdei_mask_local_cpu(); -#ifdef CONFIG_HOTPLUG_CPU - if (cpu_ops[cpu]->cpu_die) - cpu_ops[cpu]->cpu_die(cpu); -#endif + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) + __cpu_try_die(cpu); /* just in case */ cpu_park_loop(); +#else + BUG(); #endif } +static void arm64_send_ipi(const cpumask_t *mask, unsigned int nr) +{ + unsigned int cpu; + + if (!percpu_ipi_descs) + __ipi_send_mask(get_ipi_desc(0, nr), mask); + else + for_each_cpu(cpu, mask) + __ipi_send_single(get_ipi_desc(cpu, nr), cpu); +} + +static void arm64_backtrace_ipi(cpumask_t *mask) +{ + arm64_send_ipi(mask, IPI_CPU_BACKTRACE); +} + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu) +{ + /* + * NOTE: though nmi_trigger_cpumask_backtrace() has "nmi_" in the name, + * nothing about it truly needs to be implemented using an NMI, it's + * just that it's _allowed_ to work with NMIs. If ipi_should_be_nmi() + * returned false our backtrace attempt will just use a regular IPI. 
+ */ + nmi_trigger_cpumask_backtrace(mask, exclude_cpu, arm64_backtrace_ipi); +} + +#ifdef CONFIG_KGDB +void kgdb_roundup_cpus(void) +{ + int this_cpu = raw_smp_processor_id(); + int cpu; + + for_each_online_cpu(cpu) { + /* No need to roundup ourselves */ + if (cpu == this_cpu) + continue; + + __ipi_send_single(get_ipi_desc(cpu, IPI_KGDB_ROUNDUP), cpu); + } +} +#endif + /* * Main handler for inter-processor interrupts */ -void handle_IPI(int ipinr, struct pt_regs *regs) +static void do_handle_IPI(int ipinr) { unsigned int cpu = smp_processor_id(); - struct pt_regs *old_regs = set_irq_regs(regs); - if ((unsigned)ipinr < NR_IPI) { - trace_ipi_entry_rcuidle(ipi_types[ipinr]); - __inc_irq_stat(cpu, ipi_irqs[ipinr]); - } + if ((unsigned)ipinr < NR_IPI) + trace_ipi_entry(ipi_types[ipinr]); switch (ipinr) { case IPI_RESCHEDULE: @@ -869,49 +968,42 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; case IPI_CALL_FUNC: - irq_enter(); generic_smp_call_function_interrupt(); - irq_exit(); break; case IPI_CPU_STOP: - irq_enter(); - ipi_cpu_stop(cpu); - irq_exit(); - break; - - case IPI_CPU_CRASH_STOP: - if (IS_ENABLED(CONFIG_KEXEC_CORE)) { - irq_enter(); - ipi_cpu_crash_stop(cpu, regs); - + case IPI_CPU_STOP_NMI: + if (IS_ENABLED(CONFIG_KEXEC_CORE) && crash_stop) { + ipi_cpu_crash_stop(cpu, get_irq_regs()); unreachable(); + } else { + local_cpu_stop(cpu); } break; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST case IPI_TIMER: - irq_enter(); tick_receive_broadcast(); - irq_exit(); break; #endif #ifdef CONFIG_IRQ_WORK case IPI_IRQ_WORK: - irq_enter(); irq_work_run(); - irq_exit(); break; #endif -#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL - case IPI_WAKEUP: - WARN_ONCE(!acpi_parking_protocol_valid(cpu), - "CPU%u: Wake-up IPI outside the ACPI parking protocol\n", - cpu); + case IPI_CPU_BACKTRACE: + /* + * NOTE: in some cases this _won't_ be NMI context. See the + * comment in arch_trigger_cpumask_backtrace(). 
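+		 *
+		 * For reference, this is the handler that runs when e.g.
+		 *
+		 *	echo l > /proc/sysrq-trigger
+		 *
+		 * requests a backtrace of every active CPU.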
+ */ + nmi_cpu_backtrace(get_irq_regs()); + break; + + case IPI_KGDB_ROUNDUP: + kgdb_nmicallback(cpu, get_irq_regs()); break; -#endif default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); @@ -919,15 +1011,159 @@ void handle_IPI(int ipinr, struct pt_regs *regs) } if ((unsigned)ipinr < NR_IPI) - trace_ipi_exit_rcuidle(ipi_types[ipinr]); - set_irq_regs(old_regs); + trace_ipi_exit(ipi_types[ipinr]); +} + +static irqreturn_t ipi_handler(int irq, void *data) +{ + unsigned int ipi = (irq - ipi_irq_base) % nr_ipi; + + do_handle_IPI(ipi); + return IRQ_HANDLED; +} + +static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) +{ + trace_ipi_raise(target, ipi_types[ipinr]); + arm64_send_ipi(target, ipinr); +} + +static bool ipi_should_be_nmi(enum ipi_msg_type ipi) +{ + if (!system_uses_irq_prio_masking()) + return false; + + switch (ipi) { + case IPI_CPU_STOP_NMI: + case IPI_CPU_BACKTRACE: + case IPI_KGDB_ROUNDUP: + return true; + default: + return false; + } +} + +static void ipi_setup(int cpu) +{ + int i; + + if (WARN_ON_ONCE(!ipi_irq_base)) + return; + + for (i = 0; i < nr_ipi; i++) { + if (!percpu_ipi_descs) { + if (ipi_should_be_nmi(i)) { + prepare_percpu_nmi(ipi_irq_base + i); + enable_percpu_nmi(ipi_irq_base + i, 0); + } else { + enable_percpu_irq(ipi_irq_base + i, 0); + } + } else { + enable_irq(irq_desc_get_irq(get_ipi_desc(cpu, i))); + } + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static void ipi_teardown(int cpu) +{ + int i; + + if (WARN_ON_ONCE(!ipi_irq_base)) + return; + + for (i = 0; i < nr_ipi; i++) { + if (!percpu_ipi_descs) { + if (ipi_should_be_nmi(i)) { + disable_percpu_nmi(ipi_irq_base + i); + teardown_percpu_nmi(ipi_irq_base + i); + } else { + disable_percpu_irq(ipi_irq_base + i); + } + } else { + disable_irq(irq_desc_get_irq(get_ipi_desc(cpu, i))); + } + } +} +#endif + +static void ipi_setup_sgi(int ipi) +{ + int err, irq, cpu; + + irq = ipi_irq_base + ipi; + + if (ipi_should_be_nmi(ipi)) { + err = request_percpu_nmi(irq, ipi_handler, "IPI", NULL, &irq_stat); + WARN(err, "Could not request IRQ %d as NMI, err=%d\n", irq, err); + } else { + err = request_percpu_irq(irq, ipi_handler, "IPI", &irq_stat); + WARN(err, "Could not request IRQ %d as IRQ, err=%d\n", irq, err); + } + + for_each_possible_cpu(cpu) + get_ipi_desc(cpu, ipi) = irq_to_desc(irq); + + irq_set_status_flags(irq, IRQ_HIDDEN); +} + +static void ipi_setup_lpi(int ipi, int ncpus) +{ + for (int cpu = 0; cpu < ncpus; cpu++) { + int err, irq; + + irq = ipi_irq_base + (cpu * nr_ipi) + ipi; + + err = irq_force_affinity(irq, cpumask_of(cpu)); + WARN(err, "Could not force affinity IRQ %d, err=%d\n", irq, err); + + err = request_irq(irq, ipi_handler, IRQF_NO_AUTOEN, "IPI", + NULL); + WARN(err, "Could not request IRQ %d, err=%d\n", irq, err); + + irq_set_status_flags(irq, (IRQ_HIDDEN | IRQ_NO_BALANCING_MASK)); + + get_ipi_desc(cpu, ipi) = irq_to_desc(irq); + } } -void smp_send_reschedule(int cpu) +void __init set_smp_ipi_range_percpu(int ipi_base, int n, int ncpus) +{ + int i; + + WARN_ON(n < MAX_IPI); + nr_ipi = min(n, MAX_IPI); + + percpu_ipi_descs = !!ncpus; + ipi_irq_base = ipi_base; + + for (i = 0; i < nr_ipi; i++) { + if (!percpu_ipi_descs) + ipi_setup_sgi(i); + else + ipi_setup_lpi(i, ncpus); + } + + /* Setup the boot CPU immediately */ + ipi_setup(smp_processor_id()); +} + +void arch_smp_send_reschedule(int cpu) { smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE); } +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +void arch_send_wakeup_ipi(unsigned int cpu) +{ + /* + * We use a scheduler IPI to wake the 
CPU as this avoids the need for a + * dedicated IPI and we can safely handle spurious scheduler IPIs. + */ + smp_send_reschedule(cpu); +} +#endif + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST void tick_broadcast(const struct cpumask *mask) { @@ -935,94 +1171,132 @@ void tick_broadcast(const struct cpumask *mask) } #endif +/* + * The number of CPUs online, not counting this CPU (which may not be + * fully online and so not counted in num_online_cpus()). + */ +static inline unsigned int num_other_online_cpus(void) +{ + unsigned int this_cpu_online = cpu_online(smp_processor_id()); + + return num_online_cpus() - this_cpu_online; +} + void smp_send_stop(void) { + static unsigned long stop_in_progress; + static cpumask_t mask; unsigned long timeout; - if (num_online_cpus() > 1) { - cpumask_t mask; + /* + * If this cpu is the only one alive at this point in time, online or + * not, there are no stop messages to be sent around, so just back out. + */ + if (num_other_online_cpus() == 0) + goto skip_ipi; + + /* Only proceed if this is the first CPU to reach this code */ + if (test_and_set_bit(0, &stop_in_progress)) + return; + + /* + * Send an IPI to all currently online CPUs except the CPU running + * this code. + * + * NOTE: we don't do anything here to prevent other CPUs from coming + * online after we snapshot `cpu_online_mask`. Ideally, the calling code + * should do something to prevent other CPUs from coming up. This code + * can be called in the panic path and thus it doesn't seem wise to + * grab the CPU hotplug mutex ourselves. Worst case: + * - If a CPU comes online as we're running, we'll likely notice it + * during the 1 second wait below and then we'll catch it when we try + * with an NMI (assuming NMIs are enabled) since we re-snapshot the + * mask before sending an NMI. + * - If we leave the function and see that CPUs are still online we'll + * at least print a warning. Especially without NMIs this function + * isn't foolproof anyway so calling code will just have to accept + * the fact that there could be cases where a CPU can't be stopped. + */ + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + if (system_state <= SYSTEM_RUNNING) + pr_crit("SMP: stopping secondary CPUs\n"); + + /* + * Start with a normal IPI and wait up to one second for other CPUs to + * stop. We do this first because it gives other processors a chance + * to exit critical sections / drop locks and makes the rest of the + * stop process (especially console flush) more robust. + */ + smp_cross_call(&mask, IPI_CPU_STOP); + timeout = USEC_PER_SEC; + while (num_other_online_cpus() && timeout--) + udelay(1); + /* + * If CPUs are still online, try an NMI. There's no excuse for this to + * be slow, so we only give them an extra 10 ms to respond. 
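+	 *
+	 * (In practice the NMI retry requires pseudo-NMI support, i.e. a
+	 * kernel built with CONFIG_ARM64_PSEUDO_NMI and booted with
+	 * irqchip.gicv3_pseudo_nmi=1; otherwise ipi_should_be_nmi() stays
+	 * false and we fall through to the warning further down.)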
+ */ + if (num_other_online_cpus() && ipi_should_be_nmi(IPI_CPU_STOP_NMI)) { + smp_rmb(); cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); - if (system_state <= SYSTEM_RUNNING) - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_STOP); + pr_info("SMP: retry stop with NMI for CPUs %*pbl\n", + cpumask_pr_args(&mask)); + + smp_cross_call(&mask, IPI_CPU_STOP_NMI); + timeout = USEC_PER_MSEC * 10; + while (num_other_online_cpus() && timeout--) + udelay(1); } - /* Wait up to one second for other CPUs to stop */ - timeout = USEC_PER_SEC; - while (num_online_cpus() > 1 && timeout--) - udelay(1); + if (num_other_online_cpus()) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); - if (num_online_cpus() > 1) - pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(cpu_online_mask)); + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", + cpumask_pr_args(&mask)); + } +skip_ipi: sdei_mask_local_cpu(); } #ifdef CONFIG_KEXEC_CORE void crash_smp_send_stop(void) { - static int cpus_stopped; - cpumask_t mask; - unsigned long timeout; - /* * This function can be called twice in panic path, but obviously * we execute this only once. + * + * We use this same boolean to tell whether the IPI we send was a + * stop or a "crash stop". */ - if (cpus_stopped) + if (crash_stop) return; + crash_stop = 1; - cpus_stopped = 1; + smp_send_stop(); - if (num_online_cpus() == 1) { - sdei_mask_local_cpu(); - return; - } - - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); - - atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_CRASH_STOP); - - /* Wait up to one second for other CPUs to stop */ - timeout = USEC_PER_SEC; - while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--) - udelay(1); - - if (atomic_read(&waiting_for_crash_ipi) > 0) - pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(&mask)); - - sdei_mask_local_cpu(); + sdei_handler_abort(); } bool smp_crash_stop_failed(void) { - return (atomic_read(&waiting_for_crash_ipi) > 0); + return num_other_online_cpus() != 0; } #endif -/* - * not supported here - */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - static bool have_cpu_die(void) { #ifdef CONFIG_HOTPLUG_CPU int any_cpu = raw_smp_processor_id(); + const struct cpu_operations *ops = get_cpu_ops(any_cpu); - if (cpu_ops[any_cpu] && cpu_ops[any_cpu]->cpu_die) + if (ops && ops->cpu_die) return true; #endif return false; @@ -1032,5 +1306,6 @@ bool cpus_are_stuck_in_kernel(void) { bool smp_spin_tables = (num_possible_cpus() > 1 && !have_cpu_die()); - return !!cpus_stuck_in_kernel || smp_spin_tables; + return !!cpus_stuck_in_kernel || smp_spin_tables || + is_protected_kvm_enabled(); } diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index 93034651c87e..49029eace3ad 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Spin Table SMP initialisation * * Copyright (C) 2013 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/delay.h> @@ -47,7 +36,7 @@ static void write_pen_release(u64 val) unsigned long size = sizeof(secondary_holding_pen_release); secondary_holding_pen_release = val; - __flush_dcache_area(start, size); + dcache_clean_inval_poc((unsigned long)start, (unsigned long)start + size); } @@ -77,6 +66,7 @@ static int smp_spin_table_cpu_init(unsigned int cpu) static int smp_spin_table_cpu_prepare(unsigned int cpu) { __le64 __iomem *release_addr; + phys_addr_t pa_holding_pen = __pa_symbol(secondary_holding_pen); if (!cpu_release_addr[cpu]) return -ENODEV; @@ -94,14 +84,15 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) /* * We write the release address as LE regardless of the native - * endianess of the kernel. Therefore, any boot-loaders that + * endianness of the kernel. Therefore, any boot-loaders that * read this address need to convert this address to the - * boot-loader's endianess before jumping. This is mandated by + * boot-loader's endianness before jumping. This is mandated by * the boot protocol. */ - writeq_relaxed(__pa_symbol(secondary_holding_pen), release_addr); - __flush_dcache_area((__force void *)release_addr, - sizeof(*release_addr)); + writeq_relaxed(pa_holding_pen, release_addr); + dcache_clean_inval_poc((__force unsigned long)release_addr, + (__force unsigned long)release_addr + + sizeof(*release_addr)); /* * Send an event to wake up the secondary CPU. diff --git a/arch/arm64/kernel/ssbd.c b/arch/arm64/kernel/ssbd.c deleted file mode 100644 index 885f13e58708..000000000000 --- a/arch/arm64/kernel/ssbd.c +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2018 ARM Ltd, All Rights Reserved. - */ - -#include <linux/compat.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/thread_info.h> - -#include <asm/cpufeature.h> - -static void ssbd_ssbs_enable(struct task_struct *task) -{ - u64 val = is_compat_thread(task_thread_info(task)) ? - PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; - - task_pt_regs(task)->pstate |= val; -} - -static void ssbd_ssbs_disable(struct task_struct *task) -{ - u64 val = is_compat_thread(task_thread_info(task)) ? - PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; - - task_pt_regs(task)->pstate &= ~val; -} - -/* - * prctl interface for SSBD - */ -static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) -{ - int state = arm64_get_ssbd_state(); - - /* Unsupported */ - if (state == ARM64_SSBD_UNKNOWN) - return -EINVAL; - - /* Treat the unaffected/mitigated state separately */ - if (state == ARM64_SSBD_MITIGATED) { - switch (ctrl) { - case PR_SPEC_ENABLE: - return -EPERM; - case PR_SPEC_DISABLE: - case PR_SPEC_FORCE_DISABLE: - return 0; - } - } - - /* - * Things are a bit backward here: the arm64 internal API - * *enables the mitigation* when the userspace API *disables - * speculation*. So much fun. 
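/*
 * Illustrative sketch (not part of the patch): the userspace side of the
 * speculation-control interface that the ssbd prctl code above backs. A task
 * requests the Speculative Store Bypass mitigation by *disabling* speculation
 * via prctl() - the inversion described in the comment above.
 */
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
		  PR_SPEC_DISABLE, 0, 0))
		perror("PR_SET_SPECULATION_CTRL");

	/* Returns flags such as PR_SPEC_PRCTL | PR_SPEC_DISABLE. */
	printf("SSB speculation state flags: 0x%x\n",
	       prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0));
	return 0;
}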
- */ - switch (ctrl) { - case PR_SPEC_ENABLE: - /* If speculation is force disabled, enable is not allowed */ - if (state == ARM64_SSBD_FORCE_ENABLE || - task_spec_ssb_force_disable(task)) - return -EPERM; - task_clear_spec_ssb_disable(task); - clear_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_enable(task); - break; - case PR_SPEC_DISABLE: - if (state == ARM64_SSBD_FORCE_DISABLE) - return -EPERM; - task_set_spec_ssb_disable(task); - set_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_disable(task); - break; - case PR_SPEC_FORCE_DISABLE: - if (state == ARM64_SSBD_FORCE_DISABLE) - return -EPERM; - task_set_spec_ssb_disable(task); - task_set_spec_ssb_force_disable(task); - set_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_disable(task); - break; - default: - return -ERANGE; - } - - return 0; -} - -int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, - unsigned long ctrl) -{ - switch (which) { - case PR_SPEC_STORE_BYPASS: - return ssbd_prctl_set(task, ctrl); - default: - return -ENODEV; - } -} - -static int ssbd_prctl_get(struct task_struct *task) -{ - switch (arm64_get_ssbd_state()) { - case ARM64_SSBD_UNKNOWN: - return -EINVAL; - case ARM64_SSBD_FORCE_ENABLE: - return PR_SPEC_DISABLE; - case ARM64_SSBD_KERNEL: - if (task_spec_ssb_force_disable(task)) - return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; - if (task_spec_ssb_disable(task)) - return PR_SPEC_PRCTL | PR_SPEC_DISABLE; - return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - case ARM64_SSBD_FORCE_DISABLE: - return PR_SPEC_ENABLE; - default: - return PR_SPEC_NOT_AFFECTED; - } -} - -int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) -{ - switch (which) { - case PR_SPEC_STORE_BYPASS: - return ssbd_prctl_get(task); - default: - return -ENODEV; - } -} diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 1a29f2695ff2..3ebcf8c53fb0 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -1,192 +1,621 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Stack tracing support * * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/kernel.h> +#include <linux/efi.h> #include <linux/export.h> +#include <linux/filter.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/stacktrace.h> +#include <asm/efi.h> #include <asm/irq.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> +enum kunwind_source { + KUNWIND_SOURCE_UNKNOWN, + KUNWIND_SOURCE_FRAME, + KUNWIND_SOURCE_CALLER, + KUNWIND_SOURCE_TASK, + KUNWIND_SOURCE_REGS_PC, +}; + +union unwind_flags { + unsigned long all; + struct { + unsigned long fgraph : 1, + kretprobe : 1; + }; +}; + /* - * AArch64 PCS assigns the frame pointer to x29. + * Kernel unwind state * - * A simple function prologue looks like this: - * sub sp, sp, #0x10 - * stp x29, x30, [sp] - * mov x29, sp + * @common: Common unwind state. 
+ * @task: The task being unwound. + * @graph_idx: Used by ftrace_graph_ret_addr() for optimized stack unwinding. + * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance + * associated with the most recently encountered replacement lr + * value. + */ +struct kunwind_state { + struct unwind_state common; + struct task_struct *task; + int graph_idx; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif + enum kunwind_source source; + union unwind_flags flags; + struct pt_regs *regs; +}; + +static __always_inline void +kunwind_init(struct kunwind_state *state, + struct task_struct *task) +{ + unwind_init_common(&state->common); + state->task = task; + state->source = KUNWIND_SOURCE_UNKNOWN; + state->flags.all = 0; + state->regs = NULL; +} + +/* + * Start an unwind from a pt_regs. * - * A simple function epilogue looks like this: - * mov sp, x29 - * ldp x29, x30, [sp] - * add sp, sp, #0x10 + * The unwind will begin at the PC within the regs. + * + * The regs must be on a stack currently owned by the calling task. */ -int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) +static __always_inline void +kunwind_init_from_regs(struct kunwind_state *state, + struct pt_regs *regs) { - unsigned long fp = frame->fp; + kunwind_init(state, current); - if (fp & 0xf) - return -EINVAL; + state->regs = regs; + state->common.fp = regs->regs[29]; + state->common.pc = regs->pc; + state->source = KUNWIND_SOURCE_REGS_PC; +} - if (!tsk) - tsk = current; +/* + * Start an unwind from a caller. + * + * The unwind will begin at the caller of whichever function this is inlined + * into. + * + * The function which invokes this must be noinline. + */ +static __always_inline void +kunwind_init_from_caller(struct kunwind_state *state) +{ + kunwind_init(state, current); - if (!on_accessible_stack(tsk, fp, NULL)) - return -EINVAL; + state->common.fp = (unsigned long)__builtin_frame_address(1); + state->common.pc = (unsigned long)__builtin_return_address(0); + state->source = KUNWIND_SOURCE_CALLER; +} - frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); - frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); +/* + * Start an unwind from a blocked task. + * + * The unwind will begin at the blocked tasks saved PC (i.e. the caller of + * cpu_switch_to()). + * + * The caller should ensure the task is blocked in cpu_switch_to() for the + * duration of the unwind, or the unwind will be bogus. It is never valid to + * call this for the current task. + */ +static __always_inline void +kunwind_init_from_task(struct kunwind_state *state, + struct task_struct *task) +{ + kunwind_init(state, task); + + state->common.fp = thread_saved_fp(task); + state->common.pc = thread_saved_pc(task); + state->source = KUNWIND_SOURCE_TASK; +} +static __always_inline int +kunwind_recover_return_address(struct kunwind_state *state) +{ #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (tsk->ret_stack && - (frame->pc == (unsigned long)return_to_handler)) { - struct ftrace_ret_stack *ret_stack; - /* - * This is a case where function graph tracer has - * modified a return address (LR) in a stack frame - * to hook a function return. - * So replace it to an original value. 
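/*
 * Illustrative sketch (not part of the patch): how the compiler builtins used
 * by kunwind_init_from_caller() above behave. The function must be noinline
 * so that "frame 1" really is the caller's frame record. demo_show_caller()
 * is hypothetical.
 */
#include <linux/compiler.h>
#include <linux/printk.h>

static noinline void demo_show_caller(void)
{
	unsigned long fp = (unsigned long)__builtin_frame_address(1);
	unsigned long pc = (unsigned long)__builtin_return_address(0);

	/* pc is the address we will return to; fp is the caller's x29. */
	pr_info("called from %pS (fp=%016lx)\n", (void *)pc, fp);
}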
- */ - ret_stack = ftrace_graph_get_ret_stack(tsk, frame->graph++); - if (WARN_ON_ONCE(!ret_stack)) + if (state->task->ret_stack && + (state->common.pc == (unsigned long)return_to_handler)) { + unsigned long orig_pc; + orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, + state->common.pc, + (void *)state->common.fp); + if (state->common.pc == orig_pc) { + WARN_ON_ONCE(state->task == current); return -EINVAL; - frame->pc = ret_stack->ret; + } + state->common.pc = orig_pc; + state->flags.fgraph = 1; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - /* - * Frames created upon entry from EL0 have NULL FP and PC values, so - * don't bother reporting these. Frames created by __noreturn functions - * might have a valid FP even if PC is bogus, so only terminate where - * both are NULL. - */ - if (!frame->fp && !frame->pc) +#ifdef CONFIG_KRETPROBES + if (is_kretprobe_trampoline(state->common.pc)) { + unsigned long orig_pc; + orig_pc = kretprobe_find_ret_addr(state->task, + (void *)state->common.fp, + &state->kr_cur); + if (!orig_pc) + return -EINVAL; + state->common.pc = orig_pc; + state->flags.kretprobe = 1; + } +#endif /* CONFIG_KRETPROBES */ + + return 0; +} + +static __always_inline +int kunwind_next_regs_pc(struct kunwind_state *state) +{ + struct stack_info *info; + unsigned long fp = state->common.fp; + struct pt_regs *regs; + + regs = container_of((u64 *)fp, struct pt_regs, stackframe.record.fp); + + info = unwind_find_stack(&state->common, (unsigned long)regs, sizeof(*regs)); + if (!info) return -EINVAL; + unwind_consume_stack(&state->common, info, (unsigned long)regs, + sizeof(*regs)); + + state->regs = regs; + state->common.pc = regs->pc; + state->common.fp = regs->regs[29]; + state->regs = NULL; + state->source = KUNWIND_SOURCE_REGS_PC; return 0; } -void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data) +static __always_inline int +kunwind_next_frame_record_meta(struct kunwind_state *state) { - while (1) { - int ret; + struct task_struct *tsk = state->task; + unsigned long fp = state->common.fp; + struct frame_record_meta *meta; + struct stack_info *info; - if (fn(frame, data)) - break; - ret = unwind_frame(tsk, frame); - if (ret < 0) - break; + info = unwind_find_stack(&state->common, fp, sizeof(*meta)); + if (!info) + return -EINVAL; + + meta = (struct frame_record_meta *)fp; + switch (READ_ONCE(meta->type)) { + case FRAME_META_TYPE_FINAL: + if (meta == &task_pt_regs(tsk)->stackframe) + return -ENOENT; + WARN_ON_ONCE(tsk == current); + return -EINVAL; + case FRAME_META_TYPE_PT_REGS: + return kunwind_next_regs_pc(state); + default: + WARN_ON_ONCE(tsk == current); + return -EINVAL; } } -#ifdef CONFIG_STACKTRACE -struct stack_trace_data { - struct stack_trace *trace; - unsigned int no_sched_functions; - unsigned int skip; -}; +static __always_inline int +kunwind_next_frame_record(struct kunwind_state *state) +{ + unsigned long fp = state->common.fp; + struct frame_record *record; + struct stack_info *info; + unsigned long new_fp, new_pc; + + if (fp & 0x7) + return -EINVAL; + + info = unwind_find_stack(&state->common, fp, sizeof(*record)); + if (!info) + return -EINVAL; + + record = (struct frame_record *)fp; + new_fp = READ_ONCE(record->fp); + new_pc = READ_ONCE(record->lr); + + if (!new_fp && !new_pc) + return kunwind_next_frame_record_meta(state); + + unwind_consume_stack(&state->common, info, fp, sizeof(*record)); -static int save_trace(struct stackframe *frame, void *d) + state->common.fp = 
new_fp; + state->common.pc = new_pc; + state->source = KUNWIND_SOURCE_FRAME; + + return 0; +} + +/* + * Unwind from one frame record (A) to the next frame record (B). + * + * We terminate early if the location of B indicates a malformed chain of frame + * records (e.g. a cycle), determined based on the location and fp value of A + * and the location (but not the fp value) of B. + */ +static __always_inline int +kunwind_next(struct kunwind_state *state) { - struct stack_trace_data *data = d; - struct stack_trace *trace = data->trace; - unsigned long addr = frame->pc; + int err; + + state->flags.all = 0; - if (data->no_sched_functions && in_sched_functions(addr)) - return 0; - if (data->skip) { - data->skip--; - return 0; + switch (state->source) { + case KUNWIND_SOURCE_FRAME: + case KUNWIND_SOURCE_CALLER: + case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_PC: + err = kunwind_next_frame_record(state); + break; + default: + err = -EINVAL; } - trace->entries[trace->nr_entries++] = addr; + if (err) + return err; - return trace->nr_entries >= trace->max_entries; + state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc); + + return kunwind_recover_return_address(state); } -void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) +typedef bool (*kunwind_consume_fn)(const struct kunwind_state *state, void *cookie); + +static __always_inline int +do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state, + void *cookie) { - struct stack_trace_data data; - struct stackframe frame; + int ret; - data.trace = trace; - data.skip = trace->skip; - data.no_sched_functions = 0; + ret = kunwind_recover_return_address(state); + if (ret) + return ret; - frame.fp = regs->regs[29]; - frame.pc = regs->pc; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = 0; + while (1) { + if (!consume_state(state, cookie)) + return -EINVAL; + ret = kunwind_next(state); + if (ret == -ENOENT) + return 0; + if (ret < 0) + return ret; + } +} + +/* + * Per-cpu stacks are only accessible when unwinding the current task in a + * non-preemptible context. + */ +#define STACKINFO_CPU(name) \ + ({ \ + ((task == current) && !preemptible()) \ + ? stackinfo_get_##name() \ + : stackinfo_get_unknown(); \ + }) + +/* + * SDEI stacks are only accessible when unwinding the current task in an NMI + * context. + */ +#define STACKINFO_SDEI(name) \ + ({ \ + ((task == current) && in_nmi()) \ + ? stackinfo_get_sdei_##name() \ + : stackinfo_get_unknown(); \ + }) + +#define STACKINFO_EFI \ + ({ \ + ((task == current) && current_in_efi()) \ + ? 
stackinfo_get_efi() \ + : stackinfo_get_unknown(); \ + }) + +static __always_inline int +kunwind_stack_walk(kunwind_consume_fn consume_state, + void *cookie, struct task_struct *task, + struct pt_regs *regs) +{ + struct stack_info stacks[] = { + stackinfo_get_task(task), + STACKINFO_CPU(irq), + STACKINFO_CPU(overflow), +#if defined(CONFIG_ARM_SDE_INTERFACE) + STACKINFO_SDEI(normal), + STACKINFO_SDEI(critical), #endif +#ifdef CONFIG_EFI + STACKINFO_EFI, +#endif + }; + struct kunwind_state state = { + .common = { + .stacks = stacks, + .nr_stacks = ARRAY_SIZE(stacks), + }, + }; - walk_stackframe(current, &frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + if (regs) { + if (task != current) + return -EINVAL; + kunwind_init_from_regs(&state, regs); + } else if (task == current) { + kunwind_init_from_caller(&state); + } else { + kunwind_init_from_task(&state, task); + } + + return do_kunwind(&state, consume_state, cookie); } -static noinline void __save_stack_trace(struct task_struct *tsk, - struct stack_trace *trace, unsigned int nosched) +struct kunwind_consume_entry_data { + stack_trace_consume_fn consume_entry; + void *cookie; +}; + +static __always_inline bool +arch_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) { - struct stack_trace_data data; - struct stackframe frame; + struct kunwind_consume_entry_data *data = cookie; + return data->consume_entry(data->cookie, state->common.pc); +} - if (!try_get_task_stack(tsk)) - return; +noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) +{ + struct kunwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; - data.trace = trace; - data.skip = trace->skip; - data.no_sched_functions = nosched; + kunwind_stack_walk(arch_kunwind_consume_entry, &data, task, regs); +} - if (tsk != current) { - frame.fp = thread_saved_fp(tsk); - frame.pc = thread_saved_pc(tsk); - } else { - /* We don't want this function nor the caller */ - data.skip += 2; - frame.fp = (unsigned long)__builtin_frame_address(0); - frame.pc = (unsigned long)__save_stack_trace; +static __always_inline bool +arch_reliable_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + /* + * At an exception boundary we can reliably consume the saved PC. We do + * not know whether the LR was live when the exception was taken, and + * so we cannot perform the next unwind step reliably. + * + * All that matters is whether the *entire* unwind is reliable, so give + * up as soon as we hit an exception boundary. 
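/*
 * Illustrative sketch (not part of the patch): a minimal consumer of the
 * arch_stack_walk() interface that kunwind_stack_walk() above implements.
 * The callback returns false to stop the walk early. demo_dump_stack() and
 * the 16-frame limit are hypothetical.
 */
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>

static bool demo_print_entry(void *cookie, unsigned long pc)
{
	unsigned int *count = cookie;

	pr_info("  [%u] %pS\n", (*count)++, (void *)pc);
	return *count < 16;	/* stop after 16 frames */
}

static void demo_dump_stack(void)
{
	unsigned int count = 0;

	arch_stack_walk(demo_print_entry, &count, current, NULL);
}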
+ */ + if (state->source == KUNWIND_SOURCE_REGS_PC) + return false; + + return arch_kunwind_consume_entry(state, cookie); +} + +noinline noinstr int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, + struct task_struct *task) +{ + struct kunwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + return kunwind_stack_walk(arch_reliable_kunwind_consume_entry, &data, + task, NULL); +} + +struct bpf_unwind_consume_entry_data { + bool (*consume_entry)(void *cookie, u64 ip, u64 sp, u64 fp); + void *cookie; +}; + +static bool +arch_bpf_unwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + struct bpf_unwind_consume_entry_data *data = cookie; + + return data->consume_entry(data->cookie, state->common.pc, 0, + state->common.fp); +} + +noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u64 ip, u64 sp, + u64 fp), void *cookie) +{ + struct bpf_unwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); +} + +static const char *state_source_string(const struct kunwind_state *state) +{ + switch (state->source) { + case KUNWIND_SOURCE_FRAME: return NULL; + case KUNWIND_SOURCE_CALLER: return "C"; + case KUNWIND_SOURCE_TASK: return "T"; + case KUNWIND_SOURCE_REGS_PC: return "P"; + default: return "U"; } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = 0; -#endif +} + +static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) +{ + const char *source = state_source_string(state); + union unwind_flags flags = state->flags; + bool has_info = source || flags.all; + char *loglvl = arg; + + printk("%s %pSb%s%s%s%s%s\n", loglvl, + (void *)state->common.pc, + has_info ? " (" : "", + source ? source : "", + flags.fgraph ? "F" : "", + flags.kretprobe ? "K" : "", + has_info ? ")" : ""); + + return true; +} + +void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) +{ + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); + + if (regs && user_mode(regs)) + return; - walk_stackframe(tsk, &frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + if (!tsk) + tsk = current; + + if (!try_get_task_stack(tsk)) + return; + + printk("%sCall trace:\n", loglvl); + kunwind_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); put_task_stack(tsk); } -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) +{ + dump_backtrace(NULL, tsk, loglvl); + barrier(); +} + +/* + * The struct defined for userspace stack frame in AARCH64 mode. + */ +struct frame_tail { + struct frame_tail __user *fp; + unsigned long lr; +} __attribute__((packed)); + +/* + * Get the return address for a single stackframe and return a pointer to the + * next frame tail. 
+ */ +static struct frame_tail __user * +unwind_user_frame(struct frame_tail __user *tail, void *cookie, + stack_trace_consume_fn consume_entry) { - __save_stack_trace(tsk, trace, 1); + struct frame_tail buftail; + unsigned long err; + unsigned long lr; + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(tail, sizeof(buftail))) + return NULL; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); + pagefault_enable(); + + if (err) + return NULL; + + lr = ptrauth_strip_user_insn_pac(buftail.lr); + + if (!consume_entry(cookie, lr)) + return NULL; + + /* + * Frame pointers should strictly progress back up the stack + * (towards higher addresses). + */ + if (tail >= buftail.fp) + return NULL; + + return buftail.fp; } -void save_stack_trace(struct stack_trace *trace) +#ifdef CONFIG_COMPAT +/* + * The registers we're interested in are at the end of the variable + * length saved register structure. The fp points at the end of this + * structure so the address of this struct is: + * (struct compat_frame_tail *)(xxx->fp)-1 + * + * This code has been adapted from the ARM OProfile support. + */ +struct compat_frame_tail { + compat_uptr_t fp; /* a (struct compat_frame_tail *) in compat mode */ + u32 sp; + u32 lr; +} __attribute__((packed)); + +static struct compat_frame_tail __user * +unwind_compat_user_frame(struct compat_frame_tail __user *tail, void *cookie, + stack_trace_consume_fn consume_entry) { - __save_stack_trace(current, trace, 0); + struct compat_frame_tail buftail; + unsigned long err; + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(tail, sizeof(buftail))) + return NULL; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); + pagefault_enable(); + + if (err) + return NULL; + + if (!consume_entry(cookie, buftail.lr)) + return NULL; + + /* + * Frame pointers should strictly progress back up the stack + * (towards higher addresses). 
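/*
 * Illustrative sketch (not part of the patch): a userspace analogue of the
 * {fp, lr} frame records that unwind_user_frame() above consumes, assuming
 * AArch64 code built with frame pointers (-fno-omit-frame-pointer). The same
 * sanity checks apply: 8-byte alignment and strictly increasing addresses.
 */
#include <stdint.h>
#include <stdio.h>

struct frame_tail {
	struct frame_tail *fp;
	uint64_t lr;
};

static void walk_own_stack(void)
{
	struct frame_tail *tail = __builtin_frame_address(0);

	while (tail && !((uintptr_t)tail & 0x7)) {
		printf("lr = %#llx\n", (unsigned long long)tail->lr);
		if (tail->fp <= tail)	/* records must move to higher addresses */
			break;
		tail = tail->fp;
	}
}

int main(void)
{
	walk_own_stack();
	return 0;
}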
+ */ + if (tail + 1 >= (struct compat_frame_tail __user *) + compat_ptr(buftail.fp)) + return NULL; + + return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; } +#endif /* CONFIG_COMPAT */ + + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + if (!consume_entry(cookie, regs->pc)) + return; -EXPORT_SYMBOL_GPL(save_stack_trace); + if (!compat_user_mode(regs)) { + /* AARCH64 mode */ + struct frame_tail __user *tail; + + tail = (struct frame_tail __user *)regs->regs[29]; + while (tail && !((unsigned long)tail & 0x7)) + tail = unwind_user_frame(tail, cookie, consume_entry); + } else { +#ifdef CONFIG_COMPAT + /* AARCH32 compat mode */ + struct compat_frame_tail __user *tail; + + tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; + while (tail && !((unsigned long)tail & 0x3)) + tail = unwind_compat_user_frame(tail, cookie, consume_entry); #endif + } +} diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 9405d1b7f4b0..eaaff94329cd 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -3,13 +3,17 @@ #include <linux/percpu.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/pgtable.h> +#include <linux/cpuidle.h> #include <asm/alternative.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> +#include <asm/cpuidle.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/exec.h> -#include <asm/pgtable.h> +#include <asm/fpsimd.h> +#include <asm/mte.h> #include <asm/memory.h> #include <asm/mmu_context.h> #include <asm/smp_plat.h> @@ -41,6 +45,8 @@ void notrace __cpu_suspend_exit(void) { unsigned int cpu = smp_processor_id(); + mte_suspend_exit(); + /* * We are resuming from reset with the idmap active in TTBR0_EL1. * We must uninstall the idmap and restore the expected MMU @@ -50,14 +56,15 @@ void notrace __cpu_suspend_exit(void) /* Restore CnP bit in TTBR1_EL1 */ if (system_supports_cnp()) - cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); + cpu_enable_swapper_cnp(); /* * PSTATE was not saved over suspend/resume, re-enable any detected * features that might not have been set correctly. */ + if (alternative_has_cap_unlikely(ARM64_HAS_DIT)) + set_pstate_dit(1); __uaccess_enable_hw_pan(); - uao_thread_switch(current); /* * Restore HW breakpoint registers to sane values @@ -72,8 +79,12 @@ void notrace __cpu_suspend_exit(void) * have turned the mitigation on. If the user has forcefully * disabled it, make sure their wishes are obeyed. */ - if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) - arm64_set_ssbd_mitigation(false); + spectre_v4_enable_mitigation(NULL); + + sme_suspend_exit(); + + /* Restore additional feature-specific configuration */ + ptrauth_suspend_exit(); } /* @@ -88,21 +99,46 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) int ret = 0; unsigned long flags; struct sleep_stack_data state; + struct arm_cpuidle_irq_context context; + + /* + * Some portions of CPU state (e.g. PSTATE.{PAN,DIT}) are initialized + * before alternatives are patched, but are only restored by + * __cpu_suspend_exit() after alternatives are patched. To avoid + * accidentally losing these bits we must not attempt to suspend until + * after alternatives have been patched. 
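/*
 * Illustrative sketch (not part of the patch): how a caller uses the
 * cpu_suspend() interface shown above. The "finisher" is expected to hand
 * control to firmware and never return on success, so returning from it is
 * treated as a failure. demo_finisher()/demo_enter_lowpower() are
 * hypothetical.
 */
#include <linux/errno.h>
#include <linux/printk.h>
#include <asm/suspend.h>

static int demo_finisher(unsigned long arg)
{
	/* A real finisher would enter the low-power state here and not return. */
	return -EBUSY;
}

static void demo_enter_lowpower(void)
{
	int ret = cpu_suspend(0, demo_finisher);

	if (ret)
		pr_debug("low-power entry failed or was aborted: %d\n", ret);
}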
+ */ + WARN_ON(!system_capabilities_finalized()); + + /* Report any MTE async fault before going to suspend */ + mte_suspend_enter(); /* * From this point debug exceptions are disabled to prevent * updates to mdscr register (saved and restored along with * general purpose registers) from kernel debuggers. + * + * Strictly speaking the trace_hardirqs_off() here is superfluous, + * hardirqs should be firmly off by now. This really ought to use + * something like raw_local_daif_save(). */ flags = local_daif_save(); /* - * Function graph tracer state gets incosistent when the kernel + * Function graph tracer state gets inconsistent when the kernel * calls functions that never return (aka suspend finishers) hence * disable graph tracing during their execution. */ pause_graph_tracing(); + /* + * Switch to using DAIF.IF instead of PMR in order to reliably + * resume if we're using pseudo-NMIs. + */ + arm_cpuidle_save_irq_context(&context); + + ct_cpuidle_enter(); + if (__cpu_suspend_enter(&state)) { /* Call the suspend finisher */ ret = fn(arg); @@ -116,16 +152,21 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) */ if (!ret) ret = -EOPNOTSUPP; + + ct_cpuidle_exit(); } else { + ct_cpuidle_exit(); __cpu_suspend_exit(); } + arm_cpuidle_restore_irq_context(&context); + unpause_graph_tracing(); /* * Restore pstate flags. OS lock and mdscr have been already * restored, so from this point onwards, debugging is fully - * renabled if it was enabled when core started shutdown. + * reenabled if it was enabled when core started shutdown. */ local_daif_restore(flags); diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c index b44065fb1616..f08408b6e826 100644 --- a/arch/arm64/kernel/sys.c +++ b/arch/arm64/kernel/sys.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * AArch64-specific system calls implementation * * Copyright (C) 2012 ARM Ltd. * Author: Catalin Marinas <catalin.marinas@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/compiler.h> @@ -31,7 +20,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, - unsigned long, fd, off_t, off) + unsigned long, fd, unsigned long, off) { if (offset_in_page(off) != 0) return -EINVAL; @@ -47,22 +36,28 @@ SYSCALL_DEFINE1(arm64_personality, unsigned int, personality) return ksys_personality(personality); } +asmlinkage long sys_ni_syscall(void); + +asmlinkage long __arm64_sys_ni_syscall(const struct pt_regs *__unused) +{ + return sys_ni_syscall(); +} + /* * Wrappers to pass the pt_regs argument. 
*/ -#define sys_personality sys_arm64_personality +#define __arm64_sys_personality __arm64_sys_arm64_personality -asmlinkage long sys_ni_syscall(const struct pt_regs *); -#define __arm64_sys_ni_syscall sys_ni_syscall +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) #undef __SYSCALL #define __SYSCALL(nr, sym) asmlinkage long __arm64_##sym(const struct pt_regs *); -#include <asm/unistd.h> +#include <asm/syscall_table_64.h> #undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = (syscall_fn_t)__arm64_##sym, +#define __SYSCALL(nr, sym) [nr] = __arm64_##sym, const syscall_fn_t sys_call_table[__NR_syscalls] = { - [0 ... __NR_syscalls - 1] = (syscall_fn_t)sys_ni_syscall, -#include <asm/unistd.h> + [0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall, +#include <asm/syscall_table_64.h> }; diff --git a/arch/arm64/kernel/sys32.c b/arch/arm64/kernel/sys32.c index 0f8bcb7de700..96bcfb907443 100644 --- a/arch/arm64/kernel/sys32.c +++ b/arch/arm64/kernel/sys32.c @@ -1,32 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/kernel/sys32.c * * Copyright (C) 2015 ARM Ltd. - * - * This program is free software(void); you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http(void);//www.gnu.org/licenses/>. - */ - -/* - * Needed to avoid conflicting __NR_* macros between uapi/asm/unistd.h and - * asm/unistd32.h. */ -#define __COMPAT_SYSCALL_NR #include <linux/compat.h> #include <linux/compiler.h> #include <linux/syscalls.h> #include <asm/syscall.h> +#include <asm/unistd_compat_32.h> asmlinkage long compat_sys_sigreturn(void); asmlinkage long compat_sys_rt_sigreturn(void); @@ -133,17 +117,16 @@ COMPAT_SYSCALL_DEFINE6(aarch32_fallocate, int, fd, int, mode, return ksys_fallocate(fd, mode, arg_u64(offset), arg_u64(len)); } -asmlinkage long sys_ni_syscall(const struct pt_regs *); -#define __arm64_sys_ni_syscall sys_ni_syscall +#define __SYSCALL_WITH_COMPAT(nr, sym, compat) __SYSCALL(nr, compat) #undef __SYSCALL #define __SYSCALL(nr, sym) asmlinkage long __arm64_##sym(const struct pt_regs *); -#include <asm/unistd32.h> +#include <asm/syscall_table_32.h> #undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = (syscall_fn_t)__arm64_##sym, +#define __SYSCALL(nr, sym) [nr] = __arm64_##sym, -const syscall_fn_t compat_sys_call_table[__NR_compat_syscalls] = { - [0 ... __NR_compat_syscalls - 1] = (syscall_fn_t)sys_ni_syscall, -#include <asm/unistd32.h> +const syscall_fn_t compat_sys_call_table[__NR_compat32_syscalls] = { + [0 ... __NR_compat32_syscalls - 1] = __arm64_sys_ni_syscall, +#include <asm/syscall_table_32.h> }; diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index c832a5c24efc..4a609e9b65de 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -1,25 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/sys_arm.c * * Copyright (C) People who wrote linux/arch/i386/kernel/sys_i386.c * Copyright (C) 1995, 1996 Russell King. * Copyright (C) 2012 ARM Ltd. 
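/*
 * Illustrative sketch (not part of the patch): the shape of the pt_regs
 * wrappers that the __SYSCALL() table construction above refers to. Each
 * __arm64_sys_*() entry point takes only a pt_regs pointer and unpacks its
 * arguments from x0..x5; the real wrappers are generated by the
 * SYSCALL_DEFINE*() machinery. __arm64_sys_demo() and ksys_demo() are
 * hypothetical.
 */
#include <linux/linkage.h>
#include <linux/ptrace.h>

static long ksys_demo(unsigned long addr, unsigned int flags)
{
	return 0;	/* hypothetical body */
}

asmlinkage long __arm64_sys_demo(const struct pt_regs *regs)
{
	/* x0..x5 carry the syscall arguments; unpack what this call needs. */
	return ksys_demo(regs->regs[0], (unsigned int)regs->regs[1]);
}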
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/compat.h> -#include <linux/personality.h> +#include <linux/cpufeature.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/slab.h> @@ -28,6 +17,7 @@ #include <asm/cacheflush.h> #include <asm/system_misc.h> +#include <asm/tlbflush.h> #include <asm/unistd.h> static long @@ -41,7 +31,16 @@ __do_compat_cache_op(unsigned long start, unsigned long end) if (fatal_signal_pending(current)) return 0; - ret = __flush_cache_user_range(start, start + chunk); + if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) { + /* + * The workaround requires an inner-shareable tlbi. + * We pick the reserved-ASID to minimise the impact. + */ + __tlbi(aside1is, __TLBI_VADDR(0, 0)); + dsb(ish); + } + + ret = caches_clean_inval_user_pou(start, start + chunk); if (ret) return ret; @@ -68,7 +67,7 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags) */ long compat_arm_syscall(struct pt_regs *regs, int scno) { - void __user *addr; + unsigned long addr; switch (scno) { /* @@ -111,10 +110,9 @@ long compat_arm_syscall(struct pt_regs *regs, int scno) break; } - addr = (void __user *)instruction_pointer(regs) - - (compat_thumb_mode(regs) ? 2 : 4); + addr = instruction_pointer(regs) - (compat_thumb_mode(regs) ? 2 : 4); arm64_notify_die("Oops - bad compat syscall(2)", regs, - SIGILL, ILL_ILLTRP, addr, scno); + SIGILL, ILL_ILLTRP, addr, 0); return 0; } diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 5610ac01c1ec..c062badd1a56 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -5,27 +5,27 @@ #include <linux/errno.h> #include <linux/nospec.h> #include <linux/ptrace.h> +#include <linux/randomize_kstack.h> #include <linux/syscalls.h> -#include <asm/daifflags.h> +#include <asm/debug-monitors.h> +#include <asm/exception.h> #include <asm/fpsimd.h> #include <asm/syscall.h> #include <asm/thread_info.h> #include <asm/unistd.h> +#include <asm/unistd_compat_32.h> long compat_arm_syscall(struct pt_regs *regs, int scno); long sys_ni_syscall(void); static long do_ni_syscall(struct pt_regs *regs, int scno) { -#ifdef CONFIG_COMPAT - long ret; if (is_compat_task()) { - ret = compat_arm_syscall(regs, scno); + long ret = compat_arm_syscall(regs, scno); if (ret != -ENOSYS) return ret; } -#endif return sys_ni_syscall(); } @@ -41,7 +41,9 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, { long ret; - if (scno < sc_nr) { + add_random_kstack_offset(); + + if (likely(scno < sc_nr)) { syscall_fn_t syscall_fn; syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; ret = __invoke_syscall(regs, syscall_fn); @@ -49,7 +51,18 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, ret = do_ni_syscall(regs, scno); } - regs->regs[0] = ret; + syscall_set_return_value(current, regs, 0, ret); + + /* + * This value will get limited by KSTACK_OFFSET_MAX(), which is 10 + * bits. 
The actual entropy will be further reduced by the compiler + * when applying stack alignment constraints: the AAPCS mandates a + * 16-byte aligned SP at function boundaries, which will remove the + * 4 low bits from any entropy chosen here. + * + * The resulting 6 bits of entropy is seen in SP[9:4]. + */ + choose_random_kstack_offset(get_random_u16()); } static inline bool has_syscall_work(unsigned long flags) @@ -57,24 +70,60 @@ static inline bool has_syscall_work(unsigned long flags) return unlikely(flags & _TIF_SYSCALL_WORK); } -int syscall_trace_enter(struct pt_regs *regs); -void syscall_trace_exit(struct pt_regs *regs); - static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, const syscall_fn_t syscall_table[]) { - unsigned long flags = current_thread_info()->flags; + unsigned long flags = read_thread_flags(); regs->orig_x0 = regs->regs[0]; regs->syscallno = scno; - local_daif_restore(DAIF_PROCCTX); - user_exit(); + /* + * BTI note: + * The architecture does not guarantee that SPSR.BTYPE is zero + * on taking an SVC, so we could return to userspace with a + * non-zero BTYPE after the syscall. + * + * This shouldn't matter except when userspace is explicitly + * doing something stupid, such as setting PROT_BTI on a page + * that lacks conforming BTI/PACIxSP instructions, falling + * through from one executable page to another with differing + * PROT_BTI, or messing with BTYPE via ptrace: in such cases, + * userspace should not be surprised if a SIGILL occurs on + * syscall return. + * + * So, don't touch regs->pstate & PSR_BTYPE_MASK here. + * (Similarly for HVC and SMC elsewhere.) + */ + + if (unlikely(flags & _TIF_MTE_ASYNC_FAULT)) { + /* + * Process the asynchronous tag check fault before the actual + * syscall. do_notify_resume() will send a signal to userspace + * before the syscall is restarted. + */ + syscall_set_return_value(current, regs, -ERESTARTNOINTR, 0); + return; + } if (has_syscall_work(flags)) { - /* set default errno for user-issued syscall(-1) */ + /* + * The de-facto standard way to skip a system call using ptrace + * is to set the system call to -1 (NO_SYSCALL) and set x0 to a + * suitable error code for consumption by userspace. However, + * this cannot be distinguished from a user-issued syscall(-1) + * and so we must set x0 to -ENOSYS here in case the tracer doesn't + * issue the skip and we fall into trace_exit with x0 preserved. + * + * This is slightly odd because it also means that if a tracer + * sets the system call number to -1 but does not initialise x0, + * then x0 will be preserved for all system calls apart from a + * user-issued syscall(-1). However, requesting a skip and not + * setting the return value is unlikely to do anything sensible + * anyway. + */ if (scno == NO_SYSCALL) - regs->regs[0] = -ENOSYS; + syscall_set_return_value(current, regs, -ENOSYS, 0); scno = syscall_trace_enter(regs); if (scno == NO_SYSCALL) goto trace_exit; @@ -88,51 +137,24 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, * exit regardless, as the old entry assembly did. */ if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - local_daif_mask(); - flags = current_thread_info()->flags; - if (!has_syscall_work(flags)) { - /* - * We're off to userspace, where interrupts are - * always enabled after we restore the flags from - * the SPSR. 
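/*
 * Illustrative sketch (not part of the patch): the entropy arithmetic from
 * the choose_random_kstack_offset() comment above. KSTACK_OFFSET_MAX() keeps
 * 10 bits of the random value, and the AAPCS 16-byte SP alignment discards
 * bits [3:0], leaving roughly 6 bits visible in SP[9:4]. demo_offset() is
 * hypothetical; the masks are written out literally for clarity.
 */
#include <linux/types.h>

static unsigned long demo_offset(u16 rnd)
{
	unsigned long offset = rnd & 0x3ff;	/* keep 10 bits, as KSTACK_OFFSET_MAX() does */

	return offset & ~0xfUL;			/* 16-byte SP alignment drops bits [3:0] */
}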
- */ - trace_hardirqs_on(); + flags = read_thread_flags(); + if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP)) return; - } - local_daif_restore(DAIF_PROCCTX); } trace_exit: syscall_trace_exit(regs); } -static inline void sve_user_discard(void) -{ - if (!system_supports_sve()) - return; - - clear_thread_flag(TIF_SVE); - - /* - * task_fpsimd_load() won't be called to update CPACR_EL1 in - * ret_to_user unless TIF_FOREIGN_FPSTATE is still set, which only - * happens if a context switch or kernel_neon_begin() or context - * modification (sigreturn, ptrace) intervenes. - * So, ensure that CPACR_EL1 is already correct for the fast-path case. - */ - sve_user_disable(); -} - -asmlinkage void el0_svc_handler(struct pt_regs *regs) +void do_el0_svc(struct pt_regs *regs) { - sve_user_discard(); el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table); } #ifdef CONFIG_COMPAT -asmlinkage void el0_svc_compat_handler(struct pt_regs *regs) +void do_el0_svc_compat(struct pt_regs *regs) { - el0_svc_common(regs, regs->regs[7], __NR_compat_syscalls, + el0_svc_common(regs, regs->regs[7], __NR_compat32_syscalls, compat_sys_call_table); } #endif diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index a777ae90044d..b5855eb7435d 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -1,21 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/time.c * * Copyright (C) 1991, 1992, 1995 Linus Torvalds * Modifications for ARM (C) 1994-2001 Russell King * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <linux/clockchips.h> @@ -29,38 +18,37 @@ #include <linux/timex.h> #include <linux/errno.h> #include <linux/profile.h> +#include <linux/stacktrace.h> #include <linux/syscore_ops.h> #include <linux/timer.h> #include <linux/irq.h> #include <linux/delay.h> #include <linux/clocksource.h> -#include <linux/clk-provider.h> +#include <linux/of_clk.h> #include <linux/acpi.h> #include <clocksource/arm_arch_timer.h> #include <asm/thread_info.h> -#include <asm/stacktrace.h> +#include <asm/paravirt.h> -unsigned long profile_pc(struct pt_regs *regs) +static bool profile_pc_cb(void *arg, unsigned long pc) { - struct stackframe frame; + unsigned long *prof_pc = arg; - if (!in_lock_functions(regs->pc)) - return regs->pc; + if (in_lock_functions(pc)) + return true; + *prof_pc = pc; + return false; +} - frame.fp = regs->regs[29]; - frame.pc = regs->pc; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = 0; -#endif - do { - int ret = unwind_frame(NULL, &frame); - if (ret < 0) - return 0; - } while (in_lock_functions(frame.pc)); +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long prof_pc = 0; - return frame.pc; + arch_stack_walk(profile_pc_cb, &prof_pc, current, regs); + + return prof_pc; } EXPORT_SYMBOL(profile_pc); @@ -79,4 +67,6 @@ void __init time_init(void) /* Calibrate the delay loop directly */ lpj_fine = arch_timer_rate / HZ; + + pv_time_init(); } diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 0825c4a856e3..5d24dc53799b 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -14,394 +14,394 @@ #include <linux/acpi.h> #include <linux/arch_topology.h> #include <linux/cacheinfo.h> -#include <linux/cpu.h> -#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/cpu_smt.h> #include <linux/init.h> #include <linux/percpu.h> -#include <linux/node.h> -#include <linux/nodemask.h> -#include <linux/of.h> -#include <linux/sched.h> -#include <linux/sched/topology.h> -#include <linux/slab.h> -#include <linux/smp.h> -#include <linux/string.h> +#include <linux/sched/isolation.h> +#include <linux/xarray.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/topology.h> -static int __init get_cpu_for_node(struct device_node *node) -{ - struct device_node *cpu_node; - int cpu; +#ifdef CONFIG_ARM64_AMU_EXTN +#define read_corecnt() read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0) +#define read_constcnt() read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0) +#else +#define read_corecnt() (0UL) +#define read_constcnt() (0UL) +#endif - cpu_node = of_parse_phandle(node, "cpu", 0); - if (!cpu_node) - return -1; +#undef pr_fmt +#define pr_fmt(fmt) "AMU: " fmt - cpu = of_cpu_node_to_id(cpu_node); - if (cpu >= 0) - topology_parse_cpu_capacity(cpu_node, cpu); - else - pr_crit("Unable to find CPU node for %pOF\n", cpu_node); +/* + * Ensure that amu_scale_freq_tick() will return SCHED_CAPACITY_SCALE until + * the CPU capacity and its associated frequency have been correctly + * initialized. 
+ */ +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT); +static cpumask_var_t amu_fie_cpus; - of_node_put(cpu_node); - return cpu; -} +struct amu_cntr_sample { + u64 arch_const_cycles_prev; + u64 arch_core_cycles_prev; + unsigned long last_scale_update; +}; -static int __init parse_core(struct device_node *core, int package_id, - int core_id) -{ - char name[10]; - bool leaf = true; - int i = 0; - int cpu; - struct device_node *t; - - do { - snprintf(name, sizeof(name), "thread%d", i); - t = of_get_child_by_name(core, name); - if (t) { - leaf = false; - cpu = get_cpu_for_node(t); - if (cpu >= 0) { - cpu_topology[cpu].package_id = package_id; - cpu_topology[cpu].core_id = core_id; - cpu_topology[cpu].thread_id = i; - } else { - pr_err("%pOF: Can't get CPU for thread\n", - t); - of_node_put(t); - return -EINVAL; - } - of_node_put(t); - } - i++; - } while (t); - - cpu = get_cpu_for_node(core); - if (cpu >= 0) { - if (!leaf) { - pr_err("%pOF: Core has both threads and CPU\n", - core); - return -EINVAL; - } +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples); - cpu_topology[cpu].package_id = package_id; - cpu_topology[cpu].core_id = core_id; - } else if (leaf) { - pr_err("%pOF: Can't get CPU for leaf core\n", core); - return -EINVAL; - } +void update_freq_counters_refs(void) +{ + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); - return 0; + amu_sample->arch_core_cycles_prev = read_corecnt(); + amu_sample->arch_const_cycles_prev = read_constcnt(); } -static int __init parse_cluster(struct device_node *cluster, int depth) +static inline bool freq_counters_valid(int cpu) { - char name[10]; - bool leaf = true; - bool has_cores = false; - struct device_node *c; - static int package_id __initdata; - int core_id = 0; - int i, ret; + struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); - /* - * First check for child clusters; we currently ignore any - * information about the nesting of clusters and present the - * scheduler with a flat list of them. 
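/*
 * Illustrative sketch (not part of the patch): the per-CPU sampling pattern
 * used for cpu_amu_samples above. Each CPU updates only its own slot via
 * this_cpu_ptr(), so no locking is needed as long as the update runs on the
 * CPU that owns the sample. demo_sample/demo_update() are hypothetical.
 */
#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/types.h>

struct demo_sample {
	u64 last_count;
	unsigned long last_update;
};

static DEFINE_PER_CPU(struct demo_sample, demo_samples);

static void demo_update(u64 count)
{
	struct demo_sample *s = this_cpu_ptr(&demo_samples);

	/* Caller must be pinned to this CPU (e.g. tick or IPI context). */
	s->last_count = count;
	s->last_update = jiffies;
}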
- */ - i = 0; - do { - snprintf(name, sizeof(name), "cluster%d", i); - c = of_get_child_by_name(cluster, name); - if (c) { - leaf = false; - ret = parse_cluster(c, depth + 1); - of_node_put(c); - if (ret != 0) - return ret; - } - i++; - } while (c); - - /* Now check for cores */ - i = 0; - do { - snprintf(name, sizeof(name), "core%d", i); - c = of_get_child_by_name(cluster, name); - if (c) { - has_cores = true; - - if (depth == 0) { - pr_err("%pOF: cpu-map children should be clusters\n", - c); - of_node_put(c); - return -EINVAL; - } + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) + return false; - if (leaf) { - ret = parse_core(c, package_id, core_id++); - } else { - pr_err("%pOF: Non-leaf cluster with core %s\n", - cluster, name); - ret = -EINVAL; - } - - of_node_put(c); - if (ret != 0) - return ret; - } - i++; - } while (c); - - if (leaf && !has_cores) - pr_warn("%pOF: empty cluster\n", cluster); + if (!cpu_has_amu_feat(cpu)) { + pr_debug("CPU%d: counters are not supported.\n", cpu); + return false; + } - if (leaf) - package_id++; + if (unlikely(!amu_sample->arch_const_cycles_prev || + !amu_sample->arch_core_cycles_prev)) { + pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); + return false; + } - return 0; + return true; } -static int __init parse_dt_topology(void) +void freq_inv_set_max_ratio(int cpu, u64 max_rate) { - struct device_node *cn, *map; - int ret = 0; - int cpu; + u64 ratio, ref_rate = arch_timer_get_rate(); - cn = of_find_node_by_path("/cpus"); - if (!cn) { - pr_err("No CPU information found in DT\n"); - return 0; + if (unlikely(!max_rate || !ref_rate)) { + WARN_ONCE(1, "CPU%d: invalid maximum or reference frequency.\n", + cpu); + return; } /* - * When topology is provided cpu-map is essentially a root - * cluster with restricted subnodes. + * Pre-compute the fixed ratio between the frequency of the constant + * reference counter and the maximum frequency of the CPU. + * + * ref_rate + * arch_max_freq_scale = ---------- * SCHED_CAPACITY_SCALE² + * max_rate + * + * We use a factor of 2 * SCHED_CAPACITY_SHIFT -> SCHED_CAPACITY_SCALE² + * in order to ensure a good resolution for arch_max_freq_scale for + * very low reference frequencies (down to the KHz range which should + * be unlikely). */ - map = of_get_child_by_name(cn, "cpu-map"); - if (!map) - goto out; + ratio = ref_rate << (2 * SCHED_CAPACITY_SHIFT); + ratio = div64_u64(ratio, max_rate); + if (!ratio) { + WARN_ONCE(1, "Reference frequency too low.\n"); + return; + } + + WRITE_ONCE(per_cpu(arch_max_freq_scale, cpu), (unsigned long)ratio); +} - ret = parse_cluster(map, 0); - if (ret != 0) - goto out_map; +static void amu_scale_freq_tick(void) +{ + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); + u64 prev_core_cnt, prev_const_cnt; + u64 core_cnt, const_cnt, scale; + + prev_const_cnt = amu_sample->arch_const_cycles_prev; + prev_core_cnt = amu_sample->arch_core_cycles_prev; - topology_normalize_cpu_scale(); + update_freq_counters_refs(); + + const_cnt = amu_sample->arch_const_cycles_prev; + core_cnt = amu_sample->arch_core_cycles_prev; /* - * Check that all cores are in the topology; the SMP code will - * only mark cores described in the DT as possible. 
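/*
 * Illustrative sketch (not part of the patch): the fixed-point ratio that
 * freq_inv_set_max_ratio() above pre-computes. With a hypothetical 25 MHz
 * reference counter and a 2 GHz maximum CPU frequency:
 *
 *   ratio = (ref_rate << (2 * SCHED_CAPACITY_SHIFT)) / max_rate
 *         = (25000000 << 20) / 2000000000
 *         = 13107   (i.e. ~0.0125 * SCHED_CAPACITY_SCALE^2)
 */
#include <linux/math64.h>
#include <linux/types.h>

static u64 demo_max_freq_scale(u64 ref_rate, u64 max_rate)
{
	return div64_u64(ref_rate << 20, max_rate);	/* 20 == 2 * SCHED_CAPACITY_SHIFT */
}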
+ * This should not happen unless the AMUs have been reset and the + * counter values have not been restored - unlikely */ - for_each_possible_cpu(cpu) - if (cpu_topology[cpu].package_id == -1) - ret = -EINVAL; - -out_map: - of_node_put(map); -out: - of_node_put(cn); - return ret; + if (unlikely(core_cnt <= prev_core_cnt || + const_cnt <= prev_const_cnt)) + return; + + /* + * /\core arch_max_freq_scale + * scale = ------- * -------------------- + * /\const SCHED_CAPACITY_SCALE + * + * See validate_cpu_freq_invariance_counters() for details on + * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. + */ + scale = core_cnt - prev_core_cnt; + scale *= this_cpu_read(arch_max_freq_scale); + scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, + const_cnt - prev_const_cnt); + + scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); + this_cpu_write(arch_freq_scale, (unsigned long)scale); + + amu_sample->last_scale_update = jiffies; } -/* - * cpu topology table - */ -struct cpu_topology cpu_topology[NR_CPUS]; -EXPORT_SYMBOL_GPL(cpu_topology); +static struct scale_freq_data amu_sfd = { + .source = SCALE_FREQ_SOURCE_ARCH, + .set_freq_scale = amu_scale_freq_tick, +}; -const struct cpumask *cpu_coregroup_mask(int cpu) +static __always_inline bool amu_fie_cpu_supported(unsigned int cpu) { - const cpumask_t *core_mask = cpumask_of_node(cpu_to_node(cpu)); + return cpumask_available(amu_fie_cpus) && + cpumask_test_cpu(cpu, amu_fie_cpus); +} - /* Find the smaller of NUMA, core or LLC siblings */ - if (cpumask_subset(&cpu_topology[cpu].core_sibling, core_mask)) { - /* not numa in package, lets use the package siblings */ - core_mask = &cpu_topology[cpu].core_sibling; - } - if (cpu_topology[cpu].llc_id != -1) { - if (cpumask_subset(&cpu_topology[cpu].llc_sibling, core_mask)) - core_mask = &cpu_topology[cpu].llc_sibling; - } +void arch_cpu_idle_enter(void) +{ + unsigned int cpu = smp_processor_id(); + + if (!amu_fie_cpu_supported(cpu)) + return; - return core_mask; + /* Kick in AMU update but only if one has not happened already */ + if (housekeeping_cpu(cpu, HK_TYPE_TICK) && + time_is_before_jiffies(per_cpu(cpu_amu_samples.last_scale_update, cpu))) + amu_scale_freq_tick(); } -static void update_siblings_masks(unsigned int cpuid) +#define AMU_SAMPLE_EXP_MS 20 + +int arch_freq_get_on_cpu(int cpu) { - struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; - int cpu; + struct amu_cntr_sample *amu_sample; + unsigned int start_cpu = cpu; + unsigned long last_update; + unsigned int freq = 0; + u64 scale; - /* update core and thread sibling masks */ - for_each_online_cpu(cpu) { - cpu_topo = &cpu_topology[cpu]; + if (!amu_fie_cpu_supported(cpu) || !arch_scale_freq_ref(cpu)) + return -EOPNOTSUPP; - if (cpuid_topo->llc_id == cpu_topo->llc_id) { - cpumask_set_cpu(cpu, &cpuid_topo->llc_sibling); - cpumask_set_cpu(cpuid, &cpu_topo->llc_sibling); - } + while (1) { + + amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + + last_update = amu_sample->last_scale_update; + + /* + * For those CPUs that are in full dynticks mode, or those that have + * not seen tick for a while, try an alternative source for the counters + * (and thus freq scale), if available, for given policy: this boils + * down to identifying an active cpu within the same freq domain, if any. 
+ */ + if (!housekeeping_cpu(cpu, HK_TYPE_TICK) || + time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) { + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + int ref_cpu; + + if (!policy) + return -EINVAL; + + if (!cpumask_intersects(policy->related_cpus, + housekeeping_cpumask(HK_TYPE_TICK))) { + cpufreq_cpu_put(policy); + return -EOPNOTSUPP; + } - if (cpuid_topo->package_id != cpu_topo->package_id) - continue; + for_each_cpu_wrap(ref_cpu, policy->cpus, cpu + 1) { + if (ref_cpu == start_cpu) { + /* Prevent verifying same CPU twice */ + ref_cpu = nr_cpu_ids; + break; + } + if (!idle_cpu(ref_cpu)) + break; + } - cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); - cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); + cpufreq_cpu_put(policy); - if (cpuid_topo->core_id != cpu_topo->core_id) - continue; + if (ref_cpu >= nr_cpu_ids) + /* No alternative to pull info from */ + return -EAGAIN; - cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling); - cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling); + cpu = ref_cpu; + } else { + break; + } } + /* + * Reversed computation to the one used to determine + * the arch_freq_scale value + * (see amu_scale_freq_tick for details) + */ + scale = arch_scale_freq_capacity(cpu); + freq = scale * arch_scale_freq_ref(cpu); + freq >>= SCHED_CAPACITY_SHIFT; + return freq; } -void store_cpu_topology(unsigned int cpuid) +static void amu_fie_setup(const struct cpumask *cpus) { - struct cpu_topology *cpuid_topo = &cpu_topology[cpuid]; - u64 mpidr; + int cpu; - if (cpuid_topo->package_id != -1) - goto topology_populated; + /* We are already set since the last insmod of cpufreq driver */ + if (cpumask_available(amu_fie_cpus) && + unlikely(cpumask_subset(cpus, amu_fie_cpus))) + return; - mpidr = read_cpuid_mpidr(); + for_each_cpu(cpu, cpus) + if (!freq_counters_valid(cpu)) + return; - /* Uniprocessor systems can rely on default topology values */ - if (mpidr & MPIDR_UP_BITMASK) + if (!cpumask_available(amu_fie_cpus) && + !zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { + WARN_ONCE(1, "Failed to allocate FIE cpumask for CPUs[%*pbl]\n", + cpumask_pr_args(cpus)); return; - - /* Create cpu topology mapping based on MPIDR. 
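/*
 * Illustrative sketch (not part of the patch): the per-tick arithmetic used
 * by amu_scale_freq_tick() and the reversed step in arch_freq_get_on_cpu()
 * above, with hypothetical numbers. Assume the 13107 ratio from a 25 MHz
 * reference / 2 GHz maximum, a 4 ms tick, and a CPU actually running at
 * 1 GHz:
 *
 *   delta_const = 100000, delta_core = 4000000
 *   scale = ((delta_core * 13107) >> SCHED_CAPACITY_SHIFT) / delta_const
 *         = 511          (~ 1 GHz / 2 GHz * SCHED_CAPACITY_SCALE)
 *   freq  = (scale * 2000000 kHz) >> SCHED_CAPACITY_SHIFT
 *         = 998046 kHz   (~ 1 GHz, rounding down)
 */
#include <linux/math64.h>
#include <linux/types.h>

static u64 demo_scale(u64 delta_core, u64 delta_const, u64 max_freq_scale)
{
	return div64_u64((delta_core * max_freq_scale) >> 10, delta_const);
}

static u64 demo_freq_khz(u64 scale, u64 ref_freq_khz)
{
	return (scale * ref_freq_khz) >> 10;	/* 10 == SCHED_CAPACITY_SHIFT */
}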
*/ - if (mpidr & MPIDR_MT_BITMASK) { - /* Multiprocessor system : Multi-threads per core */ - cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1); - cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) | - MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8; - } else { - /* Multiprocessor system : Single-thread per core */ - cpuid_topo->thread_id = -1; - cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) | - MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 | - MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16; } - pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n", - cpuid, cpuid_topo->package_id, cpuid_topo->core_id, - cpuid_topo->thread_id, mpidr); + cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); -topology_populated: - update_siblings_masks(cpuid); + topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); + + pr_debug("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(cpus)); } -static void clear_cpu_topology(int cpu) +static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, + void *data) { - struct cpu_topology *cpu_topo = &cpu_topology[cpu]; + struct cpufreq_policy *policy = data; - cpumask_clear(&cpu_topo->llc_sibling); - cpumask_set_cpu(cpu, &cpu_topo->llc_sibling); + if (val == CPUFREQ_CREATE_POLICY) + amu_fie_setup(policy->related_cpus); - cpumask_clear(&cpu_topo->core_sibling); - cpumask_set_cpu(cpu, &cpu_topo->core_sibling); - cpumask_clear(&cpu_topo->thread_sibling); - cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); + /* + * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU + * counters don't have any dependency on cpufreq driver once we have + * initialized AMU support and enabled invariance. The AMU counters will + * keep on working just fine in the absence of the cpufreq driver, and + * for the CPUs for which there are no counters available, the last set + * value of arch_freq_scale will remain valid as that is the frequency + * those CPUs are running at. + */ + + return 0; } -static void __init reset_cpu_topology(void) +static struct notifier_block init_amu_fie_notifier = { + .notifier_call = init_amu_fie_callback, +}; + +static int __init init_amu_fie(void) { - unsigned int cpu; + return cpufreq_register_notifier(&init_amu_fie_notifier, + CPUFREQ_POLICY_NOTIFIER); +} +core_initcall(init_amu_fie); - for_each_possible_cpu(cpu) { - struct cpu_topology *cpu_topo = &cpu_topology[cpu]; +#ifdef CONFIG_ACPI_CPPC_LIB +#include <acpi/cppc_acpi.h> - cpu_topo->thread_id = -1; - cpu_topo->core_id = 0; - cpu_topo->package_id = -1; - cpu_topo->llc_id = -1; +static void cpu_read_corecnt(void *val) +{ + /* + * A value of 0 can be returned if the current CPU does not support AMUs + * or if the counter is disabled for this CPU. A return value of 0 at + * counter read is properly handled as an error case by the users of the + * counter. + */ + *(u64 *)val = read_corecnt(); +} - clear_cpu_topology(cpu); - } +static void cpu_read_constcnt(void *val) +{ + /* + * Return 0 if the current CPU is affected by erratum 2457168. A value + * of 0 is also returned if the current CPU does not support AMUs or if + * the counter is disabled. A return value of 0 at counter read is + * properly handled as an error case by the users of the counter. + */ + *(u64 *)val = this_cpu_has_cap(ARM64_WORKAROUND_2457168) ? 
+ 0UL : read_constcnt(); } -void remove_cpu_topology(unsigned int cpu) +static inline +int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val) { - int sibling; + /* + * Abort call on counterless CPU or when interrupts are + * disabled - can lead to deadlock in smp sync call. + */ + if (!cpu_has_amu_feat(cpu)) + return -EOPNOTSUPP; + + if (WARN_ON_ONCE(irqs_disabled())) + return -EPERM; - for_each_cpu(sibling, topology_core_cpumask(cpu)) - cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); - for_each_cpu(sibling, topology_sibling_cpumask(cpu)) - cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); - for_each_cpu(sibling, topology_llc_cpumask(cpu)) - cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling)); + smp_call_function_single(cpu, func, val, 1); - clear_cpu_topology(cpu); + return 0; } -#ifdef CONFIG_ACPI /* - * Propagate the topology information of the processor_topology_node tree to the - * cpu_topology array. + * Refer to drivers/acpi/cppc_acpi.c for the description of the functions + * below. */ -static int __init parse_acpi_topology(void) +bool cpc_ffh_supported(void) { - bool is_threaded; - int cpu, topology_id; - - is_threaded = read_cpuid_mpidr() & MPIDR_MT_BITMASK; + int cpu = get_cpu_with_amu_feat(); - for_each_possible_cpu(cpu) { - int i, cache_id; + /* + * FFH is considered supported if there is at least one present CPU that + * supports AMUs. Using FFH to read core and reference counters for CPUs + * that do not support AMUs, have counters disabled or that are affected + * by errata, will result in a return value of 0. + * + * This is done to allow any enabled and valid counters to be read + * through FFH, knowing that potentially returning 0 as counter value is + * properly handled by the users of these counters. + */ + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) + return false; - topology_id = find_acpi_cpu_topology(cpu, 0); - if (topology_id < 0) - return topology_id; + return true; +} - if (is_threaded) { - cpu_topology[cpu].thread_id = topology_id; - topology_id = find_acpi_cpu_topology(cpu, 1); - cpu_topology[cpu].core_id = topology_id; - } else { - cpu_topology[cpu].thread_id = -1; - cpu_topology[cpu].core_id = topology_id; - } - topology_id = find_acpi_cpu_topology_package(cpu); - cpu_topology[cpu].package_id = topology_id; - - i = acpi_find_last_cache_level(cpu); - - if (i > 0) { - /* - * this is the only part of cpu_topology that has - * a direct relationship with the cache topology - */ - cache_id = find_acpi_cpu_cache_topology(cpu, i); - if (cache_id > 0) - cpu_topology[cpu].llc_id = cache_id; - } +int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val) +{ + int ret = -EOPNOTSUPP; + + switch ((u64)reg->address) { + case 0x0: + ret = counters_read_on_cpu(cpu, cpu_read_corecnt, val); + break; + case 0x1: + ret = counters_read_on_cpu(cpu, cpu_read_constcnt, val); + break; } - return 0; -} + if (!ret) { + *val &= GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + *val >>= reg->bit_offset; + } -#else -static inline int __init parse_acpi_topology(void) -{ - return -EINVAL; + return ret; } -#endif -void __init init_cpu_topology(void) +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) { - reset_cpu_topology(); - - /* - * Discard anything that was parsed if we hit an error so we - * don't use partial information. 
- */ - if (!acpi_disabled && parse_acpi_topology()) - reset_cpu_topology(); - else if (of_have_populated_dt() && parse_dt_topology()) - reset_cpu_topology(); + return -EOPNOTSUPP; } +#endif /* CONFIG_ACPI_CPPC_LIB */ diff --git a/arch/arm64/kernel/trace-events-emulation.h b/arch/arm64/kernel/trace-events-emulation.h index 6c40f58b844a..c51b547b583e 100644 --- a/arch/arm64/kernel/trace-events-emulation.h +++ b/arch/arm64/kernel/trace-events-emulation.h @@ -18,7 +18,7 @@ TRACE_EVENT(instruction_emulation, ), TP_fast_assign( - __assign_str(instr, instr); + __assign_str(instr); __entry->addr = addr; ), diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 4e2fb877f8d5..914282016069 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -1,26 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/traps.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/bug.h> +#include <linux/context_tracking.h> #include <linux/signal.h> -#include <linux/personality.h> #include <linux/kallsyms.h> +#include <linux/kprobes.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/hardirq.h> @@ -28,6 +18,7 @@ #include <linux/module.h> #include <linux/kexec.h> #include <linux/delay.h> +#include <linux/efi.h> #include <linux/init.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> @@ -36,138 +27,159 @@ #include <linux/syscalls.h> #include <linux/mm_types.h> #include <linux/kasan.h> +#include <linux/ubsan.h> +#include <linux/cfi.h> #include <asm/atomic.h> #include <asm/bug.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> +#include <asm/efi.h> #include <asm/esr.h> +#include <asm/exception.h> +#include <asm/extable.h> #include <asm/insn.h> +#include <asm/kprobes.h> +#include <asm/text-patching.h> #include <asm/traps.h> #include <asm/smp.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> -#include <asm/exception.h> #include <asm/system_misc.h> #include <asm/sysreg.h> -static const char *handler[]= { - "Synchronous Abort", - "IRQ", - "FIQ", - "Error" -}; +static bool __kprobes __check_eq(unsigned long pstate) +{ + return (pstate & PSR_Z_BIT) != 0; +} -int show_unhandled_signals = 0; +static bool __kprobes __check_ne(unsigned long pstate) +{ + return (pstate & PSR_Z_BIT) == 0; +} -static void dump_backtrace_entry(unsigned long where) +static bool __kprobes __check_cs(unsigned long pstate) { - printk(" %pS\n", (void *)where); + return (pstate & PSR_C_BIT) != 0; } -static void __dump_instr(const char *lvl, struct pt_regs *regs) +static bool __kprobes __check_cc(unsigned long pstate) { - unsigned long addr = instruction_pointer(regs); - char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; - int i; + return (pstate & PSR_C_BIT) == 0; +} - for (i = -4; i < 1; i++) { - unsigned int val, bad; +static bool __kprobes __check_mi(unsigned long pstate) +{ + return 
(pstate & PSR_N_BIT) != 0; +} - bad = get_user(val, &((u32 *)addr)[i]); +static bool __kprobes __check_pl(unsigned long pstate) +{ + return (pstate & PSR_N_BIT) == 0; +} - if (!bad) - p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); - else { - p += sprintf(p, "bad PC value"); - break; - } - } - printk("%sCode: %s\n", lvl, str); +static bool __kprobes __check_vs(unsigned long pstate) +{ + return (pstate & PSR_V_BIT) != 0; } -static void dump_instr(const char *lvl, struct pt_regs *regs) +static bool __kprobes __check_vc(unsigned long pstate) { - if (!user_mode(regs)) { - mm_segment_t fs = get_fs(); - set_fs(KERNEL_DS); - __dump_instr(lvl, regs); - set_fs(fs); - } else { - __dump_instr(lvl, regs); - } + return (pstate & PSR_V_BIT) == 0; } -void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static bool __kprobes __check_hi(unsigned long pstate) { - struct stackframe frame; - int skip; + pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ + return (pstate & PSR_C_BIT) != 0; +} - pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); +static bool __kprobes __check_ls(unsigned long pstate) +{ + pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ + return (pstate & PSR_C_BIT) == 0; +} - if (!tsk) - tsk = current; +static bool __kprobes __check_ge(unsigned long pstate) +{ + pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + return (pstate & PSR_N_BIT) == 0; +} - if (!try_get_task_stack(tsk)) - return; +static bool __kprobes __check_lt(unsigned long pstate) +{ + pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + return (pstate & PSR_N_BIT) != 0; +} - if (tsk == current) { - frame.fp = (unsigned long)__builtin_frame_address(0); - frame.pc = (unsigned long)dump_backtrace; - } else { - /* - * task blocked in __switch_to - */ - frame.fp = thread_saved_fp(tsk); - frame.pc = thread_saved_pc(tsk); - } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = 0; -#endif +static bool __kprobes __check_gt(unsigned long pstate) +{ + /*PSR_N_BIT ^= PSR_V_BIT */ + unsigned long temp = pstate ^ (pstate << 3); - skip = !!regs; - printk("Call trace:\n"); - do { - /* skip until specified stack frame */ - if (!skip) { - dump_backtrace_entry(frame.pc); - } else if (frame.fp == regs->regs[29]) { - skip = 0; - /* - * Mostly, this is the case where this function is - * called in panic/abort. As exception handler's - * stack frame does not contain the corresponding pc - * at which an exception has taken place, use regs->pc - * instead. - */ - dump_backtrace_entry(regs->pc); - } - } while (!unwind_frame(tsk, &frame)); + temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ + return (temp & PSR_N_BIT) == 0; +} + +static bool __kprobes __check_le(unsigned long pstate) +{ + /*PSR_N_BIT ^= PSR_V_BIT */ + unsigned long temp = pstate ^ (pstate << 3); - put_task_stack(tsk); + temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ + return (temp & PSR_N_BIT) != 0; } -void show_stack(struct task_struct *tsk, unsigned long *sp) +static bool __kprobes __check_al(unsigned long pstate) { - dump_backtrace(NULL, tsk); - barrier(); + return true; +} + +/* + * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that + * it behaves identically to 0b1110 ("al"). 
+ */ +pstate_check_t * const aarch32_opcode_cond_checks[16] = { + __check_eq, __check_ne, __check_cs, __check_cc, + __check_mi, __check_pl, __check_vs, __check_vc, + __check_hi, __check_ls, __check_ge, __check_lt, + __check_gt, __check_le, __check_al, __check_al +}; + +int show_unhandled_signals = 0; + +void dump_kernel_instr(unsigned long kaddr) +{ + char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; + int i; + + if (!is_ttbr1_addr(kaddr)) + return; + + for (i = -4; i < 1; i++) { + unsigned int val, bad; + + bad = aarch64_insn_read(&((u32 *)kaddr)[i], &val); + + if (!bad) + p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); + else + p += sprintf(p, i == 0 ? "(????????) " : "???????? "); + } + + printk(KERN_EMERG "Code: %s\n", str); } -#ifdef CONFIG_PREEMPT -#define S_PREEMPT " PREEMPT" -#else -#define S_PREEMPT "" -#endif #define S_SMP " SMP" -static int __die(const char *str, int err, struct pt_regs *regs) +static int __die(const char *str, long err, struct pt_regs *regs) { - struct task_struct *tsk = current; static int die_counter; int ret; + unsigned long addr = instruction_pointer(regs); - pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n", + pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n", str, err, ++die_counter); /* trap and error numbers are mostly meaningless on ARM */ @@ -176,15 +188,12 @@ static int __die(const char *str, int err, struct pt_regs *regs) return ret; print_modules(); - __show_regs(regs); - pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", - TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), - end_of_stack(tsk)); + show_regs(regs); - if (!user_mode(regs)) { - dump_backtrace(regs, tsk); - dump_instr(KERN_EMERG, regs); - } + if (user_mode(regs)) + return ret; + + dump_kernel_instr(addr); return ret; } @@ -194,7 +203,7 @@ static DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. 
*/ -void die(const char *str, struct pt_regs *regs, int err) +void die(const char *str, struct pt_regs *regs, long err) { int ret; unsigned long flags; @@ -215,14 +224,14 @@ void die(const char *str, struct pt_regs *regs, int err) oops_exit(); if (in_interrupt()) - panic("Fatal exception in interrupt"); + panic("%s: Fatal exception in interrupt", str); if (panic_on_oops) - panic("Fatal exception"); + panic("%s: Fatal exception", str); raw_spin_unlock_irqrestore(&die_lock, flags); if (ret != NOTIFY_STOP) - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } static void arm64_show_signal(int signo, const char *str) @@ -230,7 +239,7 @@ static void arm64_show_signal(int signo, const char *str) static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct task_struct *tsk = current; - unsigned int esr = tsk->thread.fault_code; + unsigned long esr = tsk->thread.fault_code; struct pt_regs *regs = task_pt_regs(tsk); /* Leave if the signal won't be shown */ @@ -241,7 +250,7 @@ static void arm64_show_signal(int signo, const char *str) pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk)); if (esr) - pr_cont("%s, ESR 0x%08x, ", esr_get_class_string(esr), esr); + pr_cont("%s, ESR 0x%016lx, ", esr_get_class_string(esr), esr); pr_cont("%s", str); print_vma_addr(KERN_CONT " in ", regs->pc); @@ -249,99 +258,139 @@ static void arm64_show_signal(int signo, const char *str) __show_regs(regs); } -void arm64_force_sig_fault(int signo, int code, void __user *addr, +void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str) { arm64_show_signal(signo, str); - force_sig_fault(signo, code, addr, current); + if (signo == SIGKILL) + force_sig(SIGKILL); + else + force_sig_fault(signo, code, (void __user *)far); +} + +void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey) +{ + arm64_show_signal(SIGSEGV, str); + force_sig_pkuerr((void __user *)far, pkey); } -void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, +void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { arm64_show_signal(SIGBUS, str); - force_sig_mceerr(code, addr, lsb, current); + force_sig_mceerr(code, (void __user *)far, lsb); } -void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, +void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str) { arm64_show_signal(SIGTRAP, str); - force_sig_ptrace_errno_trap(errno, addr); + force_sig_ptrace_errno_trap(errno, (void __user *)far); } void arm64_notify_die(const char *str, struct pt_regs *regs, - int signo, int sicode, void __user *addr, - int err) + int signo, int sicode, unsigned long far, + unsigned long err) { if (user_mode(regs)) { WARN_ON(regs != current_pt_regs()); current->thread.fault_address = 0; current->thread.fault_code = err; - arm64_force_sig_fault(signo, sicode, addr, str); + arm64_force_sig_fault(signo, sicode, far, str); } else { die(str, regs, err); } } -void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) +#ifdef CONFIG_COMPAT +#define PSTATE_IT_1_0_SHIFT 25 +#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) +#define PSTATE_IT_7_2_SHIFT 10 +#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) + +static u32 compat_get_it_state(struct pt_regs *regs) { - regs->pc += size; + u32 it, pstate = regs->pstate; - /* - * If we were single stepping, we want to get the step exception after - * we return from the trap. 
- */ - if (user_mode(regs)) - user_fastforward_single_step(current); + it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; + it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; + + return it; } -static LIST_HEAD(undef_hook); -static DEFINE_RAW_SPINLOCK(undef_lock); +static void compat_set_it_state(struct pt_regs *regs, u32 it) +{ + u32 pstate_it; + + pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; + pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; + + regs->pstate &= ~PSR_AA32_IT_MASK; + regs->pstate |= pstate_it; +} -void register_undef_hook(struct undef_hook *hook) +static void advance_itstate(struct pt_regs *regs) { - unsigned long flags; + u32 it; - raw_spin_lock_irqsave(&undef_lock, flags); - list_add(&hook->node, &undef_hook); - raw_spin_unlock_irqrestore(&undef_lock, flags); + /* ARM mode */ + if (!(regs->pstate & PSR_AA32_T_BIT) || + !(regs->pstate & PSR_AA32_IT_MASK)) + return; + + it = compat_get_it_state(regs); + + /* + * If this is the last instruction of the block, wipe the IT + * state. Otherwise advance it. + */ + if (!(it & 7)) + it = 0; + else + it = (it & 0xe0) | ((it << 1) & 0x1f); + + compat_set_it_state(regs, it); } +#else +static void advance_itstate(struct pt_regs *regs) +{ +} +#endif -void unregister_undef_hook(struct undef_hook *hook) +void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) { - unsigned long flags; + regs->pc += size; + + /* + * If we were single stepping, we want to get the step exception after + * we return from the trap. + */ + if (user_mode(regs)) + user_fastforward_single_step(current); - raw_spin_lock_irqsave(&undef_lock, flags); - list_del(&hook->node); - raw_spin_unlock_irqrestore(&undef_lock, flags); + if (compat_user_mode(regs)) + advance_itstate(regs); + else + regs->pstate &= ~PSR_BTYPE_MASK; } -static int call_undef_hook(struct pt_regs *regs) +static int user_insn_read(struct pt_regs *regs, u32 *insnp) { - struct undef_hook *hook; - unsigned long flags; u32 instr; - int (*fn)(struct pt_regs *regs, u32 instr) = NULL; - void __user *pc = (void __user *)instruction_pointer(regs); + unsigned long pc = instruction_pointer(regs); - if (!user_mode(regs)) { - __le32 instr_le; - if (probe_kernel_address((__force __le32 *)pc, instr_le)) - goto exit; - instr = le32_to_cpu(instr_le); - } else if (compat_thumb_mode(regs)) { + if (compat_thumb_mode(regs)) { /* 16-bit Thumb instruction */ __le16 instr_le; if (get_user(instr_le, (__le16 __user *)pc)) - goto exit; + return -EFAULT; instr = le16_to_cpu(instr_le); if (aarch32_insn_is_wide(instr)) { u32 instr2; if (get_user(instr_le, (__le16 __user *)(pc + 2))) - goto exit; + return -EFAULT; instr2 = le16_to_cpu(instr_le); instr = (instr << 16) | instr2; } @@ -349,22 +398,15 @@ static int call_undef_hook(struct pt_regs *regs) /* 32-bit ARM instruction */ __le32 instr_le; if (get_user(instr_le, (__le32 __user *)pc)) - goto exit; + return -EFAULT; instr = le32_to_cpu(instr_le); } - raw_spin_lock_irqsave(&undef_lock, flags); - list_for_each_entry(hook, &undef_hook, node) - if ((instr & hook->instr_mask) == hook->instr_val && - (regs->pstate & hook->pstate_mask) == hook->pstate_val) - fn = hook->fn; - - raw_spin_unlock_irqrestore(&undef_lock, flags); -exit: - return fn ? 
fn(regs, instr) : 1; + *insnp = instr; + return 0; } -void force_signal_inject(int signal, int code, unsigned long address) +void force_signal_inject(int signal, int code, unsigned long address, unsigned long err) { const char *desc; struct pt_regs *regs = current_pt_regs(); @@ -390,7 +432,7 @@ void force_signal_inject(int signal, int code, unsigned long address) signal = SIGKILL; } - arm64_notify_die(desc, regs, signal, code, (void __user *)address, 0); + arm64_notify_die(desc, regs, signal, code, address, err); } /* @@ -400,31 +442,109 @@ void arm64_notify_segfault(unsigned long addr) { int code; - down_read(¤t->mm->mmap_sem); - if (find_vma(current->mm, addr) == NULL) + mmap_read_lock(current->mm); + if (find_vma(current->mm, untagged_addr(addr)) == NULL) code = SEGV_MAPERR; else code = SEGV_ACCERR; - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); - force_signal_inject(SIGSEGV, code, addr); + force_signal_inject(SIGSEGV, code, addr, 0); } -asmlinkage void __exception do_undefinstr(struct pt_regs *regs) +void do_el0_undef(struct pt_regs *regs, unsigned long esr) { + u32 insn; + /* check for AArch32 breakpoint instructions */ - if (!aarch32_break_handler(regs)) + if (try_handle_aarch32_break(regs)) + return; + + if (user_insn_read(regs, &insn)) + goto out_err; + + if (try_emulate_mrs(regs, insn)) + return; + + if (try_emulate_armv8_deprecated(regs, insn)) return; - if (call_undef_hook(regs) == 0) +out_err: + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); +} + +void do_el1_undef(struct pt_regs *regs, unsigned long esr) +{ + u32 insn; + + if (aarch64_insn_read((void *)regs->pc, &insn)) + goto out_err; + + if (try_emulate_el1_ssbs(regs, insn)) + return; + +out_err: + die("Oops - Undefined instruction", regs, esr); +} + +void do_el0_bti(struct pt_regs *regs) +{ + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); +} + +void do_el1_bti(struct pt_regs *regs, unsigned long esr) +{ + if (efi_runtime_fixup_exception(regs, "BTI violation")) { + regs->pstate &= ~PSR_BTYPE_MASK; return; + } + die("Oops - BTI", regs, esr); +} + +void do_el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0); +} + +void do_el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + die("Oops - GCS", regs, esr); +} + +void do_el0_fpac(struct pt_regs *regs, unsigned long esr) +{ + force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); +} + +void do_el1_fpac(struct pt_regs *regs, unsigned long esr) +{ + /* + * Unexpected FPAC exception in the kernel: kill the task before it + * does any more harm. + */ + die("Oops - FPAC", regs, esr); +} + +void do_el0_mops(struct pt_regs *regs, unsigned long esr) +{ + arm64_mops_reset_regs(®s->user_regs, esr); - BUG_ON(!user_mode(regs)); - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + /* + * If single stepping then finish the step before executing the + * prologue instruction. 
+ */ + user_fastforward_single_step(current); +} + +void do_el1_mops(struct pt_regs *regs, unsigned long esr) +{ + arm64_mops_reset_regs(®s->user_regs, esr); + + kernel_fastforward_single_step(regs); } #define __user_cache_maint(insn, address, res) \ - if (address >= user_addr_max()) { \ + if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ } else { \ uaccess_ttbr0_enable(); \ @@ -432,25 +552,21 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs) "1: " insn ", %1\n" \ " mov %w0, #0\n" \ "2:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "3: mov %w0, %w2\n" \ - " b 2b\n" \ - " .popsection\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) \ : "=r" (res) \ - : "r" (address), "i" (-EFAULT)); \ + : "r" (address)); \ uaccess_ttbr0_disable(); \ } -static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) +static void user_cache_maint_handler(unsigned long esr, struct pt_regs *regs) { - unsigned long address; + unsigned long tagged_address, address; int rt = ESR_ELx_SYS64_ISS_RT(esr); int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; int ret = 0; - address = untagged_addr(pt_regs_read_reg(regs, rt)); + tagged_address = pt_regs_read_reg(regs, rt); + address = untagged_addr(tagged_address); switch (crm) { case ESR_ELx_SYS64_ISS_CRM_DC_CVAU: /* DC CVAU, gets promoted */ @@ -459,6 +575,9 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) case ESR_ELx_SYS64_ISS_CRM_DC_CVAC: /* DC CVAC, gets promoted */ __user_cache_maint("dc civac", address, ret); break; + case ESR_ELx_SYS64_ISS_CRM_DC_CVADP: /* DC CVADP */ + __user_cache_maint("sys 3, c7, c13, 1", address, ret); + break; case ESR_ELx_SYS64_ISS_CRM_DC_CVAP: /* DC CVAP */ __user_cache_maint("sys 3, c7, c12, 1", address, ret); break; @@ -469,43 +588,60 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) __user_cache_maint("ic ivau", address, ret); break; default: - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } if (ret) - arm64_notify_segfault(address); + arm64_notify_segfault(tagged_address); else arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void ctr_read_handler(unsigned int esr, struct pt_regs *regs) +static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); unsigned long val = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0); + if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) { + /* Hide DIC so that we can trap the unnecessary maintenance...*/ + val &= ~BIT(CTR_EL0_DIC_SHIFT); + + /* ... and fake IminLine to reduce the number of traps. 
*/ + val &= ~CTR_EL0_IminLine_MASK; + val |= (PAGE_SHIFT - 2) & CTR_EL0_IminLine_MASK; + } + pt_regs_write_reg(regs, rt, val); arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } -static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs) +static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_counter_get_cntvct()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_read_counter()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } -static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) +static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_get_rate()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_get_rate()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } -static void mrs_handler(unsigned int esr, struct pt_regs *regs) +static void mrs_handler(unsigned long esr, struct pt_regs *regs) { u32 sysreg, rt; @@ -513,21 +649,21 @@ static void mrs_handler(unsigned int esr, struct pt_regs *regs) sysreg = esr_sys64_to_sysreg(esr); if (do_emulate_mrs(regs, sysreg, rt) != 0) - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } -static void wfi_handler(unsigned int esr, struct pt_regs *regs) +static void wfi_handler(unsigned long esr, struct pt_regs *regs) { arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } struct sys64_hook { - unsigned int esr_mask; - unsigned int esr_val; - void (*handler)(unsigned int esr, struct pt_regs *regs); + unsigned long esr_mask; + unsigned long esr_val; + void (*handler)(unsigned long esr, struct pt_regs *regs); }; -static struct sys64_hook sys64_hooks[] = { +static const struct sys64_hook sys64_hooks[] = { { .esr_mask = ESR_ELx_SYS64_ISS_EL0_CACHE_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_EL0_CACHE_OP_VAL, @@ -546,6 +682,12 @@ static struct sys64_hook sys64_hooks[] = { .handler = cntvct_read_handler, }, { + /* Trap read access to CNTVCTSS_EL0 */ + .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, + .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCTSS, + .handler = cntvct_read_handler, + }, + { /* Trap read access to CNTFRQ_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTFRQ, @@ -566,35 +708,8 @@ static struct sys64_hook sys64_hooks[] = { {}, }; - #ifdef CONFIG_COMPAT -#define PSTATE_IT_1_0_SHIFT 25 -#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) -#define PSTATE_IT_7_2_SHIFT 10 -#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) - -static u32 compat_get_it_state(struct pt_regs *regs) -{ - u32 it, pstate = regs->pstate; - - it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; - it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; - - return it; -} - -static void compat_set_it_state(struct pt_regs *regs, u32 it) -{ - u32 pstate_it; - - pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; - pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; - - regs->pstate &= ~PSR_AA32_IT_MASK; - regs->pstate |= pstate_it; -} - -static bool 
cp15_cond_valid(unsigned int esr, struct pt_regs *regs) +static bool cp15_cond_valid(unsigned long esr, struct pt_regs *regs) { int cond; @@ -614,45 +729,15 @@ static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs) return aarch32_opcode_cond_checks[cond](regs->pstate); } -static void advance_itstate(struct pt_regs *regs) -{ - u32 it; - - /* ARM mode */ - if (!(regs->pstate & PSR_AA32_T_BIT) || - !(regs->pstate & PSR_AA32_IT_MASK)) - return; - - it = compat_get_it_state(regs); - - /* - * If this is the last instruction of the block, wipe the IT - * state. Otherwise advance it. - */ - if (!(it & 7)) - it = 0; - else - it = (it & 0xe0) | ((it << 1) & 0x1f); - - compat_set_it_state(regs, it); -} - -static void arm64_compat_skip_faulting_instruction(struct pt_regs *regs, - unsigned int sz) -{ - advance_itstate(regs); - arm64_skip_faulting_instruction(regs, sz); -} - -static void compat_cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) +static void compat_cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { int reg = (esr & ESR_ELx_CP15_32_ISS_RT_MASK) >> ESR_ELx_CP15_32_ISS_RT_SHIFT; pt_regs_write_reg(regs, reg, arch_timer_get_rate()); - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); } -static struct sys64_hook cp15_32_hooks[] = { +static const struct sys64_hook cp15_32_hooks[] = { { .esr_mask = ESR_ELx_CP15_32_ISS_SYS_MASK, .esr_val = ESR_ELx_CP15_32_ISS_SYS_CNTFRQ, @@ -661,36 +746,41 @@ static struct sys64_hook cp15_32_hooks[] = { {}, }; -static void compat_cntvct_read_handler(unsigned int esr, struct pt_regs *regs) +static void compat_cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = (esr & ESR_ELx_CP15_64_ISS_RT_MASK) >> ESR_ELx_CP15_64_ISS_RT_SHIFT; int rt2 = (esr & ESR_ELx_CP15_64_ISS_RT2_MASK) >> ESR_ELx_CP15_64_ISS_RT2_SHIFT; - u64 val = arch_counter_get_cntvct(); + u64 val = arch_timer_read_counter(); pt_regs_write_reg(regs, rt, lower_32_bits(val)); pt_regs_write_reg(regs, rt2, upper_32_bits(val)); - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); } -static struct sys64_hook cp15_64_hooks[] = { +static const struct sys64_hook cp15_64_hooks[] = { { .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK, .esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCT, .handler = compat_cntvct_read_handler, }, + { + .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK, + .esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCTSS, + .handler = compat_cntvct_read_handler, + }, {}, }; -asmlinkage void __exception do_cp15instr(unsigned int esr, struct pt_regs *regs) +void do_el0_cp15(unsigned long esr, struct pt_regs *regs) { - struct sys64_hook *hook, *hook_base; + const struct sys64_hook *hook, *hook_base; if (!cp15_cond_valid(esr, regs)) { /* * There is no T16 variant of a CP access, so we * always advance PC by 4 bytes. */ - arm64_compat_skip_faulting_instruction(regs, 4); + arm64_skip_faulting_instruction(regs, 4); return; } @@ -702,7 +792,7 @@ asmlinkage void __exception do_cp15instr(unsigned int esr, struct pt_regs *regs) hook_base = cp15_64_hooks; break; default: - do_undefinstr(regs); + do_el0_undef(regs, esr); return; } @@ -717,13 +807,13 @@ asmlinkage void __exception do_cp15instr(unsigned int esr, struct pt_regs *regs) * EL0. Fall back to our usual undefined instruction handler * so that we handle these consistently. 
*/ - do_undefinstr(regs); + do_el0_undef(regs, esr); } #endif -asmlinkage void __exception do_sysinstr(unsigned int esr, struct pt_regs *regs) +void do_el0_sys(unsigned long esr, struct pt_regs *regs) { - struct sys64_hook *hook; + const struct sys64_hook *hook; for (hook = sys64_hooks; hook->handler; hook++) if ((hook->esr_mask & esr) == hook->esr_val) { @@ -736,7 +826,7 @@ asmlinkage void __exception do_sysinstr(unsigned int esr, struct pt_regs *regs) * back to our usual undefined instruction handler so that we handle * these consistently. */ - do_undefinstr(regs); + do_el0_undef(regs, esr); } static const char *esr_class_str[] = { @@ -749,7 +839,9 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_CP14_LS] = "CP14 LDC/STC", [ESR_ELx_EC_FP_ASIMD] = "ASIMD", [ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS", + [ESR_ELx_EC_PAC] = "PAC", [ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC", + [ESR_ELx_EC_BTI] = "BTI", [ESR_ELx_EC_ILL] = "PSTATE.IL", [ESR_ELx_EC_SVC32] = "SVC (AArch32)", [ESR_ELx_EC_HVC32] = "HVC (AArch32)", @@ -759,6 +851,9 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_SMC64] = "SMC (AArch64)", [ESR_ELx_EC_SYS64] = "MSR/MRS (AArch64)", [ESR_ELx_EC_SVE] = "SVE", + [ESR_ELx_EC_ERET] = "ERET/ERETAA/ERETAB", + [ESR_ELx_EC_FPAC] = "FPAC", + [ESR_ELx_EC_SME] = "SME", [ESR_ELx_EC_IMP_DEF] = "EL3 IMP DEF", [ESR_ELx_EC_IABT_LOW] = "IABT (lower EL)", [ESR_ELx_EC_IABT_CUR] = "IABT (current EL)", @@ -766,8 +861,10 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_DABT_LOW] = "DABT (lower EL)", [ESR_ELx_EC_DABT_CUR] = "DABT (current EL)", [ESR_ELx_EC_SP_ALIGN] = "SP Alignment", + [ESR_ELx_EC_MOPS] = "MOPS", [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)", [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)", + [ESR_ELx_EC_GCS] = "Guarded Control Stack", [ESR_ELx_EC_SERROR] = "SError", [ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)", [ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)", @@ -780,34 +877,18 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_BRK64] = "BRK (AArch64)", }; -const char *esr_get_class_string(u32 esr) +const char *esr_get_class_string(unsigned long esr) { return esr_class_str[ESR_ELx_EC(esr)]; } /* - * bad_mode handles the impossible case in the exception vector. This is always - * fatal. - */ -asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr) -{ - console_verbose(); - - pr_crit("Bad mode in %s handler detected on CPU%d, code 0x%08x -- %s\n", - handler[reason], smp_processor_id(), esr, - esr_get_class_string(esr)); - - local_daif_mask(); - panic("bad mode"); -} - -/* * bad_el0_sync handles unexpected, but potentially recoverable synchronous - * exceptions taken from EL0. Unlike bad_mode, this returns. + * exceptions taken from EL0. 
*/ -asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) +void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr) { - void __user *pc = (void __user *)instruction_pointer(regs); + unsigned long pc = instruction_pointer(regs); current->thread.fault_address = 0; current->thread.fault_code = esr; @@ -816,48 +897,44 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) "Bad EL0 synchronous exception"); } -#ifdef CONFIG_VMAP_STACK - DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); -asmlinkage void handle_bad_stack(struct pt_regs *regs) +void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far) { unsigned long tsk_stk = (unsigned long)current->stack; unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); - unsigned int esr = read_sysreg(esr_el1); - unsigned long far = read_sysreg(far_el1); console_verbose(); pr_emerg("Insufficient stack space to handle exception!"); - pr_emerg("ESR: 0x%08x -- %s\n", esr, esr_get_class_string(esr)); + pr_emerg("ESR: 0x%016lx -- %s\n", esr, esr_get_class_string(esr)); pr_emerg("FAR: 0x%016lx\n", far); pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", tsk_stk, tsk_stk + THREAD_SIZE); pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", - irq_stk, irq_stk + THREAD_SIZE); + irq_stk, irq_stk + IRQ_STACK_SIZE); pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); __show_regs(regs); /* - * We use nmi_panic to limit the potential for recusive overflows, and + * We use nmi_panic to limit the potential for recursive overflows, and * to get a better stack trace. */ nmi_panic(NULL, "kernel stack overflow"); cpu_park_loop(); } -#endif -void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr) +void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr) { + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK); console_verbose(); - pr_crit("SError Interrupt on CPU%d, code 0x%08x -- %s\n", + pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n", smp_processor_id(), esr, esr_get_class_string(esr)); if (regs) __show_regs(regs); @@ -865,12 +942,11 @@ void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr) nmi_panic(regs, "Asynchronous SError Interrupt"); cpu_park_loop(); - unreachable(); } -bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) +bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr) { - u32 aet = arm64_ras_serror_get_severity(esr); + unsigned long aet = arm64_ras_serror_get_severity(esr); switch (aet) { case ESR_ELx_AET_CE: /* corrected error */ @@ -886,6 +962,10 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) /* * The CPU can't make progress. The exception may have * been imprecise. + * + * Neoverse-N1 #1349291 means a non-KVM SError reported as + * Unrecoverable should be treated as Uncontainable. We + * call arm64_serror_panic() in both cases. 
*/ return true; @@ -896,43 +976,19 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) } } -asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) +void do_serror(struct pt_regs *regs, unsigned long esr) { - nmi_enter(); - /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) arm64_serror_panic(regs, esr); - - nmi_exit(); -} - -void __pte_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pte %016lx.\n", file, line, val); -} - -void __pmd_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pmd %016lx.\n", file, line, val); -} - -void __pud_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pud %016lx.\n", file, line, val); -} - -void __pgd_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pgd %016lx.\n", file, line, val); } /* GENERIC_BUG traps */ - +#ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { /* - * bug_handler() only called for BRK #BUG_BRK_IMM. + * bug_brk_handler() only called for BRK #BUG_BRK_IMM. * So the answer is trivial -- any spurious instances with no * bug table entry will be rejected by report_bug() and passed * back to the debug-monitors code and handled as a fatal @@ -940,15 +996,13 @@ int is_valid_bugaddr(unsigned long addr) */ return 1; } +#endif -static int bug_handler(struct pt_regs *regs, unsigned int esr) +int bug_brk_handler(struct pt_regs *regs, unsigned long esr) { - if (user_mode(regs)) - return DBG_HOOK_ERROR; - switch (report_bug(regs->pc, regs)) { case BUG_TRAP_TYPE_BUG: - die("Oops - BUG", regs, 0); + die("Oops - BUG", regs, esr); break; case BUG_TRAP_TYPE_WARN: @@ -964,11 +1018,41 @@ static int bug_handler(struct pt_regs *regs, unsigned int esr) return DBG_HOOK_HANDLED; } -static struct break_hook bug_break_hook = { - .esr_val = 0xf2000000 | BUG_BRK_IMM, - .esr_mask = 0xffffffff, - .fn = bug_handler, -}; +#ifdef CONFIG_CFI +int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) +{ + unsigned long target; + u32 type; + + target = pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TARGET, esr)); + type = (u32)pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TYPE, esr)); + + switch (report_cfi_failure(regs, regs->pc, &target, type)) { + case BUG_TRAP_TYPE_BUG: + die("Oops - CFI", regs, esr); + break; + + case BUG_TRAP_TYPE_WARN: + break; + + default: + return DBG_HOOK_ERROR; + } + + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + return DBG_HOOK_HANDLED; +} +#endif /* CONFIG_CFI */ + +int reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr) +{ + pr_err("%s generated an invalid instruction at %pS!\n", + "Kernel text patching", + (void *)instruction_pointer(regs)); + + /* We cannot handle this */ + return DBG_HOOK_ERROR; +} #ifdef CONFIG_KASAN_SW_TAGS @@ -977,17 +1061,14 @@ static struct break_hook bug_break_hook = { #define KASAN_ESR_SIZE_MASK 0x0f #define KASAN_ESR_SIZE(esr) (1 << ((esr) & KASAN_ESR_SIZE_MASK)) -static int kasan_handler(struct pt_regs *regs, unsigned int esr) +int kasan_brk_handler(struct pt_regs *regs, unsigned long esr) { bool recover = esr & KASAN_ESR_RECOVER; bool write = esr & KASAN_ESR_WRITE; size_t size = KASAN_ESR_SIZE(esr); - u64 addr = regs->regs[0]; + void *addr = (void *)regs->regs[0]; u64 pc = regs->pc; - if (user_mode(regs)) - return DBG_HOOK_ERROR; - kasan_report(addr, size, write, pc); /* @@ -1005,42 +1086,18 @@ static int kasan_handler(struct pt_regs *regs, unsigned int esr) * This is 
something that might be fixed at some point in the future. */ if (!recover) - die("Oops - KASAN", regs, 0); + die("Oops - KASAN", regs, esr); /* If thread survives, skip over the brk instruction and continue: */ arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } - -#define KASAN_ESR_VAL (0xf2000000 | KASAN_BRK_IMM) -#define KASAN_ESR_MASK 0xffffff00 - -static struct break_hook kasan_break_hook = { - .esr_val = KASAN_ESR_VAL, - .esr_mask = KASAN_ESR_MASK, - .fn = kasan_handler, -}; #endif -/* - * Initial handler for AArch64 BRK exceptions - * This handler only used until debug_traps_init(). - */ -int __init early_brk64(unsigned long addr, unsigned int esr, - struct pt_regs *regs) +#ifdef CONFIG_UBSAN_TRAP +int ubsan_brk_handler(struct pt_regs *regs, unsigned long esr) { -#ifdef CONFIG_KASAN_SW_TAGS - if ((esr & KASAN_ESR_MASK) == KASAN_ESR_VAL) - return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; -#endif - return bug_handler(regs, esr) != DBG_HOOK_HANDLED; + die(report_ubsan_failure(esr & UBSAN_BRK_MASK), regs, esr); + return DBG_HOOK_HANDLED; } - -/* This registration must happen early, before debug_traps_init(). */ -void __init trap_init(void) -{ - register_break_hook(&bug_break_hook); -#ifdef CONFIG_KASAN_SW_TAGS - register_break_hook(&kasan_break_hook); #endif -} diff --git a/arch/arm64/kernel/vdso-wrap.S b/arch/arm64/kernel/vdso-wrap.S new file mode 100644 index 000000000000..c4b1990bf2be --- /dev/null +++ b/arch/arm64/kernel/vdso-wrap.S @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 ARM Limited + * + * Author: Will Deacon <will.deacon@arm.com> + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/assembler.h> +#include <asm/page.h> + + .globl vdso_start, vdso_end + .section .rodata + .balign PAGE_SIZE +vdso_start: + .incbin "arch/arm64/kernel/vdso/vdso.so" + .balign PAGE_SIZE +vdso_end: + + .previous + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 2d419006ad43..78ddf6bdecad 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * VDSO implementation for AArch64 and vector page setup for AArch32. + * VDSO implementations. * * Copyright (C) 2012 ARM Limited * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * * Author: Will Deacon <will.deacon@arm.com> */ @@ -29,228 +18,330 @@ #include <linux/sched.h> #include <linux/signal.h> #include <linux/slab.h> -#include <linux/timekeeper_internal.h> +#include <linux/vdso_datastore.h> #include <linux/vmalloc.h> +#include <vdso/datapage.h> +#include <vdso/helpers.h> +#include <vdso/vsyscall.h> #include <asm/cacheflush.h> #include <asm/signal32.h> #include <asm/vdso.h> -#include <asm/vdso_datapage.h> - -extern char vdso_start[], vdso_end[]; -static unsigned long vdso_pages __ro_after_init; - -/* - * The vDSO data page. 
- */ -static union { - struct vdso_data data; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; - -#ifdef CONFIG_COMPAT -/* - * Create and map the vectors page for AArch32 tasks. - */ -static struct page *vectors_page[1] __ro_after_init; - -static int __init alloc_vectors_page(void) -{ - extern char __kuser_helper_start[], __kuser_helper_end[]; - extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; - - int kuser_sz = __kuser_helper_end - __kuser_helper_start; - int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; - unsigned long vpage; - vpage = get_zeroed_page(GFP_ATOMIC); - - if (!vpage) - return -ENOMEM; - - /* kuser helpers */ - memcpy((void *)vpage + 0x1000 - kuser_sz, __kuser_helper_start, - kuser_sz); - - /* sigreturn code */ - memcpy((void *)vpage + AARCH32_KERN_SIGRET_CODE_OFFSET, - __aarch32_sigret_code_start, sigret_sz); - - flush_icache_range(vpage, vpage + PAGE_SIZE); - vectors_page[0] = virt_to_page(vpage); - - return 0; -} -arch_initcall(alloc_vectors_page); - -int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp) -{ - struct mm_struct *mm = current->mm; - unsigned long addr = AARCH32_VECTORS_BASE; - static const struct vm_special_mapping spec = { - .name = "[vectors]", - .pages = vectors_page, - - }; - void *ret; - - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; - current->mm->context.vdso = (void *)addr; - - /* Map vectors page at the high address. */ - ret = _install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC, - &spec); +enum vdso_abi { + VDSO_ABI_AA64, + VDSO_ABI_AA32, +}; - up_write(&mm->mmap_sem); +struct vdso_abi_info { + const char *name; + const char *vdso_code_start; + const char *vdso_code_end; + unsigned long vdso_pages; + /* Code Mapping */ + struct vm_special_mapping *cm; +}; - return PTR_ERR_OR_ZERO(ret); -} -#endif /* CONFIG_COMPAT */ +static struct vdso_abi_info vdso_info[] __ro_after_init = { + [VDSO_ABI_AA64] = { + .name = "vdso", + .vdso_code_start = vdso_start, + .vdso_code_end = vdso_end, + }, +#ifdef CONFIG_COMPAT_VDSO + [VDSO_ABI_AA32] = { + .name = "vdso32", + .vdso_code_start = vdso32_start, + .vdso_code_end = vdso32_end, + }, +#endif /* CONFIG_COMPAT_VDSO */ +}; static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - unsigned long vdso_size = vdso_end - vdso_start; - - if (vdso_size != new_size) - return -EINVAL; - current->mm->context.vdso = (void *)new_vma->vm_start; return 0; } -static struct vm_special_mapping vdso_spec[2] __ro_after_init = { - { - .name = "[vvar]", - }, - { - .name = "[vdso]", - .mremap = vdso_mremap, - }, -}; - -static int __init vdso_init(void) +static int __init __vdso_init(enum vdso_abi abi) { int i; struct page **vdso_pagelist; unsigned long pfn; - if (memcmp(vdso_start, "\177ELF", 4)) { + if (memcmp(vdso_info[abi].vdso_code_start, "\177ELF", 4)) { pr_err("vDSO is not a valid ELF object!\n"); return -EINVAL; } - vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; - pr_info("vdso: %ld pages (%ld code @ %p, %ld data @ %p)\n", - vdso_pages + 1, vdso_pages, vdso_start, 1L, vdso_data); + vdso_info[abi].vdso_pages = ( + vdso_info[abi].vdso_code_end - + vdso_info[abi].vdso_code_start) >> + PAGE_SHIFT; - /* Allocate the vDSO pagelist, plus a page for the data. 
*/ - vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *), + vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages, + sizeof(struct page *), GFP_KERNEL); if (vdso_pagelist == NULL) return -ENOMEM; - /* Grab the vDSO data page. */ - vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data)); - - /* Grab the vDSO code pages. */ - pfn = sym_to_pfn(vdso_start); + pfn = sym_to_pfn(vdso_info[abi].vdso_code_start); - for (i = 0; i < vdso_pages; i++) - vdso_pagelist[i + 1] = pfn_to_page(pfn + i); + for (i = 0; i < vdso_info[abi].vdso_pages; i++) + vdso_pagelist[i] = pfn_to_page(pfn + i); - vdso_spec[0].pages = &vdso_pagelist[0]; - vdso_spec[1].pages = &vdso_pagelist[1]; + vdso_info[abi].cm->pages = vdso_pagelist; return 0; } -arch_initcall(vdso_init); -int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) +static int __setup_additional_pages(enum vdso_abi abi, + struct mm_struct *mm, + struct linux_binprm *bprm, + int uses_interp) { - struct mm_struct *mm = current->mm; unsigned long vdso_base, vdso_text_len, vdso_mapping_len; + unsigned long gp_flags = 0; void *ret; - vdso_text_len = vdso_pages << PAGE_SHIFT; + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); + + vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VDSO_NR_PAGES * PAGE_SIZE; - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { ret = ERR_PTR(vdso_base); goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, - VM_READ|VM_MAYREAD, - &vdso_spec[0]); + + ret = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(ret)) goto up_fail; - vdso_base += PAGE_SIZE; + if (system_supports_bti_kernel()) + gp_flags = VM_ARM64_BTI; + + vdso_base += VDSO_NR_PAGES * PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &vdso_spec[1]); + VM_READ|VM_EXEC|gp_flags| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_SEALED_SYSMAP, + vdso_info[abi].cm); if (IS_ERR(ret)) goto up_fail; - - up_write(&mm->mmap_sem); return 0; up_fail: mm->context.vdso = NULL; - up_write(&mm->mmap_sem); return PTR_ERR(ret); } +#ifdef CONFIG_COMPAT /* - * Update the vDSO data page to keep in sync with kernel timekeeping. + * Create and map the vectors page for AArch32 tasks. 
*/ -void update_vsyscall(struct timekeeper *tk) +enum aarch32_map { + AA32_MAP_VECTORS, /* kuser helpers */ + AA32_MAP_SIGPAGE, + AA32_MAP_VDSO, +}; + +static struct page *aarch32_vectors_page __ro_after_init; +static struct page *aarch32_sig_page __ro_after_init; + +static int aarch32_sigpage_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + current->mm->context.sigpage = (void *)new_vma->vm_start; + + return 0; +} + +static struct vm_special_mapping aarch32_vdso_maps[] = { + [AA32_MAP_VECTORS] = { + .name = "[vectors]", /* ABI */ + .pages = &aarch32_vectors_page, + }, + [AA32_MAP_SIGPAGE] = { + .name = "[sigpage]", /* ABI */ + .pages = &aarch32_sig_page, + .mremap = aarch32_sigpage_mremap, + }, + [AA32_MAP_VDSO] = { + .name = "[vdso]", + .mremap = vdso_mremap, + }, +}; + +static int aarch32_alloc_kuser_vdso_page(void) +{ + extern char __kuser_helper_start[], __kuser_helper_end[]; + int kuser_sz = __kuser_helper_end - __kuser_helper_start; + unsigned long vdso_page; + + if (!IS_ENABLED(CONFIG_KUSER_HELPERS)) + return 0; + + vdso_page = get_zeroed_page(GFP_KERNEL); + if (!vdso_page) + return -ENOMEM; + + memcpy((void *)(vdso_page + 0x1000 - kuser_sz), __kuser_helper_start, + kuser_sz); + aarch32_vectors_page = virt_to_page((void *)vdso_page); + return 0; +} + +#define COMPAT_SIGPAGE_POISON_WORD 0xe7fddef1 +static int aarch32_alloc_sigpage(void) +{ + extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; + int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; + __le32 poison = cpu_to_le32(COMPAT_SIGPAGE_POISON_WORD); + void *sigpage; + + sigpage = (void *)__get_free_page(GFP_KERNEL); + if (!sigpage) + return -ENOMEM; + + memset32(sigpage, (__force u32)poison, PAGE_SIZE / sizeof(poison)); + memcpy(sigpage, __aarch32_sigret_code_start, sigret_sz); + aarch32_sig_page = virt_to_page(sigpage); + return 0; +} + +static int __init __aarch32_alloc_vdso_pages(void) +{ + + if (!IS_ENABLED(CONFIG_COMPAT_VDSO)) + return 0; + + vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO]; + + return __vdso_init(VDSO_ABI_AA32); +} + +static int __init aarch32_alloc_vdso_pages(void) +{ + int ret; + + ret = __aarch32_alloc_vdso_pages(); + if (ret) + return ret; + + ret = aarch32_alloc_sigpage(); + if (ret) + return ret; + + return aarch32_alloc_kuser_vdso_page(); +} +arch_initcall(aarch32_alloc_vdso_pages); + +static int aarch32_kuser_helpers_setup(struct mm_struct *mm) +{ + void *ret; + + if (!IS_ENABLED(CONFIG_KUSER_HELPERS)) + return 0; + + /* + * Avoid VM_MAYWRITE for compatibility with arch/arm/, where it's + * not safe to CoW the page containing the CPU exception vectors. 
+ */ + ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, + VM_READ | VM_EXEC | + VM_MAYREAD | VM_MAYEXEC | + VM_SEALED_SYSMAP, + &aarch32_vdso_maps[AA32_MAP_VECTORS]); + + return PTR_ERR_OR_ZERO(ret); +} + +static int aarch32_sigreturn_setup(struct mm_struct *mm) { - u32 use_syscall = !tk->tkr_mono.clock->archdata.vdso_direct; - - ++vdso_data->tb_seq_count; - smp_wmb(); - - vdso_data->use_syscall = use_syscall; - vdso_data->xtime_coarse_sec = tk->xtime_sec; - vdso_data->xtime_coarse_nsec = tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift; - vdso_data->wtm_clock_sec = tk->wall_to_monotonic.tv_sec; - vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; - - if (!use_syscall) { - /* tkr_mono.cycle_last == tkr_raw.cycle_last */ - vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; - vdso_data->raw_time_sec = tk->raw_sec; - vdso_data->raw_time_nsec = tk->tkr_raw.xtime_nsec; - vdso_data->xtime_clock_sec = tk->xtime_sec; - vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - vdso_data->cs_mono_mult = tk->tkr_mono.mult; - vdso_data->cs_raw_mult = tk->tkr_raw.mult; - /* tkr_mono.shift == tkr_raw.shift */ - vdso_data->cs_shift = tk->tkr_mono.shift; + unsigned long addr; + void *ret; + + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = ERR_PTR(addr); + goto out; } - smp_wmb(); - ++vdso_data->tb_seq_count; + /* + * VM_MAYWRITE is required to allow gdb to Copy-on-Write and + * set breakpoints. + */ + ret = _install_special_mapping(mm, addr, PAGE_SIZE, + VM_READ | VM_EXEC | VM_MAYREAD | + VM_MAYWRITE | VM_MAYEXEC | + VM_SEALED_SYSMAP, + &aarch32_vdso_maps[AA32_MAP_SIGPAGE]); + if (IS_ERR(ret)) + goto out; + + mm->context.sigpage = (void *)addr; + +out: + return PTR_ERR_OR_ZERO(ret); } -void update_vsyscall_tz(void) +int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; + struct mm_struct *mm = current->mm; + int ret; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + ret = aarch32_kuser_helpers_setup(mm); + if (ret) + goto out; + + if (IS_ENABLED(CONFIG_COMPAT_VDSO)) { + ret = __setup_additional_pages(VDSO_ABI_AA32, mm, bprm, + uses_interp); + if (ret) + goto out; + } + + ret = aarch32_sigreturn_setup(mm); +out: + mmap_write_unlock(mm); + return ret; +} +#endif /* CONFIG_COMPAT */ + +static struct vm_special_mapping aarch64_vdso_map __ro_after_init = { + .name = "[vdso]", + .mremap = vdso_mremap, +}; + +static int __init vdso_init(void) +{ + vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_map; + + return __vdso_init(VDSO_ABI_AA64); +} +arch_initcall(vdso_init); + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + struct mm_struct *mm = current->mm; + int ret; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + mmap_write_unlock(mm); + + return ret; } diff --git a/arch/arm64/kernel/vdso/.gitignore b/arch/arm64/kernel/vdso/.gitignore index f8b69d84238e..652e31d82582 100644 --- a/arch/arm64/kernel/vdso/.gitignore +++ b/arch/arm64/kernel/vdso/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vdso.lds diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index b215c712d897..7dec05dd33b7 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -6,33 +6,63 @@ # Heavily based on the vDSO Makefiles for other archs. 
# -obj-vdso := gettimeofday.o note.o sigreturn.o +# Include the generic Makefile to check the built vdso. +include $(srctree)/lib/vdso/Makefile.include + +obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) -ccflags-y := -shared -fno-common -fno-builtin -ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti -# Disable gcov profiling for VDSO code -GCOV_PROFILE := n +# -Bsymbolic has been added for consistency with arm, the compat vDSO and +# potential future proofing if we end up with internal calls to the exported +# routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so +# preparation in build-time C")). +ldflags-y := -shared -soname=linux-vdso.so.1 \ + -Bsymbolic --build-id=sha1 -n $(btildflags-y) -# Workaround for bare-metal (ELF) toolchains that neglect to pass -shared -# down to collect2, resulting in silent corruption of the vDSO image. -ccflags-y += -Wl,-shared +ifdef CONFIG_LD_ORPHAN_WARN + ldflags-y += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) +endif -obj-y += vdso.o -extra-y += vdso.lds -CPPFLAGS_vdso.lds += -P -C -U$(ARCH) +ldflags-y += -T + +ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 +ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + +# -Wmissing-prototypes and -Wmissing-declarations are removed from +# the CFLAGS to make possible to build the kernel with CONFIG_WERROR enabled. +CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ + $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) \ + $(GCC_PLUGINS_CFLAGS) \ + $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ + -Wmissing-prototypes -Wmissing-declarations + +CC_FLAGS_ADD_VDSO := -O2 -mcmodel=tiny -fasynchronous-unwind-tables + +CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_REMOVE_VDSO) +CFLAGS_REMOVE_vgetrandom.o = $(CC_FLAGS_REMOVE_VDSO) + +CFLAGS_vgettimeofday.o = $(CC_FLAGS_ADD_VDSO) +CFLAGS_vgetrandom.o = $(CC_FLAGS_ADD_VDSO) -# Force dependency (incbin is bad) -$(obj)/vdso.o : $(obj)/vdso.so +ifneq ($(c-gettimeofday-y),) + CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y) +endif + +ifneq ($(c-getrandom-y),) + CFLAGS_vgetrandom.o += -include $(c-getrandom-y) +endif + +targets += vdso.lds +CPPFLAGS_vdso.lds += -P -C -U$(ARCH) # Link rule for the .so file, .lds has to be first -$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) - $(call if_changed,vdsold) +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE + $(call if_changed,vdsold_and_vdso_check) # Strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -40,31 +70,13 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) # Generate VDSO offsets using helper script -gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +gen-vdsosym := $(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ -define cmd_vdsosym - $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ -endef + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) -# Assembly rules for the .S files -$(obj-vdso): %.o: %.S FORCE - $(call if_changed_dep,vdsoas) - # Actual build commands -quiet_cmd_vdsold = VDSOL $@ - cmd_vdsold = $(CC) $(c_flags) -Wl,-n -Wl,-T $^ -o $@ -quiet_cmd_vdsoas = VDSOA $@ - cmd_vdsoas = $(CC) $(a_flags) -c -o $@ $< - -# Install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - 
cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso.so: $(obj)/vdso.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso.so +quiet_cmd_vdsold_and_vdso_check = LD $@ + cmd_vdsold_and_vdso_check = $(cmd_ld); $(cmd_vdso_check) diff --git a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh index 0664acaf61ff..0387209d65b1 100755 --- a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh +++ b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh @@ -8,9 +8,9 @@ # Doing this inside the Makefile will break the $(filter-out) function, # causing Kbuild to rebuild the vdso-offsets header file every time. # -# Author: Will Deacon <will.deacon@arm.com +# Author: Will Deacon <will.deacon@arm.com> # LC_ALL=C sed -n -e 's/^00*/0/' -e \ -'s/^\([0-9a-fA-F]*\) . VDSO_\([a-zA-Z0-9_]*\)$/\#define vdso_offset_\2\t0x\1/p' +'s/^\([0-9a-fA-F]*\) . VDSO_\([a-zA-Z0-9_]*\)$/\#define vdso_offset_\2 0x\1/p' diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S deleted file mode 100644 index c39872a7b03c..000000000000 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Userspace implementations of gettimeofday() and friends. - * - * Copyright (C) 2012 ARM Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * Author: Will Deacon <will.deacon@arm.com> - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> -#include <asm/unistd.h> - -#define NSEC_PER_SEC_LO16 0xca00 -#define NSEC_PER_SEC_HI16 0x3b9a - -vdso_data .req x6 -seqcnt .req w7 -w_tmp .req w8 -x_tmp .req x8 - -/* - * Conventions for macro arguments: - * - An argument is write-only if its name starts with "res". - * - All other arguments are read-only, unless otherwise specified. - */ - - .macro seqcnt_acquire -9999: ldr seqcnt, [vdso_data, #VDSO_TB_SEQ_COUNT] - tbnz seqcnt, #0, 9999b - dmb ishld - .endm - - .macro seqcnt_check fail - dmb ishld - ldr w_tmp, [vdso_data, #VDSO_TB_SEQ_COUNT] - cmp w_tmp, seqcnt - b.ne \fail - .endm - - .macro syscall_check fail - ldr w_tmp, [vdso_data, #VDSO_USE_SYSCALL] - cbnz w_tmp, \fail - .endm - - .macro get_nsec_per_sec res - mov \res, #NSEC_PER_SEC_LO16 - movk \res, #NSEC_PER_SEC_HI16, lsl #16 - .endm - - /* - * Returns the clock delta, in nanoseconds left-shifted by the clock - * shift. - */ - .macro get_clock_shifted_nsec res, cycle_last, mult - /* Read the virtual counter. */ - isb - mrs x_tmp, cntvct_el0 - /* Calculate cycle delta and convert to ns. */ - sub \res, x_tmp, \cycle_last - /* We can only guarantee 56 bits of precision. */ - movn x_tmp, #0xff00, lsl #48 - and \res, x_tmp, \res - mul \res, \res, \mult - .endm - - /* - * Returns in res_{sec,nsec} the REALTIME timespec, based on the - * "wall time" (xtime) and the clock_mono delta. 
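The seqcnt_acquire and seqcnt_check macros in the assembly being deleted here implement a retry-based, lock-free read of the vDSO data page: spin while the sequence count is odd (an update is in progress), copy the fields, then retry if the count changed. A rough C sketch of the same protocol, using illustrative field names rather than the kernel's:

        #include <stdatomic.h>

        struct vdso_time_data {                 /* illustrative subset of the old vdso_data */
                unsigned int tb_seq_count;
                unsigned long long cs_cycle_last;
                unsigned int cs_mono_mult, cs_shift;
                long long xtime_clock_sec;
                unsigned long long xtime_clock_nsec;
        };

        static struct vdso_time_data read_snapshot(const struct vdso_time_data *vd)
        {
                struct vdso_time_data snap;
                unsigned int seq;

                do {
                        /* seqcnt_acquire: wait until no update is in flight (odd count). */
                        while ((seq = __atomic_load_n(&vd->tb_seq_count, __ATOMIC_ACQUIRE)) & 1)
                                ;
                        snap = *vd;
                        /* seqcnt_check: the asm issues "dmb ishld" before re-reading the count. */
                        atomic_thread_fence(memory_order_acquire);
                } while (__atomic_load_n(&vd->tb_seq_count, __ATOMIC_RELAXED) != seq);

                return snap;
        }

The writer side (the update_vsyscall() code removed further up) increments tb_seq_count before and after publishing new values, which is what makes the odd/even test work.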
- */ - .macro get_ts_realtime res_sec, res_nsec, \ - clock_nsec, xtime_sec, xtime_nsec, nsec_to_sec - add \res_nsec, \clock_nsec, \xtime_nsec - udiv x_tmp, \res_nsec, \nsec_to_sec - add \res_sec, \xtime_sec, x_tmp - msub \res_nsec, x_tmp, \nsec_to_sec, \res_nsec - .endm - - /* - * Returns in res_{sec,nsec} the timespec based on the clock_raw delta, - * used for CLOCK_MONOTONIC_RAW. - */ - .macro get_ts_clock_raw res_sec, res_nsec, clock_nsec, nsec_to_sec - udiv \res_sec, \clock_nsec, \nsec_to_sec - msub \res_nsec, \res_sec, \nsec_to_sec, \clock_nsec - .endm - - /* sec and nsec are modified in place. */ - .macro add_ts sec, nsec, ts_sec, ts_nsec, nsec_to_sec - /* Add timespec. */ - add \sec, \sec, \ts_sec - add \nsec, \nsec, \ts_nsec - - /* Normalise the new timespec. */ - cmp \nsec, \nsec_to_sec - b.lt 9999f - sub \nsec, \nsec, \nsec_to_sec - add \sec, \sec, #1 -9999: - cmp \nsec, #0 - b.ge 9998f - add \nsec, \nsec, \nsec_to_sec - sub \sec, \sec, #1 -9998: - .endm - - .macro clock_gettime_return, shift=0 - .if \shift == 1 - lsr x11, x11, x12 - .endif - stp x10, x11, [x1, #TSPEC_TV_SEC] - mov x0, xzr - ret - .endm - - .macro jump_slot jumptable, index, label - .if (. - \jumptable) != 4 * (\index) - .error "Jump slot index mismatch" - .endif - b \label - .endm - - .text - -/* int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); */ -ENTRY(__kernel_gettimeofday) - .cfi_startproc - adr vdso_data, _vdso_data - /* If tv is NULL, skip to the timezone code. */ - cbz x0, 2f - - /* Compute the time of day. */ -1: seqcnt_acquire - syscall_check fail=4f - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_mono_mult, w12 = cs_shift */ - ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - seqcnt_check fail=1b - - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - get_ts_realtime res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - - /* Convert ns to us. */ - mov x13, #1000 - lsl x13, x13, x12 - udiv x11, x11, x13 - stp x10, x11, [x0, #TVAL_TV_SEC] -2: - /* If tz is NULL, return 0. */ - cbz x1, 3f - ldp w4, w5, [vdso_data, #VDSO_TZ_MINWEST] - stp w4, w5, [x1, #TZ_MINWEST] -3: - mov x0, xzr - ret -4: - /* Syscall fallback. */ - mov x8, #__NR_gettimeofday - svc #0 - ret - .cfi_endproc -ENDPROC(__kernel_gettimeofday) - -#define JUMPSLOT_MAX CLOCK_MONOTONIC_COARSE - -/* int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); */ -ENTRY(__kernel_clock_gettime) - .cfi_startproc - cmp w0, #JUMPSLOT_MAX - b.hi syscall - adr vdso_data, _vdso_data - adr x_tmp, jumptable - add x_tmp, x_tmp, w0, uxtw #2 - br x_tmp - - ALIGN -jumptable: - jump_slot jumptable, CLOCK_REALTIME, realtime - jump_slot jumptable, CLOCK_MONOTONIC, monotonic - b syscall - b syscall - jump_slot jumptable, CLOCK_MONOTONIC_RAW, monotonic_raw - jump_slot jumptable, CLOCK_REALTIME_COARSE, realtime_coarse - jump_slot jumptable, CLOCK_MONOTONIC_COARSE, monotonic_coarse - - .if (. - jumptable) != 4 * (JUMPSLOT_MAX + 1) - .error "Wrong jumptable size" - .endif - - ALIGN -realtime: - seqcnt_acquire - syscall_check fail=syscall - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_mono_mult, w12 = cs_shift */ - ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - seqcnt_check fail=realtime - - /* All computations are done with left-shifted nsecs. 
*/ - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - get_ts_realtime res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - clock_gettime_return, shift=1 - - ALIGN -monotonic: - seqcnt_acquire - syscall_check fail=syscall - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_mono_mult, w12 = cs_shift */ - ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC] - seqcnt_check fail=monotonic - - /* All computations are done with left-shifted nsecs. */ - lsl x4, x4, x12 - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - get_ts_realtime res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - - add_ts sec=x10, nsec=x11, ts_sec=x3, ts_nsec=x4, nsec_to_sec=x9 - clock_gettime_return, shift=1 - - ALIGN -monotonic_raw: - seqcnt_acquire - syscall_check fail=syscall - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - /* w11 = cs_raw_mult, w12 = cs_shift */ - ldp w12, w11, [vdso_data, #VDSO_CS_SHIFT] - ldp x13, x14, [vdso_data, #VDSO_RAW_TIME_SEC] - seqcnt_check fail=monotonic_raw - - /* All computations are done with left-shifted nsecs. */ - get_nsec_per_sec res=x9 - lsl x9, x9, x12 - - get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 - get_ts_clock_raw res_sec=x10, res_nsec=x11, \ - clock_nsec=x15, nsec_to_sec=x9 - - add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 - clock_gettime_return, shift=1 - - ALIGN -realtime_coarse: - seqcnt_acquire - ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] - seqcnt_check fail=realtime_coarse - clock_gettime_return - - ALIGN -monotonic_coarse: - seqcnt_acquire - ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] - ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC] - seqcnt_check fail=monotonic_coarse - - /* Computations are done in (non-shifted) nsecs. */ - get_nsec_per_sec res=x9 - add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 - clock_gettime_return - - ALIGN -syscall: /* Syscall fallback. */ - mov x8, #__NR_clock_gettime - svc #0 - ret - .cfi_endproc -ENDPROC(__kernel_clock_gettime) - -/* int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); */ -ENTRY(__kernel_clock_getres) - .cfi_startproc - cmp w0, #CLOCK_REALTIME - ccmp w0, #CLOCK_MONOTONIC, #0x4, ne - ccmp w0, #CLOCK_MONOTONIC_RAW, #0x4, ne - b.ne 1f - - ldr x2, 5f - b 2f -1: - cmp w0, #CLOCK_REALTIME_COARSE - ccmp w0, #CLOCK_MONOTONIC_COARSE, #0x4, ne - b.ne 4f - ldr x2, 6f -2: - cbz x1, 3f - stp xzr, x2, [x1] - -3: /* res == NULL. */ - mov w0, wzr - ret - -4: /* Syscall fallback. */ - mov x8, #__NR_clock_getres - svc #0 - ret -5: - .quad CLOCK_REALTIME_RES -6: - .quad CLOCK_COARSE_RES - .cfi_endproc -ENDPROC(__kernel_clock_getres) diff --git a/arch/arm64/kernel/vdso/note.S b/arch/arm64/kernel/vdso/note.S index e20483b104d9..3d4e82290c80 100644 --- a/arch/arm64/kernel/vdso/note.S +++ b/arch/arm64/kernel/vdso/note.S @@ -1,18 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Limited * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
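All of the realtime and monotonic paths above follow the same fixed-point recipe: mask the counter delta to 56 bits, multiply by mult to obtain left-shifted nanoseconds, add the shifted xtime value, and divide by NSEC_PER_SEC << shift. A simplified C rendering of get_clock_shifted_nsec plus get_ts_realtime (a sketch for clarity, not the kernel's code):

        #include <stdint.h>

        #define NSEC_PER_SEC    1000000000ULL

        static void cycles_to_realtime(uint64_t cycles, uint64_t cycle_last,
                                       uint32_t mult, uint32_t shift,
                                       uint64_t xtime_sec, uint64_t xtime_nsec,
                                       uint64_t *sec, uint64_t *nsec)
        {
                /* "We can only guarantee 56 bits of precision." */
                uint64_t delta = (cycles - cycle_last) & ((1ULL << 56) - 1);
                /* Everything below is in nanoseconds left-shifted by 'shift'. */
                uint64_t shifted_ns = delta * mult + xtime_nsec;
                uint64_t ns_per_sec_shifted = NSEC_PER_SEC << shift;

                *sec  = xtime_sec + shifted_ns / ns_per_sec_shifted;
                *nsec = (shifted_ns % ns_per_sec_shifted) >> shift;
        }

The coarse clocks skip the counter read entirely and return the cached xtime_coarse values, which is why their code paths above are so much shorter.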
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * * Author: Will Deacon <will.deacon@arm.com> * * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. @@ -23,9 +12,12 @@ #include <linux/version.h> #include <linux/elfnote.h> #include <linux/build-salt.h> +#include <asm/assembler.h> ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END BUILD_SALT + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/sigreturn.S b/arch/arm64/kernel/vdso/sigreturn.S index 20d98effa7dd..0e18729abc3b 100644 --- a/arch/arm64/kernel/vdso/sigreturn.S +++ b/arch/arm64/kernel/vdso/sigreturn.S @@ -1,37 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Sigreturn trampoline for returning from a signal when the SA_RESTORER - * flag is not set. + * flag is not set. It serves primarily as a hall of shame for crappy + * unwinders and features an exciting but mysterious NOP instruction. * - * Copyright (C) 2012 ARM Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * It's also fragile as hell, so please think twice before changing anything + * in here. * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. + * Copyright (C) 2012 ARM Limited * * Author: Will Deacon <will.deacon@arm.com> */ #include <linux/linkage.h> +#include <asm/assembler.h> #include <asm/unistd.h> .text - nop -ENTRY(__kernel_rt_sigreturn) - .cfi_startproc - .cfi_signal_frame - .cfi_def_cfa x29, 0 - .cfi_offset x29, 0 * 8 - .cfi_offset x30, 1 * 8 +/* + * NOTE!!! You may notice that all of the .cfi directives in this file have + * been commented out. This is because they have been shown to trigger segfaults + * in libgcc when unwinding out of a SIGCANCEL handler to invoke pthread + * cleanup handlers during the thread cancellation dance. By omitting the + * directives, we trigger an arm64-specific fallback path in the unwinder which + * recognises the signal frame and restores many of the registers directly from + * the sigcontext. Re-enabling the cfi directives here therefore needs to be + * much more comprehensive to reduce the risk of further regressions. + */ + +/* Ensure that the mysterious NOP can be associated with a function. */ +// .cfi_startproc + +/* + * .cfi_signal_frame causes the corresponding Frame Description Entry (FDE) in + * the .eh_frame section to be annotated as a signal frame. This allows DWARF + * unwinders (e.g. libstdc++) to implement _Unwind_GetIPInfo() and identify + * the next frame using the unmodified return address instead of subtracting 1, + * which may yield the wrong FDE. + */ +// .cfi_signal_frame + +/* + * Tell the unwinder where to locate the frame record linking back to the + * interrupted context. 
We don't provide unwind info for registers other than + * the frame pointer and the link register here; in practice, this is likely to + * be insufficient for unwinding in C/C++ based runtimes, especially without a + * means to restore the stack pointer. Thankfully, unwinders and debuggers + * already have baked-in strategies for attempting to unwind out of signals. + */ +// .cfi_def_cfa x29, 0 +// .cfi_offset x29, 0 * 8 +// .cfi_offset x30, 1 * 8 + +/* + * This mysterious NOP is required for some unwinders (e.g. libc++) that + * unconditionally subtract one from the result of _Unwind_GetIP() in order to + * identify the calling function. + * Hack borrowed from arch/powerpc/kernel/vdso64/sigtramp.S. + */ + nop // Mysterious NOP + +/* + * GDB, libgcc and libunwind rely on being able to identify the sigreturn + * instruction sequence to unwind from signal handlers. We cannot, therefore, + * use SYM_FUNC_START() here, as it will emit a BTI C instruction and break the + * unwinder. Thankfully, this function is only ever called from a RET and so + * omitting the landing pad is perfectly fine. + */ +SYM_CODE_START(__kernel_rt_sigreturn) +// PLEASE DO NOT MODIFY mov x8, #__NR_rt_sigreturn +// PLEASE DO NOT MODIFY svc #0 - .cfi_endproc -ENDPROC(__kernel_rt_sigreturn) +// PLEASE DO NOT MODIFY +// .cfi_endproc +SYM_CODE_END(__kernel_rt_sigreturn) + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S deleted file mode 100644 index 82379a70ef03..000000000000 --- a/arch/arm64/kernel/vdso/vdso.S +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (C) 2012 ARM Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * Author: Will Deacon <will.deacon@arm.com> - */ - -#include <linux/init.h> -#include <linux/linkage.h> -#include <linux/const.h> -#include <asm/page.h> - - .globl vdso_start, vdso_end - .section .rodata - .balign PAGE_SIZE -vdso_start: - .incbin "arch/arm64/kernel/vdso/vdso.so" - .balign PAGE_SIZE -vdso_end: - - .previous diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index beca249bc2f3..52314be29191 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -1,20 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * GNU linker script for the VDSO library. * * Copyright (C) 2012 ARM Limited * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
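The trampoline above only matters when a signal handler is installed without SA_RESTORER: the kernel then points the signal frame's return address at __kernel_rt_sigreturn, so that returning from the handler issues the rt_sigreturn system call. A small userspace illustration (whether your libc supplies its own restorer is libc-specific; glibc on arm64 typically relies on this vDSO trampoline):

        #include <signal.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        static void handler(int sig)
        {
                /* Returning from here goes through the sigreturn trampoline, which
                 * restores the interrupted context via the rt_sigreturn syscall. */
                (void)sig;
        }

        int main(void)
        {
                struct sigaction sa;

                memset(&sa, 0, sizeof(sa));
                sa.sa_handler = handler;
                sigemptyset(&sa.sa_mask);
                sa.sa_flags = 0;                /* no SA_RESTORER requested */
                sigaction(SIGUSR1, &sa, NULL);

                kill(getpid(), SIGUSR1);
                puts("back in main after the handler returned");
                return 0;
        }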
- * * Author: Will Deacon <will.deacon@arm.com> * Heavily based on the vDSO linker scripts for other archs. */ @@ -22,14 +11,18 @@ #include <linux/const.h> #include <asm/page.h> #include <asm/vdso.h> +#include <asm/vdso/vsyscall.h> +#include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") OUTPUT_ARCH(aarch64) SECTIONS { - PROVIDE(_vdso_data = . - PAGE_SIZE); - . = VDSO_LBASE + SIZEOF_HEADERS; + VDSO_VVAR_SYMS + + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -39,6 +32,14 @@ SECTIONS .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } + /* + * Discard .note.gnu.property sections which are unused and have + * different alignment requirement from vDSO note sections. + */ + /DISCARD/ : { + *(.note.GNU-stack .note.gnu.property) + *(.ARM.attributes) + } .note : { *(.note.*) } :text :note . = ALIGN(16); @@ -48,20 +49,35 @@ SECTIONS PROVIDE (_etext = .); PROVIDE (etext = .); - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text + . = ALIGN(4); + .altinstructions : { + *(.altinstructions) + } .dynamic : { *(.dynamic) } :text :dynamic - .rodata : { *(.rodata*) } :text + .rela.dyn : ALIGN(8) { *(.rela .rela*) } + + .rodata : { + *(.rodata*) + *(.got) + *(.got.plt) + *(.plt) + *(.plt.*) + *(.iplt) + *(.igot .igot.plt) + } :text _end = .; PROVIDE(end = .); + DWARF_DEBUG + ELF_DETAILS + /DISCARD/ : { - *(.note.GNU-stack) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) + *(.eh_frame .eh_frame_hdr) } } @@ -74,7 +90,6 @@ PHDRS text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr PT_GNU_EH_FRAME; } /* @@ -88,6 +103,7 @@ VERSION __kernel_gettimeofday; __kernel_clock_gettime; __kernel_clock_getres; + __kernel_getrandom; local: *; }; } diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..67890b445309 --- /dev/null +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/linkage.h> +#include <asm/cache.h> +#include <asm/assembler.h> + + .text + +#define state0 v0 +#define state1 v1 +#define state2 v2 +#define state3 v3 +#define copy0 v4 +#define copy0_q q4 +#define copy1 v5 +#define copy2 v6 +#define copy3 v7 +#define copy3_d d7 +#define one_d d16 +#define one_q q16 +#define one_v v16 +#define tmp v17 +#define rot8 v18 + +/* + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive + * number of blocks of output with nonce 0, taking an input key and 8-bytes + * counter. Importantly does not spill to the stack. + * + * This implementation avoids d8-d15 because they are callee-save in user + * space. 
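The NEON code that follows vectorises the standard ChaCha20 double round, processing four columns (and then four diagonals) of the state matrix per instruction. For reference, the scalar quarter-round it implements (a plain C sketch, not the vDSO code):

        #include <stdint.h>

        #define ROTL32(v, n)    (((v) << (n)) | ((v) >> (32 - (n))))

        static void chacha_quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
        {
                *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
                *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
                *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
                *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
        }

The rotate amounts 16/12/8/7 are exactly what the rev32 and shl/sri pairs below implement, and the final add of the saved copy0..copy3 registers corresponds to ChaCha20's feed-forward addition of the initial state.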
+ * + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, + * const uint8_t *key, + * uint32_t *counter, + * size_t nblocks) + * + * x0: output bytes + * x1: 32-byte key input + * x2: 8-byte counter input/output + * x3: number of 64-byte block to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + + /* copy0 = "expand 32-byte k" */ + mov_q x8, 0x3320646e61707865 + mov_q x9, 0x6b20657479622d32 + mov copy0.d[0], x8 + mov copy0.d[1], x9 + + /* copy1,copy2 = key */ + ld1 { copy1.4s, copy2.4s }, [x1] + /* copy3 = counter || zero nonce */ + ld1 { copy3.2s }, [x2] + + movi one_v.2s, #1 + uzp1 one_v.4s, one_v.4s, one_v.4s + +.Lblock: + /* copy state to auxiliary vectors for the final add after the permute. */ + mov state0.16b, copy0.16b + mov state1.16b, copy1.16b + mov state2.16b, copy2.16b + mov state3.16b, copy3.16b + + mov w4, 20 +.Lpermute: + /* + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers state0-state3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + */ + +.Ldoubleround: + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + add state0.4s, state0.4s, state1.4s + eor state3.16b, state3.16b, state0.16b + rev32 state3.8h, state3.8h + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #12 + sri state1.4s, tmp.4s, #20 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + add state0.4s, state0.4s, state1.4s + eor tmp.16b, state3.16b, state0.16b + shl state3.4s, tmp.4s, #8 + sri state3.4s, tmp.4s, #24 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #7 + sri state1.4s, tmp.4s, #25 + + /* state1[0,1,2,3] = state1[1,2,3,0] */ + ext state1.16b, state1.16b, state1.16b, #4 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + ext state2.16b, state2.16b, state2.16b, #8 + /* state3[0,1,2,3] = state3[1,2,3,0] */ + ext state3.16b, state3.16b, state3.16b, #12 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + add state0.4s, state0.4s, state1.4s + eor state3.16b, state3.16b, state0.16b + rev32 state3.8h, state3.8h + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #12 + sri state1.4s, tmp.4s, #20 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + add state0.4s, state0.4s, state1.4s + eor tmp.16b, state3.16b, state0.16b + shl state3.4s, tmp.4s, #8 + sri state3.4s, tmp.4s, #24 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + add state2.4s, state2.4s, state3.4s + eor tmp.16b, state1.16b, state2.16b + shl state1.4s, tmp.4s, #7 + sri state1.4s, tmp.4s, #25 + + /* state1[0,1,2,3] = state1[3,0,1,2] */ + ext state1.16b, state1.16b, state1.16b, #12 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + ext state2.16b, state2.16b, state2.16b, #8 + /* state3[0,1,2,3] = state3[1,2,3,0] */ + ext state3.16b, state3.16b, state3.16b, #4 + + subs w4, w4, #2 + b.ne .Ldoubleround + + /* output0 = state0 + state0 */ + add state0.4s, state0.4s, copy0.4s + /* output1 = state1 + state1 */ + add state1.4s, state1.4s, copy1.4s + /* output2 = state2 + state2 */ + add state2.4s, state2.4s, copy2.4s + /* output2 = state3 + state3 */ + add state3.4s, state3.4s, copy3.4s + st1 { state0.16b - state3.16b 
}, [x0] + + /* + * ++copy3.counter, the 'add' clears the upper half of the SIMD register + * which is the expected behaviour here. + */ + add copy3_d, copy3_d, one_d + + /* output += 64, --nblocks */ + add x0, x0, 64 + subs x3, x3, #1 + b.ne .Lblock + + /* counter = copy3.counter */ + st1 { copy3.2s }, [x2] + + /* Zero out the potentially sensitive regs, in case nothing uses these again. */ + movi state0.16b, #0 + movi state1.16b, #0 + movi state2.16b, #0 + movi state3.16b, #0 + movi copy1.16b, #0 + movi copy2.16b, #0 + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) + +emit_aarch64_feature_1_and diff --git a/arch/arm64/kernel/vdso/vgetrandom.c b/arch/arm64/kernel/vdso/vgetrandom.c new file mode 100644 index 000000000000..832fe195292b --- /dev/null +++ b/arch/arm64/kernel/vdso/vgetrandom.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <uapi/asm-generic/errno.h> + +typeof(__cvdso_getrandom) __kernel_getrandom; + +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + if (alternative_has_cap_likely(ARM64_HAS_FPSIMD)) + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); + + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) + return -ENOSYS; + return getrandom_syscall(buffer, len, flags); +} diff --git a/arch/arm64/kernel/vdso/vgettimeofday.c b/arch/arm64/kernel/vdso/vgettimeofday.c new file mode 100644 index 000000000000..9941c5b04f15 --- /dev/null +++ b/arch/arm64/kernel/vdso/vgettimeofday.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM64 userspace implementations of gettimeofday() and similar. + * + * Copyright (C) 2018 ARM Limited + * + */ + +int __kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); +int __kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); +int __kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res); + +int __kernel_clock_gettime(clockid_t clock, + struct __kernel_timespec *ts) +{ + return __cvdso_clock_gettime(clock, ts); +} + +int __kernel_gettimeofday(struct __kernel_old_timeval *tv, + struct timezone *tz) +{ + return __cvdso_gettimeofday(tv, tz); +} + +int __kernel_clock_getres(clockid_t clock_id, + struct __kernel_timespec *res) +{ + return __cvdso_clock_getres(clock_id, res); +} diff --git a/arch/arm64/kernel/vdso32-wrap.S b/arch/arm64/kernel/vdso32-wrap.S new file mode 100644 index 000000000000..e72ac7bc4c04 --- /dev/null +++ b/arch/arm64/kernel/vdso32-wrap.S @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2012 ARM Limited + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/const.h> +#include <asm/page.h> + + .globl vdso32_start, vdso32_end + .section .rodata + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/arm64/kernel/vdso32/vdso.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/arm64/kernel/vdso32/.gitignore b/arch/arm64/kernel/vdso32/.gitignore new file mode 100644 index 000000000000..3542fa24e26b --- /dev/null +++ b/arch/arm64/kernel/vdso32/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +vdso.lds +vdso.so.raw diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile new file mode 100644 index 000000000000..9d0efed91414 --- /dev/null +++ b/arch/arm64/kernel/vdso32/Makefile @@ -0,0 +1,154 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for vdso32 +# + +include $(srctree)/lib/vdso/Makefile.include + +# Same as cc-*option, but using CC_COMPAT 
instead of CC +ifeq ($(CONFIG_CC_IS_CLANG), y) +CC_COMPAT ?= $(CC) +CC_COMPAT += --target=arm-linux-gnueabi +else +CC_COMPAT ?= $(CROSS_COMPILE_COMPAT)gcc +endif + +ifeq ($(CONFIG_LD_IS_LLD), y) +LD_COMPAT ?= $(LD) +else +LD_COMPAT ?= $(CROSS_COMPILE_COMPAT)ld +endif + +cc32-option = $(call try-run,\ + $(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) + +# We cannot use the global flags to compile the vDSO files, the main reason +# being that the 32-bit compiler may be older than the main (64-bit) compiler +# and therefore may not understand flags set using $(cc-option ...). Besides, +# arch-specific options should be taken from the arm Makefile instead of the +# arm64 one. +# As a result we set our own flags here. + +# KBUILD_CPPFLAGS and NOSTDINC_FLAGS from top-level Makefile +VDSO_CPPFLAGS := -DBUILD_VDSO -D__KERNEL__ -nostdinc +VDSO_CPPFLAGS += -isystem $(shell $(CC_COMPAT) -print-file-name=include 2>/dev/null) +VDSO_CPPFLAGS += $(LINUXINCLUDE) + +# Common C and assembly flags +# From top-level Makefile +VDSO_CAFLAGS := $(VDSO_CPPFLAGS) +VDSO_CAFLAGS += $(call cc32-option,-fno-PIE) +ifdef CONFIG_DEBUG_INFO +VDSO_CAFLAGS += -g +endif + +# From arm Makefile +VDSO_CAFLAGS += $(call cc32-option,-fno-dwarf2-cfi-asm) +VDSO_CAFLAGS += -mabi=aapcs-linux -mfloat-abi=soft +ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) +VDSO_CAFLAGS += -mbig-endian +else +VDSO_CAFLAGS += -mlittle-endian +endif + +# From arm vDSO Makefile +VDSO_CAFLAGS += -fPIC -fno-builtin -fno-stack-protector +VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING +VDSO_CAFLAGS += -march=armv8-a + +VDSO_CFLAGS := $(VDSO_CAFLAGS) +# KBUILD_CFLAGS from top-level Makefile +VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ + -fno-strict-aliasing -fno-common \ + $(filter -Werror,$(KBUILD_CPPFLAGS)) \ + -Werror-implicit-function-declaration \ + -Wno-format-security \ + -std=gnu11 -fms-extensions +VDSO_CFLAGS += -O2 +# Some useful compiler-dependent flags from top-level Makefile +VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign) +VDSO_CFLAGS += -fno-strict-overflow +VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) +VDSO_CFLAGS += -Werror=date-time +VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) +VDSO_CFLAGS += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag) + +# Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is +# unreliable. 
+ifeq ($(CONFIG_THUMB2_COMPAT_VDSO), y) +VDSO_CFLAGS += -mthumb -fomit-frame-pointer +else +VDSO_CFLAGS += -marm +endif + +VDSO_AFLAGS := $(VDSO_CAFLAGS) +VDSO_AFLAGS += -D__ASSEMBLY__ + +# From arm vDSO Makefile +VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1 +VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096 +VDSO_LDFLAGS += -shared --build-id=sha1 +VDSO_LDFLAGS += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) + + +# Borrow vdsomunge.c from the arm vDSO +# We have to use a relative path because scripts/Makefile.host prefixes +# $(hostprogs) with $(obj) +munge := ../../../arm/vdso/vdsomunge +hostprogs := $(munge) + +c-obj-vdso := note.o +c-obj-vdso-gettimeofday := vgettimeofday.o + +ifneq ($(c-gettimeofday-y),) +VDSO_CFLAGS_gettimeofday_o += -include $(c-gettimeofday-y) +endif + +VDSO_CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os + +# Build rules +targets := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) vdso.so vdso32.so.dbg vdso.so.raw +c-obj-vdso := $(addprefix $(obj)/, $(c-obj-vdso)) +c-obj-vdso-gettimeofday := $(addprefix $(obj)/, $(c-obj-vdso-gettimeofday)) +asm-obj-vdso := $(addprefix $(obj)/, $(asm-obj-vdso)) +obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) + +targets += vdso.lds +CPPFLAGS_vdso.lds += -P -C -U$(ARCH) + +# Strip rule for vdso.so +$(obj)/vdso.so: OBJCOPYFLAGS := -S +$(obj)/vdso.so: $(obj)/vdso32.so.dbg FORCE + $(call if_changed,objcopy) + +$(obj)/vdso32.so.dbg: $(obj)/vdso.so.raw $(obj)/$(munge) FORCE + $(call if_changed,vdsomunge) + +# Link rule for the .so file, .lds has to be first +$(obj)/vdso.so.raw: $(obj)/vdso.lds $(obj-vdso) FORCE + $(call if_changed,vdsold_and_vdso_check) + +# Compilation rules for the vDSO sources +$(c-obj-vdso): %.o: %.c FORCE + $(call if_changed_dep,vdsocc) +$(c-obj-vdso-gettimeofday): %.o: %.c FORCE + $(call if_changed_dep,vdsocc_gettimeofday) +$(asm-obj-vdso): %.o: %.S FORCE + $(call if_changed_dep,vdsoas) + +# Actual build commands +quiet_cmd_vdsold_and_vdso_check = LD32 $@ + cmd_vdsold_and_vdso_check = $(cmd_vdsold); $(cmd_vdso_check) + +quiet_cmd_vdsold = LD32 $@ + cmd_vdsold = $(LD_COMPAT) $(VDSO_LDFLAGS) \ + -T $(filter %.lds,$^) $(filter %.o,$^) -o $@ +quiet_cmd_vdsocc = CC32 $@ + cmd_vdsocc = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) -c -o $@ $< +quiet_cmd_vdsocc_gettimeofday = CC32 $@ + cmd_vdsocc_gettimeofday = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_CFLAGS) $(VDSO_CFLAGS_gettimeofday_o) -c -o $@ $< +quiet_cmd_vdsoas = AS32 $@ + cmd_vdsoas = $(CC_COMPAT) -Wp,-MD,$(depfile) $(VDSO_AFLAGS) -c -o $@ $< + +quiet_cmd_vdsomunge = MUNGE $@ + cmd_vdsomunge = $(obj)/$(munge) $< $@ diff --git a/arch/arm64/kernel/vdso32/note.c b/arch/arm64/kernel/vdso32/note.c new file mode 100644 index 000000000000..eff5bf9efb8b --- /dev/null +++ b/arch/arm64/kernel/vdso32/note.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012-2018 ARM Limited + * + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> +#include <linux/build-salt.h> + +ELFNOTE32("Linux", 0, LINUX_VERSION_CODE); +BUILD_SALT; diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S new file mode 100644 index 000000000000..e02b27487ce8 --- /dev/null +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Adapted from arm64 version. 
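note.c above embeds LINUX_VERSION_CODE into the compat vDSO's .note section, the same information the 64-bit note.S carries. That value packs the kernel version as (major << 16) | (minor << 8) | patch; a tiny decoding example with a hypothetical value:

        #include <stdio.h>

        int main(void)
        {
                unsigned int code = (6 << 16) | (6 << 8) | 30;  /* hypothetical 6.6.30 */

                printf("%u.%u.%u\n", (code >> 16) & 0xff, (code >> 8) & 0xff, code & 0xff);
                return 0;
        }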
+ * + * GNU linker script for the VDSO library. + * Heavily based on the vDSO linker scripts for other archs. + * + * Copyright (C) 2012-2018 ARM Limited + */ + +#include <linux/const.h> +#include <asm/page.h> +#include <asm/vdso.h> +#include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> + +OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") +OUTPUT_ARCH(arm) + +SECTIONS +{ + VDSO_VVAR_SYMS + + . = SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { + *(.rodata*) + *(.got) + *(.got.plt) + *(.plt) + *(.rel.iplt) + *(.iplt) + *(.igot.plt) + } :text + + .text : { + *(.text*) + *(.glue_7) + *(.glue_7t) + *(.vfp11_veneer) + *(.v4_bx) + } :text =0xe7f001f2 + + .rel.dyn : { *(.rel*) } + + .ARM.exidx : { *(.ARM.exidx*) } + DWARF_DEBUG + ELF_DETAILS + .ARM.attributes 0 : { *(.ARM.attributes) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ +} + +VERSION +{ + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_clock_getres; + __vdso_clock_gettime64; + local: *; + }; +} diff --git a/arch/arm64/kernel/vdso32/vgettimeofday.c b/arch/arm64/kernel/vdso32/vgettimeofday.c new file mode 100644 index 000000000000..29b4d8f61e39 --- /dev/null +++ b/arch/arm64/kernel/vdso32/vgettimeofday.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM64 compat userspace implementations of gettimeofday() and similar. + * + * Copyright (C) 2018 ARM Limited + * + */ +#define BUILD_VDSO32_64 +#include <vdso/gettime.h> + +int __vdso_clock_gettime(clockid_t clock, + struct old_timespec32 *ts) +{ + return __cvdso_clock_gettime32(clock, ts); +} + +int __vdso_clock_gettime64(clockid_t clock, + struct __kernel_timespec *ts) +{ + return __cvdso_clock_gettime(clock, ts); +} + +int __vdso_gettimeofday(struct __kernel_old_timeval *tv, + struct timezone *tz) +{ + return __cvdso_gettimeofday(tv, tz); +} + +int __vdso_clock_getres(clockid_t clock_id, + struct old_timespec32 *res) +{ + return __cvdso_clock_getres_time32(clock_id, res); +} + +/* Avoid unresolved references emitted by GCC */ + +void __aeabi_unwind_cpp_pr0(void) +{ +} + +void __aeabi_unwind_cpp_pr1(void) +{ +} + +void __aeabi_unwind_cpp_pr2(void) +{ +} diff --git a/arch/arm64/kernel/vmcore_info.c b/arch/arm64/kernel/vmcore_info.c new file mode 100644 index 000000000000..9619ece66b79 --- /dev/null +++ b/arch/arm64/kernel/vmcore_info.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) Linaro. + * Copyright (C) Huawei Futurewei Technologies. 
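vgettimeofday.c above exports both __vdso_clock_gettime (32-bit time) and __vdso_clock_gettime64 because the legacy compat timespec runs out of range in January 2038. A small illustration of the two result types (the struct layouts here are simplified stand-ins, not the kernel headers):

        #include <stdint.h>
        #include <stdio.h>

        struct old_timespec32   { int32_t tv_sec; int32_t tv_nsec; };  /* wraps in 2038 */
        struct kernel_timespec64 { int64_t tv_sec; long long tv_nsec; };

        int main(void)
        {
                /* INT32_MAX seconds after the epoch is 2038-01-19 03:14:07 UTC. */
                struct old_timespec32    t32 = { INT32_MAX, 0 };
                struct kernel_timespec64 t64 = { (int64_t)INT32_MAX + 1, 0 };

                printf("32-bit tv_sec tops out at %d; 64-bit carries on at %lld\n",
                       t32.tv_sec, (long long)t64.tv_sec);
                return 0;
        }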
+ */ + +#include <linux/vmcore_info.h> +#include <asm/cpufeature.h> +#include <asm/memory.h> +#include <asm/pgtable-hwdef.h> +#include <asm/pointer_auth.h> + +static inline u64 get_tcr_el1_t1sz(void); + +static inline u64 get_tcr_el1_t1sz(void) +{ + return (read_sysreg(tcr_el1) & TCR_EL1_T1SZ_MASK) >> TCR_EL1_T1SZ_SHIFT; +} + +void arch_crash_save_vmcoreinfo(void) +{ + VMCOREINFO_NUMBER(VA_BITS); + /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ + vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); + vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); + vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); + vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); + vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); + vmcoreinfo_append_str("NUMBER(kimage_voffset)=0x%llx\n", + kimage_voffset); + vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n", + PHYS_OFFSET); + vmcoreinfo_append_str("NUMBER(TCR_EL1_T1SZ)=0x%llx\n", + get_tcr_el1_t1sz()); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + vmcoreinfo_append_str("NUMBER(KERNELPACMASK)=0x%llx\n", + system_supports_address_auth() ? + ptrauth_kernel_pac_mask() : 0); +} diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 7fa008374907..ad6133b89e7a 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -5,40 +5,94 @@ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> */ +#include <asm/hyp_image.h> +#ifdef CONFIG_KVM +#define HYPERVISOR_EXTABLE \ + . = ALIGN(SZ_8); \ + __start___kvm_ex_table = .; \ + *(__kvm_ex_table) \ + __stop___kvm_ex_table = .; + +#define HYPERVISOR_RODATA_SECTIONS \ + HYP_SECTION_NAME(.rodata) : { \ + . = ALIGN(PAGE_SIZE); \ + __hyp_rodata_start = .; \ + *(HYP_SECTION_NAME(.data..ro_after_init)) \ + *(HYP_SECTION_NAME(.rodata)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_rodata_end = .; \ + } + +#define HYPERVISOR_DATA_SECTION \ + HYP_SECTION_NAME(.data) : { \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_start = .; \ + *(HYP_SECTION_NAME(.data)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_end = .; \ + } + +#define HYPERVISOR_PERCPU_SECTION \ + . = ALIGN(PAGE_SIZE); \ + HYP_SECTION_NAME(.data..percpu) : { \ + *(HYP_SECTION_NAME(.data..percpu)) \ + } + +#define HYPERVISOR_RELOC_SECTION \ + .hyp.reloc : ALIGN(4) { \ + __hyp_reloc_begin = .; \ + *(.hyp.reloc) \ + __hyp_reloc_end = .; \ + } + +#define BSS_FIRST_SECTIONS \ + __hyp_bss_start = .; \ + *(HYP_SECTION_NAME(.bss)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_bss_end = .; + +/* + * We require that __hyp_bss_start and __bss_start are aligned, and enforce it + * with an assertion. But the BSS_SECTION macro places an empty .sbss section + * between them, which can in some cases cause the linker to misalign them. To + * work around the issue, force a page alignment for __bss_start. 
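For context on how the strings appended by arch_crash_save_vmcoreinfo() above get used: crash/kdump tools scan the vmcoreinfo note for NUMBER(name)=value lines, which may be decimal or 0x-prefixed (as the comment about VMCOREINFO_NUMBER() warns). A hypothetical parser sketch, with made-up sample input:

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        static int vmcoreinfo_number(const char *blob, const char *name,
                                     unsigned long long *val)
        {
                char key[64];
                const char *p;

                snprintf(key, sizeof(key), "NUMBER(%s)=", name);
                p = strstr(blob, key);
                if (!p)
                        return -1;
                /* Base 0 accepts both the decimal and the 0x-prefixed variants. */
                *val = strtoull(p + strlen(key), NULL, 0);
                return 0;
        }

        int main(void)
        {
                const char *blob = "NUMBER(VA_BITS)=48\nNUMBER(KERNELPACMASK)=0x7f000000000000\n";
                unsigned long long v;

                if (!vmcoreinfo_number(blob, "KERNELPACMASK", &v))
                        printf("KERNELPACMASK = %#llx\n", v);
                return 0;
        }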
+ */ +#define SBSS_ALIGN PAGE_SIZE +#else /* CONFIG_KVM */ +#define HYPERVISOR_EXTABLE +#define HYPERVISOR_RODATA_SECTIONS +#define HYPERVISOR_DATA_SECTION +#define HYPERVISOR_PERCPU_SECTION +#define HYPERVISOR_RELOC_SECTION +#define SBSS_ALIGN 0 +#endif + +#define RO_EXCEPTION_TABLE_ALIGN 4 +#define RUNTIME_DISCARD_EXIT + #include <asm-generic/vmlinux.lds.h> #include <asm/cache.h> #include <asm/kernel-pgtable.h> -#include <asm/thread_info.h> +#include <asm/kexec.h> #include <asm/memory.h> #include <asm/page.h> -#include <asm/pgtable.h> #include "image.h" -/* .exit.text needed in case of alternative patching */ -#define ARM_EXIT_KEEP(x) x -#define ARM_EXIT_DISCARD(x) - OUTPUT_ARCH(aarch64) ENTRY(_text) jiffies = jiffies_64; #define HYPERVISOR_TEXT \ - /* \ - * Align to 4 KB so that \ - * a) the HYP vector table is at its minimum \ - * alignment of 2048 bytes \ - * b) the HYP init code will not cross a page \ - * boundary if its size does not exceed \ - * 4 KB (see related ASSERT() below) \ - */ \ - . = ALIGN(SZ_4K); \ + . = ALIGN(PAGE_SIZE); \ __hyp_idmap_text_start = .; \ *(.hyp.idmap.text) \ __hyp_idmap_text_end = .; \ __hyp_text_start = .; \ *(.hyp.text) \ + HYPERVISOR_EXTABLE \ + . = ALIGN(PAGE_SIZE); \ __hyp_text_end = .; #define IDMAP_TEXT \ @@ -49,7 +103,7 @@ jiffies = jiffies_64; #ifdef CONFIG_HIBERNATION #define HIBERNATE_TEXT \ - . = ALIGN(SZ_4K); \ + ALIGN_FUNCTION(); \ __hibernate_exit_text_start = .; \ *(.hibernate_exit.text) \ __hibernate_exit_text_end = .; @@ -57,21 +111,43 @@ jiffies = jiffies_64; #define HIBERNATE_TEXT #endif +#ifdef CONFIG_KEXEC_CORE +#define KEXEC_TEXT \ + ALIGN_FUNCTION(); \ + __relocate_new_kernel_start = .; \ + *(.kexec_relocate.text) \ + __relocate_new_kernel_end = .; +#else +#define KEXEC_TEXT +#endif + #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 #define TRAMP_TEXT \ . = ALIGN(PAGE_SIZE); \ __entry_tramp_text_start = .; \ *(.entry.tramp.text) \ . = ALIGN(PAGE_SIZE); \ - __entry_tramp_text_end = .; + __entry_tramp_text_end = .; \ + *(.entry.tramp.rodata) #else #define TRAMP_TEXT #endif +#ifdef CONFIG_UNWIND_TABLES +#define UNWIND_DATA_SECTIONS \ + .eh_frame : { \ + __pi___eh_frame_start = .; \ + *(.eh_frame) \ + __pi___eh_frame_end = .; \ + } +#else +#define UNWIND_DATA_SECTIONS +#endif + /* * The size of the PE/COFF section that covers the kernel image, which - * runs from stext to _edata, must be a round multiple of the PE/COFF - * FileAlignment, which we set to its minimum value of 0x200. 'stext' + * runs from _stext to _edata, must be a round multiple of the PE/COFF + * FileAlignment, which we set to its minimum value of 0x200. '_stext' * itself is 4 KB aligned, so padding out _edata to a 0x200 aligned * boundary should be sufficient. */ @@ -92,78 +168,83 @@ SECTIONS * matching the same input section name. There is no documented * order of matching. */ + DISCARDS /DISCARD/ : { - ARM_EXIT_DISCARD(EXIT_TEXT) - ARM_EXIT_DISCARD(EXIT_DATA) - EXIT_CALL - *(.discard) - *(.discard.*) *(.interp .dynamic) *(.dynsym .dynstr .hash .gnu.hash) - *(.eh_frame) + *(.ARM.attributes) } - . = KIMAGE_VADDR + TEXT_OFFSET; + . 
= KIMAGE_VADDR; .head.text : { _text = .; HEAD_TEXT } - .text : { /* Real text segment */ + .text : ALIGN(SEGMENT_ALIGN) { /* Real text segment */ _stext = .; /* Text and read-only data */ - __exception_text_start = .; - *(.exception.text) - __exception_text_end = .; IRQENTRY_TEXT SOFTIRQENTRY_TEXT ENTRY_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT HYPERVISOR_TEXT - IDMAP_TEXT - HIBERNATE_TEXT - TRAMP_TEXT - *(.fixup) *(.gnu.warning) - . = ALIGN(16); - *(.got) /* Global offset table */ } . = ALIGN(SEGMENT_ALIGN); _etext = .; /* End of text section */ - RO_DATA(PAGE_SIZE) /* everything from this point to */ - EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */ - NOTES + /* everything from this point to __init_begin will be marked RO NX */ + RO_DATA(PAGE_SIZE) + + HYPERVISOR_RODATA_SECTIONS + + .got : { *(.got) } + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the lazy dispatch entries. + */ + .got.plt : { *(.got.plt) } + ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, + "Unexpected GOT/PLT entries detected!") + + /* code sections that are never executed via the kernel mapping */ + .rodata.text : { + TRAMP_TEXT + HIBERNATE_TEXT + KEXEC_TEXT + IDMAP_TEXT + . = ALIGN(PAGE_SIZE); + } - . = ALIGN(PAGE_SIZE); idmap_pg_dir = .; - . += IDMAP_DIR_SIZE; + . += PAGE_SIZE; #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 tramp_pg_dir = .; . += PAGE_SIZE; #endif -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - reserved_ttbr0 = .; - . += RESERVED_TTBR0_SIZE; -#endif + reserved_pg_dir = .; + . += PAGE_SIZE; + swapper_pg_dir = .; . += PAGE_SIZE; - swapper_pg_end = .; . = ALIGN(SEGMENT_ALIGN); __init_begin = .; __inittext_begin = .; INIT_TEXT_SECTION(8) + + __exittext_begin = .; .exit.text : { - ARM_EXIT_KEEP(EXIT_TEXT) + EXIT_TEXT } + __exittext_end = .; . = ALIGN(4); .altinstructions : { @@ -171,42 +252,60 @@ SECTIONS *(.altinstructions) __alt_instructions_end = .; } - .altinstr_replacement : { - *(.altinstr_replacement) - } - . = ALIGN(PAGE_SIZE); + UNWIND_DATA_SECTIONS + + . = ALIGN(SEGMENT_ALIGN); __inittext_end = .; __initdata_begin = .; + __pi_init_idmap_pg_dir = .; + . += INIT_IDMAP_DIR_SIZE; + __pi_init_idmap_pg_end = .; + .init.data : { INIT_DATA INIT_SETUP(16) INIT_CALLS CON_INITCALL INIT_RAM_FS - *(.init.rodata.* .init.bss) /* from the EFI stub */ + *(.init.altinstructions .init.bss) /* from the EFI stub */ } .exit.data : { - ARM_EXIT_KEEP(EXIT_DATA) + EXIT_DATA } + RUNTIME_CONST_VARIABLES + PERCPU_SECTION(L1_CACHE_BYTES) + HYPERVISOR_PERCPU_SECTION + + HYPERVISOR_RELOC_SECTION .rela.dyn : ALIGN(8) { + __pi_rela_start = .; *(.rela .rela*) + __pi_rela_end = .; } - __rela_offset = ABSOLUTE(ADDR(.rela.dyn) - KIMAGE_VADDR); - __rela_size = SIZEOF(.rela.dyn); + .relr.dyn : ALIGN(8) { + __pi_relr_start = .; + *(.relr.dyn) + __pi_relr_end = .; + } . = ALIGN(SEGMENT_ALIGN); __initdata_end = .; __init_end = .; + .data.rel.ro : { *(.data.rel.ro) } + ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") + _data = .; _sdata = .; - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + + HYPERVISOR_DATA_SECTION /* * Data written with the MMU off but read with the MMU on requires @@ -230,38 +329,82 @@ SECTIONS __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin); _edata = .; - BSS_SECTION(0, 0, 0) + /* start of zero-init region */ + BSS_SECTION(SBSS_ALIGN, 0, 0) + __pi___bss_start = __bss_start; . = ALIGN(PAGE_SIZE); - init_pg_dir = .; + __pi_init_pg_dir = .; . 
+= INIT_DIR_SIZE; - init_pg_end = .; + __pi_init_pg_end = .; + /* end of zero-init region */ + + . += SZ_4K; /* stack for the early C runtime */ + early_init_stack = .; + . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; + __pi__end = .; STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS HEAD_SYMBOLS + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .plt : { + *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") } +#include "image-vars.h" + /* - * The HYP init code and ID map text can't be longer than a page each, - * and should not cross a page boundary. + * The HYP init code and ID map text can't be longer than a page each. The + * former is page-aligned, but the latter may not be with 16K or 64K pages, so + * it should also not cross a page boundary. */ -ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, - "HYP init code too big or misaligned") +ASSERT(__hyp_idmap_text_end - __hyp_idmap_text_start <= PAGE_SIZE, + "HYP init code too big") ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, "ID map text too big or misaligned") #ifdef CONFIG_HIBERNATION -ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) - <= SZ_4K, "Hibernate exit text too big or misaligned") +ASSERT(__hibernate_exit_text_end - __hibernate_exit_text_start <= SZ_4K, + "Hibernate exit text is bigger than 4 KiB") +ASSERT(__hibernate_exit_text_start == swsusp_arch_suspend_exit, + "Hibernate exit text does not start with swsusp_arch_suspend_exit") #endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) <= 3*PAGE_SIZE, "Entry trampoline text too big") #endif +#ifdef CONFIG_KVM +ASSERT(__hyp_bss_start == __bss_start, "HYP and Host BSS are misaligned") +#endif /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ -ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned") +ASSERT(_text == KIMAGE_VADDR, "HEAD is misaligned") + +ASSERT(swapper_pg_dir - reserved_pg_dir == RESERVED_SWAPPER_OFFSET, + "RESERVED_SWAPPER_OFFSET is wrong!") + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +ASSERT(swapper_pg_dir - tramp_pg_dir == TRAMP_SWAPPER_OFFSET, + "TRAMP_SWAPPER_OFFSET is wrong!") +#endif + +#ifdef CONFIG_KEXEC_CORE +/* kexec relocation code should fit into one KEXEC_CONTROL_PAGE_SIZE */ +ASSERT(__relocate_new_kernel_end - __relocate_new_kernel_start <= SZ_4K, + "kexec relocation code is bigger than 4 KiB") +ASSERT(KEXEC_CONTROL_PAGE_SIZE >= SZ_4K, "KEXEC_CONTROL_PAGE_SIZE is broken") +ASSERT(__relocate_new_kernel_start == arm64_relocate_new_kernel, + "kexec control page does not start with arm64_relocate_new_kernel") +#endif diff --git a/arch/arm64/kernel/watchdog_hld.c b/arch/arm64/kernel/watchdog_hld.c new file mode 100644 index 000000000000..3093037dcb7b --- /dev/null +++ b/arch/arm64/kernel/watchdog_hld.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/nmi.h> +#include <linux/cpufreq.h> +#include <linux/perf/arm_pmu.h> + +/* + * Safe maximum CPU frequency in case a particular platform doesn't implement + * cpufreq driver. 
Although, architecture doesn't put any restrictions on + * maximum frequency but 5 GHz seems to be safe maximum given the available + * Arm CPUs in the market which are clocked much less than 5 GHz. On the other + * hand, we can't make it much higher as it would lead to a large hard-lockup + * detection timeout on parts which are running slower (eg. 1GHz on + * Developerbox) and doesn't possess a cpufreq driver. + */ +#define SAFE_MAX_CPU_FREQ 5000000000UL // 5 GHz +u64 hw_nmi_get_sample_period(int watchdog_thresh) +{ + unsigned int cpu = smp_processor_id(); + unsigned long max_cpu_freq; + + max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL; + if (!max_cpu_freq) + max_cpu_freq = SAFE_MAX_CPU_FREQ; + + return (u64)max_cpu_freq * watchdog_thresh; +} + +bool __init arch_perf_nmi_is_available(void) +{ + /* + * hardlockup_detector_perf_init() will success even if Pseudo-NMI turns off, + * however, the pmu interrupts will act like a normal interrupt instead of + * NMI and the hardlockup detector would be broken. + */ + return arm_pmu_irq_is_nmi(); +} + +static int watchdog_perf_update_period(void *data) +{ + int cpu = smp_processor_id(); + u64 max_cpu_freq, new_period; + + max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL; + if (!max_cpu_freq) + return 0; + + new_period = watchdog_thresh * max_cpu_freq; + hardlockup_detector_perf_adjust_period(new_period); + + return 0; +} + +static int watchdog_freq_notifier_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = data; + int cpu; + + if (val != CPUFREQ_CREATE_POLICY) + return NOTIFY_DONE; + + /* + * Let each online CPU related to the policy update the period by their + * own. This will serialize with the framework on start/stop the lockup + * detector (softlockup_{start,stop}_all) and avoid potential race + * condition. Otherwise we may have below theoretical race condition: + * (core 0/1 share the same policy) + * [core 0] [core 1] + * hardlockup_detector_event_create() + * hw_nmi_get_sample_period() + * (cpufreq registered, notifier callback invoked) + * watchdog_freq_notifier_callback() + * watchdog_perf_update_period() + * (since core 1's event's not yet created, + * the period is not set) + * perf_event_create_kernel_counter() + * (event's period is SAFE_MAX_CPU_FREQ) + */ + for_each_cpu(cpu, policy->cpus) + smp_call_on_cpu(cpu, watchdog_perf_update_period, NULL, false); + + return NOTIFY_DONE; +} + +static struct notifier_block watchdog_freq_notifier = { + .notifier_call = watchdog_freq_notifier_callback, +}; + +static int __init init_watchdog_freq_notifier(void) +{ + return cpufreq_register_notifier(&watchdog_freq_notifier, + CPUFREQ_POLICY_NOTIFIER); +} +core_initcall(init_watchdog_freq_notifier); |
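To make the period arithmetic in hw_nmi_get_sample_period() concrete: the returned value is a cycle count, the maximum CPU frequency in Hz multiplied by the watchdog threshold in seconds, with the 5 GHz constant as a fallback when cpufreq cannot report a maximum. A worked sketch (the 1 GHz / 10 s numbers are just an example):

        #include <stdint.h>
        #include <stdio.h>

        #define SAFE_MAX_CPU_FREQ       5000000000ULL   /* 5 GHz fallback, as above */

        static uint64_t sample_period(uint64_t max_cpu_freq_hz, int watchdog_thresh)
        {
                if (!max_cpu_freq_hz)
                        max_cpu_freq_hz = SAFE_MAX_CPU_FREQ;
                return max_cpu_freq_hz * (uint64_t)watchdog_thresh;
        }

        int main(void)
        {
                /* A 1 GHz Developerbox-class part with a 10 s threshold: 10^10 cycles. */
                printf("%llu\n", (unsigned long long)sample_period(1000000000ULL, 10));
                return 0;
        }

The cpufreq notifier below re-runs the same computation per CPU when a policy is created, so the period tracks the real hardware maximum instead of the 5 GHz fallback.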
