summaryrefslogtreecommitdiff
path: root/arch/arm64
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2020-10-20 08:14:25 -0400
committerPaolo Bonzini <pbonzini@redhat.com>2020-10-20 08:14:25 -0400
commit1b21c8db0e3b71523ada0cf568372ebfcf0d3466 (patch)
tree70e537fadc469c5fddabbdf116ea6090baaf4cf8 /arch/arm64
parent628ade2d0816b2675ab61ba6aadfc9a94e3e1589 (diff)
parent4e5dc64c43192b4fd4c96ac150a8f013065f5f5b (diff)
Merge tag 'kvmarm-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 updates for Linux 5.10 - New page table code for both hypervisor and guest stage-2 - Introduction of a new EL2-private host context - Allow EL2 to have its own private per-CPU variables - Support of PMU event filtering - Complete rework of the Spectre mitigation
Diffstat (limited to 'arch/arm64')
-rw-r--r--arch/arm64/Kconfig26
-rw-r--r--arch/arm64/include/asm/assembler.h29
-rw-r--r--arch/arm64/include/asm/cpucaps.h4
-rw-r--r--arch/arm64/include/asm/cpufeature.h24
-rw-r--r--arch/arm64/include/asm/hyp_image.h36
-rw-r--r--arch/arm64/include/asm/kvm_asm.h192
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h14
-rw-r--r--arch/arm64/include/asm/kvm_host.h75
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h9
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h341
-rw-r--r--arch/arm64/include/asm/kvm_pgtable.h309
-rw-r--r--arch/arm64/include/asm/kvm_ptrauth.h6
-rw-r--r--arch/arm64/include/asm/mmu.h11
-rw-r--r--arch/arm64/include/asm/percpu.h28
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h24
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h19
-rw-r--r--arch/arm64/include/asm/processor.h44
-rw-r--r--arch/arm64/include/asm/spectre.h32
-rw-r--r--arch/arm64/include/asm/stage2_pgtable.h215
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h25
-rw-r--r--arch/arm64/kernel/Makefile3
-rw-r--r--arch/arm64/kernel/cpu_errata.c487
-rw-r--r--arch/arm64/kernel/cpufeature.c51
-rw-r--r--arch/arm64/kernel/entry.S10
-rw-r--r--arch/arm64/kernel/hibernate.c6
-rw-r--r--arch/arm64/kernel/image-vars.h5
-rw-r--r--arch/arm64/kernel/process.c23
-rw-r--r--arch/arm64/kernel/proton-pack.c792
-rw-r--r--arch/arm64/kernel/ssbd.c129
-rw-r--r--arch/arm64/kernel/suspend.c3
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S13
-rw-r--r--arch/arm64/kvm/Kconfig3
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/arm.c110
-rw-r--r--arch/arm64/kvm/hyp.S34
-rw-r--r--arch/arm64/kvm/hyp/Makefile3
-rw-r--r--arch/arm64/kvm/hyp/entry.S95
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S107
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/debug-sr.h4
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h48
-rw-r--r--arch/arm64/kvm/hyp/nvhe/.gitignore2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile62
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S187
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S67
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c117
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp.lds.S19
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c52
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c2
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c892
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c35
-rw-r--r--arch/arm64/kvm/hyp/vhe/sysreg-sr.c4
-rw-r--r--arch/arm64/kvm/hypercalls.c33
-rw-r--r--arch/arm64/kvm/inject_fault.c1
-rw-r--r--arch/arm64/kvm/mmu.c1611
-rw-r--r--arch/arm64/kvm/pmu-emul.c195
-rw-r--r--arch/arm64/kvm/pmu.c13
-rw-r--r--arch/arm64/kvm/psci.c74
-rw-r--r--arch/arm64/kvm/reset.c44
-rw-r--r--arch/arm64/kvm/sys_regs.c8
-rw-r--r--arch/arm64/kvm/vgic/vgic-debug.c24
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c4
61 files changed, 3417 insertions, 3420 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6d232837cbee..51259274a819 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1165,32 +1165,6 @@ config UNMAP_KERNEL_AT_EL0
If unsure, say Y.
-config HARDEN_BRANCH_PREDICTOR
- bool "Harden the branch predictor against aliasing attacks" if EXPERT
- default y
- help
- Speculation attacks against some high-performance processors rely on
- being able to manipulate the branch predictor for a victim context by
- executing aliasing branches in the attacker context. Such attacks
- can be partially mitigated against by clearing internal branch
- predictor state and limiting the prediction logic in some situations.
-
- This config option will take CPU-specific actions to harden the
- branch predictor against aliasing attacks and may rely on specific
- instruction sequences or control bits being set by the system
- firmware.
-
- If unsure, say Y.
-
-config ARM64_SSBD
- bool "Speculative Store Bypass Disable" if EXPERT
- default y
- help
- This enables mitigation of the bypassing of previous stores
- by speculative loads.
-
- If unsure, say Y.
-
config RODATA_FULL_DEFAULT_ENABLED
bool "Apply r/o permissions of VM areas also to their linear aliases"
default y
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 54d181177656..ddbe6bf00e33 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -219,6 +219,23 @@ lr .req x30 // link register
.endm
/*
+ * @dst: destination register
+ */
+#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
+ .macro this_cpu_offset, dst
+ mrs \dst, tpidr_el2
+ .endm
+#else
+ .macro this_cpu_offset, dst
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+ mrs \dst, tpidr_el1
+alternative_else
+ mrs \dst, tpidr_el2
+alternative_endif
+ .endm
+#endif
+
+ /*
* @dst: Result of per_cpu(sym, smp_processor_id()) (can be SP)
* @sym: The name of the per-cpu variable
* @tmp: scratch register
@@ -226,11 +243,7 @@ lr .req x30 // link register
.macro adr_this_cpu, dst, sym, tmp
adrp \tmp, \sym
add \dst, \tmp, #:lo12:\sym
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
- mrs \tmp, tpidr_el1
-alternative_else
- mrs \tmp, tpidr_el2
-alternative_endif
+ this_cpu_offset \tmp
add \dst, \dst, \tmp
.endm
@@ -241,11 +254,7 @@ alternative_endif
*/
.macro ldr_this_cpu dst, sym, tmp
adr_l \dst, \sym
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
- mrs \tmp, tpidr_el1
-alternative_else
- mrs \tmp, tpidr_el2
-alternative_endif
+ this_cpu_offset \tmp
ldr \dst, [\dst, \tmp]
.endm
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 07b643a70710..c4ac9a13ad5f 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -31,13 +31,13 @@
#define ARM64_HAS_DCPOP 21
#define ARM64_SVE 22
#define ARM64_UNMAP_KERNEL_AT_EL0 23
-#define ARM64_HARDEN_BRANCH_PREDICTOR 24
+#define ARM64_SPECTRE_V2 24
#define ARM64_HAS_RAS_EXTN 25
#define ARM64_WORKAROUND_843419 26
#define ARM64_HAS_CACHE_IDC 27
#define ARM64_HAS_CACHE_DIC 28
#define ARM64_HW_DBM 29
-#define ARM64_SSBD 30
+#define ARM64_SPECTRE_V4 30
#define ARM64_MISMATCHED_CACHE_TYPE 31
#define ARM64_HAS_STAGE2_FWB 32
#define ARM64_HAS_CRC32 33
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 89b4f0142c28..fba6700b457b 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -698,30 +698,6 @@ static inline bool system_supports_tlb_range(void)
cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
}
-#define ARM64_BP_HARDEN_UNKNOWN -1
-#define ARM64_BP_HARDEN_WA_NEEDED 0
-#define ARM64_BP_HARDEN_NOT_REQUIRED 1
-
-int get_spectre_v2_workaround_state(void);
-
-#define ARM64_SSBD_UNKNOWN -1
-#define ARM64_SSBD_FORCE_DISABLE 0
-#define ARM64_SSBD_KERNEL 1
-#define ARM64_SSBD_FORCE_ENABLE 2
-#define ARM64_SSBD_MITIGATED 3
-
-static inline int arm64_get_ssbd_state(void)
-{
-#ifdef CONFIG_ARM64_SSBD
- extern int ssbd_state;
- return ssbd_state;
-#else
- return ARM64_SSBD_UNKNOWN;
-#endif
-}
-
-void arm64_set_ssbd_mitigation(bool state);
-
extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h
new file mode 100644
index 000000000000..daa1a1da539e
--- /dev/null
+++ b/arch/arm64/include/asm/hyp_image.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 Google LLC.
+ * Written by David Brazdil <dbrazdil@google.com>
+ */
+
+#ifndef __ARM64_HYP_IMAGE_H__
+#define __ARM64_HYP_IMAGE_H__
+
+/*
+ * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
+ * to separate it from the kernel proper.
+ */
+#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym
+
+#ifdef LINKER_SCRIPT
+
+/*
+ * KVM nVHE ELF section names are prefixed with .hyp, to separate them
+ * from the kernel proper.
+ */
+#define HYP_SECTION_NAME(NAME) .hyp##NAME
+
+/* Defines an ELF hyp section from input section @NAME and its subsections. */
+#define HYP_SECTION(NAME) \
+ HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) }
+
+/*
+ * Defines a linker script alias of a kernel-proper symbol referenced by
+ * KVM nVHE hyp code.
+ */
+#define KVM_NVHE_ALIAS(sym) kvm_nvhe_sym(sym) = sym;
+
+#endif /* LINKER_SCRIPT */
+
+#endif /* __ARM64_HYP_IMAGE_H__ */
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 6f98fbd0ac81..54387ccd1ab2 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -7,11 +7,9 @@
#ifndef __ARM_KVM_ASM_H__
#define __ARM_KVM_ASM_H__
+#include <asm/hyp_image.h>
#include <asm/virt.h>
-#define VCPU_WORKAROUND_2_FLAG_SHIFT 0
-#define VCPU_WORKAROUND_2_FLAG (_AC(1, UL) << VCPU_WORKAROUND_2_FLAG_SHIFT)
-
#define ARM_EXIT_WITH_SERROR_BIT 31
#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
#define ARM_EXCEPTION_IS_TRAP(x) (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_TRAP)
@@ -38,17 +36,34 @@
#define __SMCCC_WORKAROUND_1_SMC_SZ 36
+#define KVM_HOST_SMCCC_ID(id) \
+ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_64, \
+ ARM_SMCCC_OWNER_VENDOR_HYP, \
+ (id))
+
+#define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name)
+
+#define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0
+#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1
+#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5
+#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6
+#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11
+#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14
+
#ifndef __ASSEMBLY__
#include <linux/mm.h>
-/*
- * Translate name of a symbol defined in nVHE hyp to the name seen
- * by kernel proper. All nVHE symbols are prefixed by the build system
- * to avoid clashes with the VHE variants.
- */
-#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym
-
#define DECLARE_KVM_VHE_SYM(sym) extern char sym[]
#define DECLARE_KVM_NVHE_SYM(sym) extern char kvm_nvhe_sym(sym)[]
@@ -60,10 +75,53 @@
DECLARE_KVM_VHE_SYM(sym); \
DECLARE_KVM_NVHE_SYM(sym)
+#define DECLARE_KVM_VHE_PER_CPU(type, sym) \
+ DECLARE_PER_CPU(type, sym)
+#define DECLARE_KVM_NVHE_PER_CPU(type, sym) \
+ DECLARE_PER_CPU(type, kvm_nvhe_sym(sym))
+
+#define DECLARE_KVM_HYP_PER_CPU(type, sym) \
+ DECLARE_KVM_VHE_PER_CPU(type, sym); \
+ DECLARE_KVM_NVHE_PER_CPU(type, sym)
+
+/*
+ * Compute pointer to a symbol defined in nVHE percpu region.
+ * Returns NULL if percpu memory has not been allocated yet.
+ */
+#define this_cpu_ptr_nvhe_sym(sym) per_cpu_ptr_nvhe_sym(sym, smp_processor_id())
+#define per_cpu_ptr_nvhe_sym(sym, cpu) \
+ ({ \
+ unsigned long base, off; \
+ base = kvm_arm_hyp_percpu_base[cpu]; \
+ off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \
+ (unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \
+ base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \
+ })
+
+#if defined(__KVM_NVHE_HYPERVISOR__)
+
+#define CHOOSE_NVHE_SYM(sym) sym
+#define CHOOSE_HYP_SYM(sym) CHOOSE_NVHE_SYM(sym)
+
+/* The nVHE hypervisor shouldn't even try to access VHE symbols */
+extern void *__nvhe_undefined_symbol;
+#define CHOOSE_VHE_SYM(sym) __nvhe_undefined_symbol
+#define this_cpu_ptr_hyp_sym(sym) (&__nvhe_undefined_symbol)
+#define per_cpu_ptr_hyp_sym(sym, cpu) (&__nvhe_undefined_symbol)
+
+#elif defined(__KVM_VHE_HYPERVISOR__)
+
#define CHOOSE_VHE_SYM(sym) sym
-#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym)
+#define CHOOSE_HYP_SYM(sym) CHOOSE_VHE_SYM(sym)
+
+/* The VHE hypervisor shouldn't even try to access nVHE symbols */
+extern void *__vhe_undefined_symbol;
+#define CHOOSE_NVHE_SYM(sym) __vhe_undefined_symbol
+#define this_cpu_ptr_hyp_sym(sym) (&__vhe_undefined_symbol)
+#define per_cpu_ptr_hyp_sym(sym, cpu) (&__vhe_undefined_symbol)
+
+#else
-#ifndef __KVM_NVHE_HYPERVISOR__
/*
* BIG FAT WARNINGS:
*
@@ -75,12 +133,21 @@
* - Don't let the nVHE hypervisor have access to this, as it will
* pick the *wrong* symbol (yes, it runs at EL2...).
*/
-#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() ? CHOOSE_VHE_SYM(sym) \
+#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() \
+ ? CHOOSE_VHE_SYM(sym) \
: CHOOSE_NVHE_SYM(sym))
-#else
-/* The nVHE hypervisor shouldn't even try to access anything */
-extern void *__nvhe_undefined_symbol;
-#define CHOOSE_HYP_SYM(sym) __nvhe_undefined_symbol
+
+#define this_cpu_ptr_hyp_sym(sym) (is_kernel_in_hyp_mode() \
+ ? this_cpu_ptr(&sym) \
+ : this_cpu_ptr_nvhe_sym(sym))
+
+#define per_cpu_ptr_hyp_sym(sym, cpu) (is_kernel_in_hyp_mode() \
+ ? per_cpu_ptr(&sym, cpu) \
+ : per_cpu_ptr_nvhe_sym(sym, cpu))
+
+#define CHOOSE_VHE_SYM(sym) sym
+#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym)
+
#endif
/* Translate a kernel address @ptr into its equivalent linear mapping */
@@ -98,15 +165,19 @@ struct kvm_vcpu;
struct kvm_s2_mmu;
DECLARE_KVM_NVHE_SYM(__kvm_hyp_init);
+DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector);
DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init)
+#define __kvm_hyp_host_vector CHOOSE_NVHE_SYM(__kvm_hyp_host_vector)
#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
+extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
+DECLARE_KVM_NVHE_SYM(__per_cpu_start);
+DECLARE_KVM_NVHE_SYM(__per_cpu_end);
+
extern atomic_t arm64_el2_vector_last_slot;
DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
#define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
-#endif
extern void __kvm_flush_vm_context(void);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
@@ -149,26 +220,6 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
addr; \
})
-/*
- * Home-grown __this_cpu_{ptr,read} variants that always work at HYP,
- * provided that sym is really a *symbol* and not a pointer obtained from
- * a data structure. As for SHIFT_PERCPU_PTR(), the creative casting keeps
- * sparse quiet.
- */
-#define __hyp_this_cpu_ptr(sym) \
- ({ \
- void *__ptr; \
- __verify_pcpu_ptr(&sym); \
- __ptr = hyp_symbol_addr(sym); \
- __ptr += read_sysreg(tpidr_el2); \
- (typeof(sym) __kernel __force *)__ptr; \
- })
-
-#define __hyp_this_cpu_read(sym) \
- ({ \
- *__hyp_this_cpu_ptr(sym); \
- })
-
#define __KVM_EXTABLE(from, to) \
" .pushsection __kvm_ex_table, \"a\"\n" \
" .align 3\n" \
@@ -199,20 +250,8 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
#else /* __ASSEMBLY__ */
-.macro hyp_adr_this_cpu reg, sym, tmp
- adr_l \reg, \sym
- mrs \tmp, tpidr_el2
- add \reg, \reg, \tmp
-.endm
-
-.macro hyp_ldr_this_cpu reg, sym, tmp
- adr_l \reg, \sym
- mrs \tmp, tpidr_el2
- ldr \reg, [\reg, \tmp]
-.endm
-
.macro get_host_ctxt reg, tmp
- hyp_adr_this_cpu \reg, kvm_host_data, \tmp
+ adr_this_cpu \reg, kvm_host_data, \tmp
add \reg, \reg, #HOST_DATA_CONTEXT
.endm
@@ -221,6 +260,16 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
.endm
+.macro get_loaded_vcpu vcpu, ctxt
+ adr_this_cpu \ctxt, kvm_hyp_ctxt, \vcpu
+ ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
+.endm
+
+.macro set_loaded_vcpu vcpu, ctxt, tmp
+ adr_this_cpu \ctxt, kvm_hyp_ctxt, \tmp
+ str \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
+.endm
+
/*
* KVM extable for unexpected exceptions.
* In the same format _asm_extable, but output to a different section so that
@@ -236,6 +285,45 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
.popsection
.endm
+#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
+#define CPU_LR_OFFSET CPU_XREG_OFFSET(30)
+#define CPU_SP_EL0_OFFSET (CPU_LR_OFFSET + 8)
+
+/*
+ * We treat x18 as callee-saved as the host may use it as a platform
+ * register (e.g. for shadow call stack).
+ */
+.macro save_callee_saved_regs ctxt
+ str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
+ stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
+ stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
+ stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
+ stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+ stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+ stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro restore_callee_saved_regs ctxt
+ // We require \ctxt is not x18-x28
+ ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
+ ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
+ ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
+ ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
+ ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+ ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+ ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro save_sp_el0 ctxt, tmp
+ mrs \tmp, sp_el0
+ str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
+.endm
+
+.macro restore_sp_el0 ctxt, tmp
+ ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
+ msr sp_el0, \tmp
+.endm
+
#endif
#endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 1cc5f5f72d0b..5ef2669ccd6c 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -391,20 +391,6 @@ static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
}
-static inline bool kvm_arm_get_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG;
-}
-
-static inline void kvm_arm_set_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu,
- bool flag)
-{
- if (flag)
- vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
- else
- vcpu->arch.workaround_flags &= ~VCPU_WORKAROUND_2_FLAG;
-}
-
static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
{
if (vcpu_mode_is_32bit(vcpu)) {
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 905c2b87e05a..0aecbab6a7fb 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -11,6 +11,7 @@
#ifndef __ARM64_KVM_HOST_H__
#define __ARM64_KVM_HOST_H__
+#include <linux/arm-smccc.h>
#include <linux/bitmap.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -79,8 +80,8 @@ struct kvm_s2_mmu {
* for vEL1/EL0 with vHCR_EL2.VM == 0. In that case, we use the
* canonical stage-2 page tables.
*/
- pgd_t *pgd;
phys_addr_t pgd_phys;
+ struct kvm_pgtable *pgt;
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
@@ -110,6 +111,13 @@ struct kvm_arch {
* supported.
*/
bool return_nisv_io_abort_to_user;
+
+ /*
+ * VM-wide PMU filter, implemented as a bitmap and big enough for
+ * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
+ */
+ unsigned long *pmu_filter;
+ unsigned int pmuver;
};
struct kvm_vcpu_fault_info {
@@ -262,8 +270,6 @@ struct kvm_host_data {
struct kvm_pmu_events pmu_events;
};
-typedef struct kvm_host_data kvm_host_data_t;
-
struct vcpu_reset_state {
unsigned long pc;
unsigned long r0;
@@ -480,18 +486,15 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);
-u64 __kvm_call_hyp(void *hypfn, ...);
-
-#define kvm_call_hyp_nvhe(f, ...) \
- do { \
- DECLARE_KVM_NVHE_SYM(f); \
- __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \
- } while(0)
-
-#define kvm_call_hyp_nvhe_ret(f, ...) \
+#define kvm_call_hyp_nvhe(f, ...) \
({ \
- DECLARE_KVM_NVHE_SYM(f); \
- __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \
+ struct arm_smccc_res res; \
+ \
+ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \
+ ##__VA_ARGS__, &res); \
+ WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \
+ \
+ res.a1; \
})
/*
@@ -517,7 +520,7 @@ u64 __kvm_call_hyp(void *hypfn, ...);
ret = f(__VA_ARGS__); \
isb(); \
} else { \
- ret = kvm_call_hyp_nvhe_ret(f, ##__VA_ARGS__); \
+ ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \
} \
\
ret; \
@@ -565,7 +568,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
-DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data);
+DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data);
static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
{
@@ -631,46 +634,6 @@ static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {}
static inline void kvm_clr_pmu_events(u32 clr) {}
#endif
-#define KVM_BP_HARDEN_UNKNOWN -1
-#define KVM_BP_HARDEN_WA_NEEDED 0
-#define KVM_BP_HARDEN_NOT_REQUIRED 1
-
-static inline int kvm_arm_harden_branch_predictor(void)
-{
- switch (get_spectre_v2_workaround_state()) {
- case ARM64_BP_HARDEN_WA_NEEDED:
- return KVM_BP_HARDEN_WA_NEEDED;
- case ARM64_BP_HARDEN_NOT_REQUIRED:
- return KVM_BP_HARDEN_NOT_REQUIRED;
- case ARM64_BP_HARDEN_UNKNOWN:
- default:
- return KVM_BP_HARDEN_UNKNOWN;
- }
-}
-
-#define KVM_SSBD_UNKNOWN -1
-#define KVM_SSBD_FORCE_DISABLE 0
-#define KVM_SSBD_KERNEL 1
-#define KVM_SSBD_FORCE_ENABLE 2
-#define KVM_SSBD_MITIGATED 3
-
-static inline int kvm_arm_have_ssbd(void)
-{
- switch (arm64_get_ssbd_state()) {
- case ARM64_SSBD_FORCE_DISABLE:
- return KVM_SSBD_FORCE_DISABLE;
- case ARM64_SSBD_KERNEL:
- return KVM_SSBD_KERNEL;
- case ARM64_SSBD_FORCE_ENABLE:
- return KVM_SSBD_FORCE_ENABLE;
- case ARM64_SSBD_MITIGATED:
- return KVM_SSBD_MITIGATED;
- case ARM64_SSBD_UNKNOWN:
- default:
- return KVM_SSBD_UNKNOWN;
- }
-}
-
void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 46689e7db46c..6b664de5ec1f 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -12,6 +12,9 @@
#include <asm/alternative.h>
#include <asm/sysreg.h>
+DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
+
#define read_sysreg_elx(r,nvh,vh) \
({ \
u64 reg; \
@@ -87,11 +90,11 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
void deactivate_traps_vhe_put(void);
#endif
-u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
+u64 __guest_enter(struct kvm_vcpu *vcpu);
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt);
+void __noreturn hyp_panic(void);
#ifdef __KVM_NVHE_HYPERVISOR__
-void __noreturn __hyp_do_panic(unsigned long, ...);
+void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
#endif
#endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 189839c3706a..331394306cce 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -9,6 +9,7 @@
#include <asm/page.h>
#include <asm/memory.h>
+#include <asm/mmu.h>
#include <asm/cpufeature.h>
/*
@@ -43,16 +44,6 @@
* HYP_VA_MIN = 1 << (VA_BITS - 1)
* HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
*
- * This of course assumes that the trampoline page exists within the
- * VA_BITS range. If it doesn't, then it means we're in the odd case
- * where the kernel idmap (as well as HYP) uses more levels than the
- * kernel runtime page tables (as seen when the kernel is configured
- * for 4k pages, 39bits VA, and yet memory lives just above that
- * limit, forcing the idmap to use 4 levels of page tables while the
- * kernel itself only uses 3). In this particular case, it doesn't
- * matter which side of VA_BITS we use, as we're guaranteed not to
- * conflict with anything.
- *
* When using VHE, there are no separate hyp mappings and all KVM
* functionality is already mapped as part of the main kernel
* mappings, and none of this applies in that case.
@@ -117,15 +108,10 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm))
#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL))
-static inline bool kvm_page_empty(void *ptr)
-{
- struct page *ptr_page = virt_to_page(ptr);
- return page_count(ptr_page) == 1;
-}
-
+#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>
-int create_hyp_mappings(void *from, void *to, pgprot_t prot);
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **kaddr,
void __iomem **haddr);
@@ -141,149 +127,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
-
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
int kvm_mmu_init(void);
-void kvm_clear_hyp_idmap(void);
-
-#define kvm_mk_pmd(ptep) \
- __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE)
-#define kvm_mk_pud(pmdp) \
- __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE)
-#define kvm_mk_p4d(pmdp) \
- __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE)
-
-#define kvm_set_pud(pudp, pud) set_pud(pudp, pud)
-
-#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot)
-#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot)
-#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot)
-
-#define kvm_pud_pfn(pud) pud_pfn(pud)
-
-#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd)
-#define kvm_pud_mkhuge(pud) pud_mkhuge(pud)
-
-static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
-{
- pte_val(pte) |= PTE_S2_RDWR;
- return pte;
-}
-
-static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
-{
- pmd_val(pmd) |= PMD_S2_RDWR;
- return pmd;
-}
-
-static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
-{
- pud_val(pud) |= PUD_S2_RDWR;
- return pud;
-}
-
-static inline pte_t kvm_s2pte_mkexec(pte_t pte)
-{
- pte_val(pte) &= ~PTE_S2_XN;
- return pte;
-}
-
-static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
-{
- pmd_val(pmd) &= ~PMD_S2_XN;
- return pmd;
-}
-
-static inline pud_t kvm_s2pud_mkexec(pud_t pud)
-{
- pud_val(pud) &= ~PUD_S2_XN;
- return pud;
-}
-
-static inline void kvm_set_s2pte_readonly(pte_t *ptep)
-{
- pteval_t old_pteval, pteval;
-
- pteval = READ_ONCE(pte_val(*ptep));
- do {
- old_pteval = pteval;
- pteval &= ~PTE_S2_RDWR;
- pteval |= PTE_S2_RDONLY;
- pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
- } while (pteval != old_pteval);
-}
-
-static inline bool kvm_s2pte_readonly(pte_t *ptep)
-{
- return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY;
-}
-
-static inline bool kvm_s2pte_exec(pte_t *ptep)
-{
- return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN);
-}
-
-static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp)
-{
- kvm_set_s2pte_readonly((pte_t *)pmdp);
-}
-
-static inline bool kvm_s2pmd_readonly(pmd_t *pmdp)
-{
- return kvm_s2pte_readonly((pte_t *)pmdp);
-}
-
-static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
-{
- return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
-}
-
-static inline void kvm_set_s2pud_readonly(pud_t *pudp)
-{
- kvm_set_s2pte_readonly((pte_t *)pudp);
-}
-
-static inline bool kvm_s2pud_readonly(pud_t *pudp)
-{
- return kvm_s2pte_readonly((pte_t *)pudp);
-}
-
-static inline bool kvm_s2pud_exec(pud_t *pudp)
-{
- return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN);
-}
-
-static inline pud_t kvm_s2pud_mkyoung(pud_t pud)
-{
- return pud_mkyoung(pud);
-}
-
-static inline bool kvm_s2pud_young(pud_t pud)
-{
- return pud_young(pud);
-}
-
-#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
-
-#ifdef __PAGETABLE_PMD_FOLDED
-#define hyp_pmd_table_empty(pmdp) (0)
-#else
-#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
-#endif
-
-#ifdef __PAGETABLE_PUD_FOLDED
-#define hyp_pud_table_empty(pudp) (0)
-#else
-#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
-#endif
-
-#ifdef __PAGETABLE_P4D_FOLDED
-#define hyp_p4d_table_empty(p4dp) (0)
-#else
-#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp)
-#endif
struct kvm;
@@ -325,77 +171,9 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
}
}
-static inline void __kvm_flush_dcache_pte(pte_t pte)
-{
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
- struct page *page = pte_page(pte);
- kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE);
- }
-}
-
-static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
-{
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
- struct page *page = pmd_page(pmd);
- kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE);
- }
-}
-
-static inline void __kvm_flush_dcache_pud(pud_t pud)
-{
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
- struct page *page = pud_page(pud);
- kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE);
- }
-}
-
void kvm_set_way_flush(struct kvm_vcpu *vcpu);
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
-static inline bool __kvm_cpu_uses_extended_idmap(void)
-{
- return __cpu_uses_extended_idmap_level();
-}
-
-static inline unsigned long __kvm_idmap_ptrs_per_pgd(void)
-{
- return idmap_ptrs_per_pgd;
-}
-
-/*
- * Can't use pgd_populate here, because the extended idmap adds an extra level
- * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended
- * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4.
- */
-static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
- pgd_t *hyp_pgd,
- pgd_t *merged_hyp_pgd,
- unsigned long hyp_idmap_start)
-{
- int idmap_idx;
- u64 pgd_addr;
-
- /*
- * Use the first entry to access the HYP mappings. It is
- * guaranteed to be free, otherwise we wouldn't use an
- * extended idmap.
- */
- VM_BUG_ON(pgd_val(merged_hyp_pgd[0]));
- pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd));
- merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE);
-
- /*
- * Create another extended level entry that points to the boot HYP map,
- * which contains an ID mapping of the HYP init code. We essentially
- * merge the boot and runtime HYP maps by doing so, but they don't
- * overlap anyway, so this is fine.
- */
- idmap_idx = hyp_idmap_start >> VA_BITS;
- VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx]));
- pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd));
- merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE);
-}
-
static inline unsigned int kvm_get_vmid_bits(void)
{
int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
@@ -430,19 +208,17 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
return ret;
}
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
/*
* EL2 vectors can be mapped and rerouted in a number of ways,
* depending on the kernel configuration and CPU present:
*
- * - If the CPU has the ARM64_HARDEN_BRANCH_PREDICTOR cap, the
- * hardening sequence is placed in one of the vector slots, which is
- * executed before jumping to the real vectors.
+ * - If the CPU is affected by Spectre-v2, the hardening sequence is
+ * placed in one of the vector slots, which is executed before jumping
+ * to the real vectors.
*
- * - If the CPU has both the ARM64_HARDEN_EL2_VECTORS cap and the
- * ARM64_HARDEN_BRANCH_PREDICTOR cap, the slot containing the
- * hardening sequence is mapped next to the idmap page, and executed
- * before jumping to the real vectors.
+ * - If the CPU also has the ARM64_HARDEN_EL2_VECTORS cap, the slot
+ * containing the hardening sequence is mapped next to the idmap page,
+ * and executed before jumping to the real vectors.
*
* - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an
* empty slot is selected, mapped next to the idmap page, and
@@ -452,19 +228,16 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
* VHE, as we don't have hypervisor-specific mappings. If the system
* is VHE and yet selects this capability, it will be ignored.
*/
-#include <asm/mmu.h>
-
extern void *__kvm_bp_vect_base;
extern int __kvm_harden_el2_vector_slot;
-/* This is called on both VHE and !VHE systems */
static inline void *kvm_get_hyp_vector(void)
{
struct bp_hardening_data *data = arm64_get_bp_hardening_data();
void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
int slot = -1;
- if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) && data->fn) {
+ if (cpus_have_const_cap(ARM64_SPECTRE_V2) && data->fn) {
vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
slot = data->hyp_vectors_slot;
}
@@ -481,102 +254,8 @@ static inline void *kvm_get_hyp_vector(void)
return vect;
}
-/* This is only called on a !VHE system */
-static inline int kvm_map_vectors(void)
-{
- /*
- * HBP = ARM64_HARDEN_BRANCH_PREDICTOR
- * HEL2 = ARM64_HARDEN_EL2_VECTORS
- *
- * !HBP + !HEL2 -> use direct vectors
- * HBP + !HEL2 -> use hardened vectors in place
- * !HBP + HEL2 -> allocate one vector slot and use exec mapping
- * HBP + HEL2 -> use hardened vertors and use exec mapping
- */
- if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) {
- __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
- __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
- }
-
- if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
- phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs);
- unsigned long size = __BP_HARDEN_HYP_VECS_SZ;
-
- /*
- * Always allocate a spare vector slot, as we don't
- * know yet which CPUs have a BP hardening slot that
- * we can reuse.
- */
- __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
- BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
- return create_hyp_exec_mappings(vect_pa, size,
- &__kvm_bp_vect_base);
- }
-
- return 0;
-}
-#else
-static inline void *kvm_get_hyp_vector(void)
-{
- return kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
-}
-
-static inline int kvm_map_vectors(void)
-{
- return 0;
-}
-#endif
-
-#ifdef CONFIG_ARM64_SSBD
-DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
-
-static inline int hyp_map_aux_data(void)
-{
- int cpu, err;
-
- for_each_possible_cpu(cpu) {
- u64 *ptr;
-
- ptr = per_cpu_ptr(&arm64_ssbd_callback_required, cpu);
- err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP);
- if (err)
- return err;
- }
- return 0;
-}
-#else
-static inline int hyp_map_aux_data(void)
-{
- return 0;
-}
-#endif
-
#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr)
-/*
- * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
- * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
- * 52bit IPS.
- */
-static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
-{
- int x = ARM64_VTTBR_X(ipa_shift, levels);
-
- return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
-}
-
-static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
-{
- unsigned int x = arm64_vttbr_x(ipa_shift, levels);
-
- return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
-}
-
-static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
-{
- return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
-}
-
static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
{
struct kvm_vmid *vmid = &mmu->vmid;
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
new file mode 100644
index 000000000000..52ab38db04c7
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#ifndef __ARM64_KVM_PGTABLE_H__
+#define __ARM64_KVM_PGTABLE_H__
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+
+typedef u64 kvm_pte_t;
+
+/**
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits: Maximum input address size, in bits.
+ * @start_level: Level at which the page-table walk starts.
+ * @pgd: Pointer to the first top-level entry of the page-table.
+ * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ */
+struct kvm_pgtable {
+ u32 ia_bits;
+ u32 start_level;
+ kvm_pte_t *pgd;
+
+ /* Stage-2 only */
+ struct kvm_s2_mmu *mmu;
+};
+
+/**
+ * enum kvm_pgtable_prot - Page-table permissions and attributes.
+ * @KVM_PGTABLE_PROT_X: Execute permission.
+ * @KVM_PGTABLE_PROT_W: Write permission.
+ * @KVM_PGTABLE_PROT_R: Read permission.
+ * @KVM_PGTABLE_PROT_DEVICE: Device attributes.
+ */
+enum kvm_pgtable_prot {
+ KVM_PGTABLE_PROT_X = BIT(0),
+ KVM_PGTABLE_PROT_W = BIT(1),
+ KVM_PGTABLE_PROT_R = BIT(2),
+
+ KVM_PGTABLE_PROT_DEVICE = BIT(3),
+};
+
+#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
+#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R)
+#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
+
+/**
+ * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
+ * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid
+ * entries.
+ * @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their
+ * children.
+ * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their
+ * children.
+ */
+enum kvm_pgtable_walk_flags {
+ KVM_PGTABLE_WALK_LEAF = BIT(0),
+ KVM_PGTABLE_WALK_TABLE_PRE = BIT(1),
+ KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
+};
+
+typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg);
+
+/**
+ * struct kvm_pgtable_walker - Hook into a page-table walk.
+ * @cb: Callback function to invoke during the walk.
+ * @arg: Argument passed to the callback function.
+ * @flags: Bitwise-OR of flags to identify the entry types on which to
+ * invoke the callback function.
+ */
+struct kvm_pgtable_walker {
+ const kvm_pgtable_visitor_fn_t cb;
+ void * const arg;
+ const enum kvm_pgtable_walk_flags flags;
+};
+
+/**
+ * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
+ * @pgt: Uninitialised page-table structure to initialise.
+ * @va_bits: Maximum virtual address bits.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
+
+/**
+ * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt);
+
+/**
+ * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
+ * @addr: Virtual address at which to place the mapping.
+ * @size: Size of the mapping.
+ * @phys: Physical address of the memory to map.
+ * @prot: Permissions and attributes for the mapping.
+ *
+ * The offset of @addr within a page is ignored, @size is rounded-up to
+ * the next page boundary and @phys is rounded-down to the previous page
+ * boundary.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable. Attempts to install a new mapping
+ * for a virtual address that is already mapped will be rejected with an
+ * error and a WARN().
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+ enum kvm_pgtable_prot prot);
+
+/**
+ * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
+ * @pgt: Uninitialised page-table structure to initialise.
+ * @kvm: KVM structure representing the guest virtual machine.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
+
+/**
+ * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
+
+/**
+ * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address at which to place the mapping.
+ * @size: Size of the mapping.
+ * @phys: Physical address of the memory to map.
+ * @prot: Permissions and attributes for the mapping.
+ * @mc: Cache of pre-allocated GFP_PGTABLE_USER memory from which to
+ * allocate page-table pages.
+ *
+ * The offset of @addr within a page is ignored, @size is rounded-up to
+ * the next page boundary and @phys is rounded-down to the previous page
+ * boundary.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable.
+ *
+ * Note that this function will both coalesce existing table entries and split
+ * existing block mappings, relying on page-faults to fault back areas outside
+ * of the new mapping lazily.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ struct kvm_mmu_memory_cache *mc);
+
+/**
+ * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address from which to remove the mapping.
+ * @size: Size of the mapping.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * TLB invalidation is performed for each page-table entry cleared during the
+ * unmapping operation and the reference count for the page-table page
+ * containing the cleared entry is decremented, with unreferenced pages being
+ * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if
+ * FWB is not supported by the CPU.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
+ * without TLB invalidation.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address from which to write-protect,
+ * @size: Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the updated permissions are visible
+ * to the CPUs.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * set the access flag in that entry.
+ *
+ * Return: The old page-table entry prior to setting the flag, 0 on failure.
+ */
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * clear the access flag in that entry.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the updated permissions are visible
+ * to the CPUs.
+ *
+ * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ */
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
+ * page-table entry.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ * @prot: Additional permissions to grant for the mapping.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * relax the permissions in that entry according to the read, write and
+ * execute permissions specified by @prot. No permissions are removed, and
+ * TLB invalidation is performed after updating the entry.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_prot prot);
+
+/**
+ * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
+ * access flag set.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * Return: True if the page-table entry has the access flag set, false otherwise.
+ */
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
+ * of Coherency for guest stage-2 address
+ * range.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address from which to flush.
+ * @size: Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_walk() - Walk a page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_*_init().
+ * @addr: Input address for the start of the walk.
+ * @size: Size of the range to walk.
+ * @walker: Walker callback description.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * The walker will walk the page-table entries corresponding to the input
+ * address range specified, visiting entries according to the walker flags.
+ * Invalid entries are treated as leaf entries. Leaf entries are reloaded
+ * after invoking the walker callback, allowing the walker to descend into
+ * a newly installed table.
+ *
+ * Returning a negative error code from the walker callback function will
+ * terminate the walk immediately with the same error code.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_pgtable_walker *walker);
+
+#endif /* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h
index 0ddf98c3ba9f..0cd0965255d2 100644
--- a/arch/arm64/include/asm/kvm_ptrauth.h
+++ b/arch/arm64/include/asm/kvm_ptrauth.h
@@ -60,7 +60,7 @@
.endm
/*
- * Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will
+ * Both ptrauth_switch_to_guest and ptrauth_switch_to_hyp macros will
* check for the presence ARM64_HAS_ADDRESS_AUTH, which is defined as
* (ARM64_HAS_ADDRESS_AUTH_ARCH || ARM64_HAS_ADDRESS_AUTH_IMP_DEF) and
* then proceed ahead with the save/restore of Pointer Authentication
@@ -78,7 +78,7 @@ alternative_else_nop_endif
.L__skip_switch\@:
.endm
-.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
+.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
alternative_if_not ARM64_HAS_ADDRESS_AUTH
b .L__skip_switch\@
alternative_else_nop_endif
@@ -96,7 +96,7 @@ alternative_else_nop_endif
#else /* !CONFIG_ARM64_PTR_AUTH */
.macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3
.endm
-.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
+.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
.endm
#endif /* CONFIG_ARM64_PTR_AUTH */
#endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index a7a5ecaa2e83..cbff2d42c1d8 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -45,7 +45,6 @@ struct bp_hardening_data {
bp_hardening_cb_t fn;
};
-#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
@@ -57,21 +56,13 @@ static inline void arm64_apply_bp_hardening(void)
{
struct bp_hardening_data *d;
- if (!cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR))
+ if (!cpus_have_const_cap(ARM64_SPECTRE_V2))
return;
d = arm64_get_bp_hardening_data();
if (d->fn)
d->fn();
}
-#else
-static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
-{
- return NULL;
-}
-
-static inline void arm64_apply_bp_hardening(void) { }
-#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */
extern void arm64_memblock_init(void);
extern void paging_init(void);
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 0b6409b89e5e..1599e17379d8 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -19,7 +19,16 @@ static inline void set_my_cpu_offset(unsigned long off)
:: "r" (off) : "memory");
}
-static inline unsigned long __my_cpu_offset(void)
+static inline unsigned long __hyp_my_cpu_offset(void)
+{
+ /*
+ * Non-VHE hyp code runs with preemption disabled. No need to hazard
+ * the register access against barrier() as in __kern_my_cpu_offset.
+ */
+ return read_sysreg(tpidr_el2);
+}
+
+static inline unsigned long __kern_my_cpu_offset(void)
{
unsigned long off;
@@ -35,7 +44,12 @@ static inline unsigned long __my_cpu_offset(void)
return off;
}
-#define __my_cpu_offset __my_cpu_offset()
+
+#ifdef __KVM_NVHE_HYPERVISOR__
+#define __my_cpu_offset __hyp_my_cpu_offset()
+#else
+#define __my_cpu_offset __kern_my_cpu_offset()
+#endif
#define PERCPU_RW_OPS(sz) \
static inline unsigned long __percpu_read_##sz(void *ptr) \
@@ -227,4 +241,14 @@ PERCPU_RET_OP(add, add, ldadd)
#include <asm-generic/percpu.h>
+/* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. */
+#if defined(__KVM_NVHE_HYPERVISOR__) && defined(CONFIG_DEBUG_PREEMPT)
+#undef this_cpu_ptr
+#define this_cpu_ptr raw_cpu_ptr
+#undef __this_cpu_read
+#define __this_cpu_read raw_cpu_read
+#undef __this_cpu_write
+#define __this_cpu_write raw_cpu_write
+#endif
+
#endif /* __ASM_PERCPU_H */
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index d400a4d9aee2..2b5f822fb033 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -156,7 +156,6 @@
#define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */
#define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */
#define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */
-#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */
#define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
#ifdef CONFIG_ARM64_PA_BITS_52
@@ -173,34 +172,11 @@
#define PTE_ATTRINDX_MASK (_AT(pteval_t, 7) << 2)
/*
- * 2nd stage PTE definitions
- */
-#define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */
-#define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */
-#define PTE_S2_XN (_AT(pteval_t, 2) << 53) /* XN[1:0] */
-#define PTE_S2_SW_RESVD (_AT(pteval_t, 15) << 55) /* Reserved for SW */
-
-#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */
-#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
-#define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */
-#define PMD_S2_SW_RESVD (_AT(pmdval_t, 15) << 55) /* Reserved for SW */
-
-#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */
-#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */
-#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */
-
-/*
* Memory Attribute override for Stage-2 (MemAttr[3:0])
*/
#define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2)
/*
- * EL2/HYP PTE/PMD definitions
- */
-#define PMD_HYP PMD_SECT_USER
-#define PTE_HYP PTE_USER
-
-/*
* Highest possible physical address supported.
*/
#define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS)
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 4d867c6446c4..8f094c43072a 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -56,7 +56,6 @@ extern bool arm64_use_ng_mappings;
#define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
#define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
-#define _HYP_PAGE_DEFAULT _PAGE_DEFAULT
#define PAGE_KERNEL __pgprot(PROT_NORMAL)
#define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
@@ -64,11 +63,6 @@ extern bool arm64_use_ng_mappings;
#define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN)
#define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
-#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
-#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
-#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
-#define PAGE_HYP_DEVICE __pgprot(_PROT_DEFAULT | PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_HYP | PTE_HYP_XN)
-
#define PAGE_S2_MEMATTR(attr) \
({ \
u64 __val; \
@@ -79,19 +73,6 @@ extern bool arm64_use_ng_mappings;
__val; \
})
-#define PAGE_S2_XN \
- ({ \
- u64 __val; \
- if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) \
- __val = 0; \
- else \
- __val = PTE_S2_XN; \
- __val; \
- })
-
-#define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
-#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
-
#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
/* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */
#define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 240fe5e5b720..7d90ea2e2063 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -38,6 +38,7 @@
#include <asm/pgtable-hwdef.h>
#include <asm/pointer_auth.h>
#include <asm/ptrace.h>
+#include <asm/spectre.h>
#include <asm/types.h>
/*
@@ -197,40 +198,15 @@ static inline void start_thread_common(struct pt_regs *regs, unsigned long pc)
regs->pmr_save = GIC_PRIO_IRQON;
}
-static inline void set_ssbs_bit(struct pt_regs *regs)
-{
- regs->pstate |= PSR_SSBS_BIT;
-}
-
-static inline void set_compat_ssbs_bit(struct pt_regs *regs)
-{
- regs->pstate |= PSR_AA32_SSBS_BIT;
-}
-
static inline void start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
{
start_thread_common(regs, pc);
regs->pstate = PSR_MODE_EL0t;
-
- if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
- set_ssbs_bit(regs);
-
+ spectre_v4_enable_task_mitigation(current);
regs->sp = sp;
}
-static inline bool is_ttbr0_addr(unsigned long addr)
-{
- /* entry assembly clears tags for TTBR0 addrs */
- return addr < TASK_SIZE;
-}
-
-static inline bool is_ttbr1_addr(unsigned long addr)
-{
- /* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
- return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
-}
-
#ifdef CONFIG_COMPAT
static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
@@ -244,13 +220,23 @@ static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
regs->pstate |= PSR_AA32_E_BIT;
#endif
- if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
- set_compat_ssbs_bit(regs);
-
+ spectre_v4_enable_task_mitigation(current);
regs->compat_sp = sp;
}
#endif
+static inline bool is_ttbr0_addr(unsigned long addr)
+{
+ /* entry assembly clears tags for TTBR0 addrs */
+ return addr < TASK_SIZE;
+}
+
+static inline bool is_ttbr1_addr(unsigned long addr)
+{
+ /* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
+ return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
+}
+
/* Forward declaration, a strange C thing */
struct task_struct;
diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h
new file mode 100644
index 000000000000..fcdfbce302bd
--- /dev/null
+++ b/arch/arm64/include/asm/spectre.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Interface for managing mitigations for Spectre vulnerabilities.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#ifndef __ASM_SPECTRE_H
+#define __ASM_SPECTRE_H
+
+#include <asm/cpufeature.h>
+
+/* Watch out, ordering is important here. */
+enum mitigation_state {
+ SPECTRE_UNAFFECTED,
+ SPECTRE_MITIGATED,
+ SPECTRE_VULNERABLE,
+};
+
+struct task_struct;
+
+enum mitigation_state arm64_get_spectre_v2_state(void);
+bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope);
+void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
+
+enum mitigation_state arm64_get_spectre_v4_state(void);
+bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope);
+void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
+void spectre_v4_enable_task_mitigation(struct task_struct *tsk);
+
+#endif /* __ASM_SPECTRE_H */
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index 996bf98f0cab..fe341a6578c3 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
@@ -8,7 +8,6 @@
#ifndef __ARM64_S2_PGTABLE_H_
#define __ARM64_S2_PGTABLE_H_
-#include <linux/hugetlb.h>
#include <linux/pgtable.h>
/*
@@ -37,217 +36,12 @@
#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1)
/*
- * The number of PTRS across all concatenated stage2 tables given by the
- * number of bits resolved at the initial level.
- * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
- * in which case, stage2_pgd_ptrs will have one entry.
- */
-#define pgd_ptrs_shift(ipa, pgdir_shift) \
- ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
-#define __s2_pgd_ptrs(ipa, lvls) \
- (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
-#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
-
-#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
-#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
-
-/*
* kvm_mmmu_cache_min_pages() is the number of pages required to install
* a stage-2 translation. We pre-allocate the entry level page table at
* the VM creation.
*/
#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1)
-/* Stage2 PUD definitions when the level is present */
-static inline bool kvm_stage2_has_pud(struct kvm *kvm)
-{
- return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
-}
-
-#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
-#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT)
-#define S2_PUD_MASK (~(S2_PUD_SIZE - 1))
-
-#define stage2_pgd_none(kvm, pgd) pgd_none(pgd)
-#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd)
-#define stage2_pgd_present(kvm, pgd) pgd_present(pgd)
-#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d)
-
-static inline p4d_t *stage2_p4d_offset(struct kvm *kvm,
- pgd_t *pgd, unsigned long address)
-{
- return p4d_offset(pgd, address);
-}
-
-static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d)
-{
-}
-
-static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp)
-{
- return false;
-}
-
-static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm,
- phys_addr_t addr, phys_addr_t end)
-{
- return end;
-}
-
-static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d)
-{
- if (kvm_stage2_has_pud(kvm))
- return p4d_none(p4d);
- else
- return 0;
-}
-
-static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp)
-{
- if (kvm_stage2_has_pud(kvm))
- p4d_clear(p4dp);
-}
-
-static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d)
-{
- if (kvm_stage2_has_pud(kvm))
- return p4d_present(p4d);
- else
- return 1;
-}
-
-static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud)
-{
- if (kvm_stage2_has_pud(kvm))
- p4d_populate(NULL, p4d, pud);
-}
-
-static inline pud_t *stage2_pud_offset(struct kvm *kvm,
- p4d_t *p4d, unsigned long address)
-{
- if (kvm_stage2_has_pud(kvm))
- return pud_offset(p4d, address);
- else
- return (pud_t *)p4d;
-}
-
-static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
-{
- if (kvm_stage2_has_pud(kvm))
- free_page((unsigned long)pud);
-}
-
-static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
-{
- if (kvm_stage2_has_pud(kvm))
- return kvm_page_empty(pudp);
- else
- return false;
-}
-
-static inline phys_addr_t
-stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
- if (kvm_stage2_has_pud(kvm)) {
- phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
-
- return (boundary - 1 < end - 1) ? boundary : end;
- } else {
- return end;
- }
-}
-
-/* Stage2 PMD definitions when the level is present */
-static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
-{
- return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
-}
-
-#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
-#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT)
-#define S2_PMD_MASK (~(S2_PMD_SIZE - 1))
-
-static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pud_none(pud);
- else
- return 0;
-}
-
-static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- pud_clear(pud);
-}
-
-static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pud_present(pud);
- else
- return 1;
-}
-
-static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
-{
- if (kvm_stage2_has_pmd(kvm))
- pud_populate(NULL, pud, pmd);
-}
-
-static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
- pud_t *pud, unsigned long address)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pmd_offset(pud, address);
- else
- return (pmd_t *)pud;
-}
-
-static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
-{
- if (kvm_stage2_has_pmd(kvm))
- free_page((unsigned long)pmd);
-}
-
-static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pud_huge(pud);
- else
- return 0;
-}
-
-static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
-{
- if (kvm_stage2_has_pmd(kvm))
- return kvm_page_empty(pmdp);
- else
- return 0;
-}
-
-static inline phys_addr_t
-stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
- if (kvm_stage2_has_pmd(kvm)) {
- phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
-
- return (boundary - 1 < end - 1) ? boundary : end;
- } else {
- return end;
- }
-}
-
-static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
-{
- return kvm_page_empty(ptep);
-}
-
-static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
-{
- return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
-}
-
static inline phys_addr_t
stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
@@ -256,13 +50,4 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
return (boundary - 1 < end - 1) ? boundary : end;
}
-/*
- * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and
- * the architectural page-table level.
- */
-#define S2_NO_LEVEL_HINT 0
-#define S2_PUD_LEVEL 1
-#define S2_PMD_LEVEL 2
-#define S2_PTE_LEVEL 3
-
#endif /* __ARM64_S2_PGTABLE_H_ */
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index ba85bb23f060..1c17c3a24411 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -159,6 +159,21 @@ struct kvm_sync_regs {
struct kvm_arch_memory_slot {
};
+/*
+ * PMU filter structure. Describe a range of events with a particular
+ * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER.
+ */
+struct kvm_pmu_event_filter {
+ __u16 base_event;
+ __u16 nevents;
+
+#define KVM_PMU_EVENT_ALLOW 0
+#define KVM_PMU_EVENT_DENY 1
+
+ __u8 action;
+ __u8 pad[3];
+};
+
/* for KVM_GET/SET_VCPU_EVENTS */
struct kvm_vcpu_events {
struct {
@@ -242,6 +257,15 @@ struct kvm_vcpu_events {
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL 0
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL 1
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED 2
+
+/*
+ * Only two states can be presented by the host kernel:
+ * - NOT_REQUIRED: the guest doesn't need to do anything
+ * - NOT_AVAIL: the guest isn't mitigated (it can still use SSBS if available)
+ *
+ * All the other values are deprecated. The host still accepts all
+ * values (they are ABI), but will narrow them to the above two.
+ */
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 KVM_REG_ARM_FW_REG(2)
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL 0
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN 1
@@ -329,6 +353,7 @@ struct kvm_vcpu_events {
#define KVM_ARM_VCPU_PMU_V3_CTRL 0
#define KVM_ARM_VCPU_PMU_V3_IRQ 0
#define KVM_ARM_VCPU_PMU_V3_INIT 1
+#define KVM_ARM_VCPU_PMU_V3_FILTER 2
#define KVM_ARM_VCPU_TIMER_CTRL 1
#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index a561cbb91d4d..bd12b9a2ab4a 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -19,7 +19,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
return_address.o cpuinfo.o cpu_errata.o \
cpufeature.o alternative.o cacheinfo.o \
smp.o smp_spin_table.o topology.o smccc-call.o \
- syscall.o
+ syscall.o proton-pack.o
targets += efi-entry.o
@@ -59,7 +59,6 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o
-obj-$(CONFIG_ARM64_SSBD) += ssbd.o
obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index c332d49780dc..6c8303559beb 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -106,365 +106,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap)
sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
}
-atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
-
-#include <asm/mmu_context.h>
-#include <asm/cacheflush.h>
-
-DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
-
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
-static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
- const char *hyp_vecs_end)
-{
- void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K);
- int i;
-
- for (i = 0; i < SZ_2K; i += 0x80)
- memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
-
- __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
-}
-
-static void install_bp_hardening_cb(bp_hardening_cb_t fn,
- const char *hyp_vecs_start,
- const char *hyp_vecs_end)
-{
- static DEFINE_RAW_SPINLOCK(bp_lock);
- int cpu, slot = -1;
-
- /*
- * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if
- * we're a guest. Skip the hyp-vectors work.
- */
- if (!hyp_vecs_start) {
- __this_cpu_write(bp_hardening_data.fn, fn);
- return;
- }
-
- raw_spin_lock(&bp_lock);
- for_each_possible_cpu(cpu) {
- if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
- slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
- break;
- }
- }
-
- if (slot == -1) {
- slot = atomic_inc_return(&arm64_el2_vector_last_slot);
- BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
- __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
- }
-
- __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
- __this_cpu_write(bp_hardening_data.fn, fn);
- raw_spin_unlock(&bp_lock);
-}
-#else
-static void install_bp_hardening_cb(bp_hardening_cb_t fn,
- const char *hyp_vecs_start,
- const char *hyp_vecs_end)
-{
- __this_cpu_write(bp_hardening_data.fn, fn);
-}
-#endif /* CONFIG_KVM_INDIRECT_VECTORS */
-
-#include <linux/arm-smccc.h>
-
-static void __maybe_unused call_smc_arch_workaround_1(void)
-{
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
-}
-
-static void call_hvc_arch_workaround_1(void)
-{
- arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
-}
-
-static void qcom_link_stack_sanitization(void)
-{
- u64 tmp;
-
- asm volatile("mov %0, x30 \n"
- ".rept 16 \n"
- "bl . + 4 \n"
- ".endr \n"
- "mov x30, %0 \n"
- : "=&r" (tmp));
-}
-
-static bool __nospectre_v2;
-static int __init parse_nospectre_v2(char *str)
-{
- __nospectre_v2 = true;
- return 0;
-}
-early_param("nospectre_v2", parse_nospectre_v2);
-
-/*
- * -1: No workaround
- * 0: No workaround required
- * 1: Workaround installed
- */
-static int detect_harden_bp_fw(void)
-{
- bp_hardening_cb_t cb;
- void *smccc_start, *smccc_end;
- struct arm_smccc_res res;
- u32 midr = read_cpuid_id();
-
- arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_1, &res);
-
- switch ((int)res.a0) {
- case 1:
- /* Firmware says we're just fine */
- return 0;
- case 0:
- break;
- default:
- return -1;
- }
-
- switch (arm_smccc_1_1_get_conduit()) {
- case SMCCC_CONDUIT_HVC:
- cb = call_hvc_arch_workaround_1;
- /* This is a guest, no need to patch KVM vectors */
- smccc_start = NULL;
- smccc_end = NULL;
- break;
-
-#if IS_ENABLED(CONFIG_KVM)
- case SMCCC_CONDUIT_SMC:
- cb = call_smc_arch_workaround_1;
- smccc_start = __smccc_workaround_1_smc;
- smccc_end = __smccc_workaround_1_smc +
- __SMCCC_WORKAROUND_1_SMC_SZ;
- break;
-#endif
-
- default:
- return -1;
- }
-
- if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) ||
- ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1))
- cb = qcom_link_stack_sanitization;
-
- if (IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR))
- install_bp_hardening_cb(cb, smccc_start, smccc_end);
-
- return 1;
-}
-
-DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
-
-int ssbd_state __read_mostly = ARM64_SSBD_KERNEL;
-static bool __ssb_safe = true;
-
-static const struct ssbd_options {
- const char *str;
- int state;
-} ssbd_options[] = {
- { "force-on", ARM64_SSBD_FORCE_ENABLE, },
- { "force-off", ARM64_SSBD_FORCE_DISABLE, },
- { "kernel", ARM64_SSBD_KERNEL, },
-};
-
-static int __init ssbd_cfg(char *buf)
-{
- int i;
-
- if (!buf || !buf[0])
- return -EINVAL;
-
- for (i = 0; i < ARRAY_SIZE(ssbd_options); i++) {
- int len = strlen(ssbd_options[i].str);
-
- if (strncmp(buf, ssbd_options[i].str, len))
- continue;
-
- ssbd_state = ssbd_options[i].state;
- return 0;
- }
-
- return -EINVAL;
-}
-early_param("ssbd", ssbd_cfg);
-
-void __init arm64_update_smccc_conduit(struct alt_instr *alt,
- __le32 *origptr, __le32 *updptr,
- int nr_inst)
-{
- u32 insn;
-
- BUG_ON(nr_inst != 1);
-
- switch (arm_smccc_1_1_get_conduit()) {
- case SMCCC_CONDUIT_HVC:
- insn = aarch64_insn_get_hvc_value();
- break;
- case SMCCC_CONDUIT_SMC:
- insn = aarch64_insn_get_smc_value();
- break;
- default:
- return;
- }
-
- *updptr = cpu_to_le32(insn);
-}
-
-void __init arm64_enable_wa2_handling(struct alt_instr *alt,
- __le32 *origptr, __le32 *updptr,
- int nr_inst)
-{
- BUG_ON(nr_inst != 1);
- /*
- * Only allow mitigation on EL1 entry/exit and guest
- * ARCH_WORKAROUND_2 handling if the SSBD state allows it to
- * be flipped.
- */
- if (arm64_get_ssbd_state() == ARM64_SSBD_KERNEL)
- *updptr = cpu_to_le32(aarch64_insn_gen_nop());
-}
-
-void arm64_set_ssbd_mitigation(bool state)
-{
- int conduit;
-
- if (!IS_ENABLED(CONFIG_ARM64_SSBD)) {
- pr_info_once("SSBD disabled by kernel configuration\n");
- return;
- }
-
- if (this_cpu_has_cap(ARM64_SSBS)) {
- if (state)
- asm volatile(SET_PSTATE_SSBS(0));
- else
- asm volatile(SET_PSTATE_SSBS(1));
- return;
- }
-
- conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, state,
- NULL);
-
- WARN_ON_ONCE(conduit == SMCCC_CONDUIT_NONE);
-}
-
-static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
- int scope)
-{
- struct arm_smccc_res res;
- bool required = true;
- s32 val;
- bool this_cpu_safe = false;
- int conduit;
-
- WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
-
- if (cpu_mitigations_off())
- ssbd_state = ARM64_SSBD_FORCE_DISABLE;
-
- /* delay setting __ssb_safe until we get a firmware response */
- if (is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list))
- this_cpu_safe = true;
-
- if (this_cpu_has_cap(ARM64_SSBS)) {
- if (!this_cpu_safe)
- __ssb_safe = false;
- required = false;
- goto out_printmsg;
- }
-
- conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_2, &res);
-
- if (conduit == SMCCC_CONDUIT_NONE) {
- ssbd_state = ARM64_SSBD_UNKNOWN;
- if (!this_cpu_safe)
- __ssb_safe = false;
- return false;
- }
-
- val = (s32)res.a0;
-
- switch (val) {
- case SMCCC_RET_NOT_SUPPORTED:
- ssbd_state = ARM64_SSBD_UNKNOWN;
- if (!this_cpu_safe)
- __ssb_safe = false;
- return false;
-
- /* machines with mixed mitigation requirements must not return this */
- case SMCCC_RET_NOT_REQUIRED:
- pr_info_once("%s mitigation not required\n", entry->desc);
- ssbd_state = ARM64_SSBD_MITIGATED;
- return false;
-
- case SMCCC_RET_SUCCESS:
- __ssb_safe = false;
- required = true;
- break;
-
- case 1: /* Mitigation not required on this CPU */
- required = false;
- break;
-
- default:
- WARN_ON(1);
- if (!this_cpu_safe)
- __ssb_safe = false;
- return false;
- }
-
- switch (ssbd_state) {
- case ARM64_SSBD_FORCE_DISABLE:
- arm64_set_ssbd_mitigation(false);
- required = false;
- break;
-
- case ARM64_SSBD_KERNEL:
- if (required) {
- __this_cpu_write(arm64_ssbd_callback_required, 1);
- arm64_set_ssbd_mitigation(true);
- }
- break;
-
- case ARM64_SSBD_FORCE_ENABLE:
- arm64_set_ssbd_mitigation(true);
- required = true;
- break;
-
- default:
- WARN_ON(1);
- break;
- }
-
-out_printmsg:
- switch (ssbd_state) {
- case ARM64_SSBD_FORCE_DISABLE:
- pr_info_once("%s disabled from command-line\n", entry->desc);
- break;
-
- case ARM64_SSBD_FORCE_ENABLE:
- pr_info_once("%s forced from command-line\n", entry->desc);
- break;
- }
-
- return required;
-}
-
-/* known invulnerable cores */
-static const struct midr_range arm64_ssb_cpus[] = {
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
- MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
- MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
- MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
- {},
-};
-
#ifdef CONFIG_ARM64_ERRATUM_1463225
DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
@@ -519,83 +160,6 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused)
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \
CAP_MIDR_RANGE_LIST(midr_list)
-/* Track overall mitigation state. We are only mitigated if all cores are ok */
-static bool __hardenbp_enab = true;
-static bool __spectrev2_safe = true;
-
-int get_spectre_v2_workaround_state(void)
-{
- if (__spectrev2_safe)
- return ARM64_BP_HARDEN_NOT_REQUIRED;
-
- if (!__hardenbp_enab)
- return ARM64_BP_HARDEN_UNKNOWN;
-
- return ARM64_BP_HARDEN_WA_NEEDED;
-}
-
-/*
- * List of CPUs that do not need any Spectre-v2 mitigation at all.
- */
-static const struct midr_range spectre_v2_safe_list[] = {
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
- MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
- MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
- MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
- MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
- { /* sentinel */ }
-};
-
-/*
- * Track overall bp hardening for all heterogeneous cores in the machine.
- * We are only considered "safe" if all booted cores are known safe.
- */
-static bool __maybe_unused
-check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
-{
- int need_wa;
-
- WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
-
- /* If the CPU has CSV2 set, we're safe */
- if (cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64PFR0_EL1),
- ID_AA64PFR0_CSV2_SHIFT))
- return false;
-
- /* Alternatively, we have a list of unaffected CPUs */
- if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list))
- return false;
-
- /* Fallback to firmware detection */
- need_wa = detect_harden_bp_fw();
- if (!need_wa)
- return false;
-
- __spectrev2_safe = false;
-
- if (!IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) {
- pr_warn_once("spectrev2 mitigation disabled by kernel configuration\n");
- __hardenbp_enab = false;
- return false;
- }
-
- /* forced off */
- if (__nospectre_v2 || cpu_mitigations_off()) {
- pr_info_once("spectrev2 mitigation disabled by command line option\n");
- __hardenbp_enab = false;
- return false;
- }
-
- if (need_wa < 0) {
- pr_warn_once("ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware\n");
- __hardenbp_enab = false;
- }
-
- return (need_wa > 0);
-}
-
static const __maybe_unused struct midr_range tx2_family_cpus[] = {
MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
@@ -887,9 +451,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
},
#endif
{
- .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+ .desc = "Spectre-v2",
+ .capability = ARM64_SPECTRE_V2,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
- .matches = check_branch_predictor,
+ .matches = has_spectre_v2,
+ .cpu_enable = spectre_v2_enable_mitigation,
},
#ifdef CONFIG_RANDOMIZE_BASE
{
@@ -899,11 +465,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
},
#endif
{
- .desc = "Speculative Store Bypass Disable",
- .capability = ARM64_SSBD,
+ .desc = "Spectre-v4",
+ .capability = ARM64_SPECTRE_V4,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
- .matches = has_ssbd_mitigation,
- .midr_range_list = arm64_ssb_cpus,
+ .matches = has_spectre_v4,
+ .cpu_enable = spectre_v4_enable_mitigation,
},
#ifdef CONFIG_ARM64_ERRATUM_1418040
{
@@ -956,40 +522,3 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
{
}
};
-
-ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "Mitigation: __user pointer sanitization\n");
-}
-
-ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- switch (get_spectre_v2_workaround_state()) {
- case ARM64_BP_HARDEN_NOT_REQUIRED:
- return sprintf(buf, "Not affected\n");
- case ARM64_BP_HARDEN_WA_NEEDED:
- return sprintf(buf, "Mitigation: Branch predictor hardening\n");
- case ARM64_BP_HARDEN_UNKNOWN:
- default:
- return sprintf(buf, "Vulnerable\n");
- }
-}
-
-ssize_t cpu_show_spec_store_bypass(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- if (__ssb_safe)
- return sprintf(buf, "Not affected\n");
-
- switch (ssbd_state) {
- case ARM64_SSBD_KERNEL:
- case ARM64_SSBD_FORCE_ENABLE:
- if (IS_ENABLED(CONFIG_ARM64_SSBD))
- return sprintf(buf,
- "Mitigation: Speculative Store Bypass disabled via prctl\n");
- }
-
- return sprintf(buf, "Vulnerable\n");
-}
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6424584be01e..a4debb63ebfb 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -227,7 +227,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI),
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0),
ARM64_FTR_END,
@@ -487,7 +487,7 @@ static const struct arm64_ftr_bits ftr_id_pfr1[] = {
};
static const struct arm64_ftr_bits ftr_id_pfr2[] = {
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_CSV3_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -1583,48 +1583,6 @@ static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused)
WARN_ON(val & (7 << 27 | 7 << 21));
}
-#ifdef CONFIG_ARM64_SSBD
-static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr)
-{
- if (user_mode(regs))
- return 1;
-
- if (instr & BIT(PSTATE_Imm_shift))
- regs->pstate |= PSR_SSBS_BIT;
- else
- regs->pstate &= ~PSR_SSBS_BIT;
-
- arm64_skip_faulting_instruction(regs, 4);
- return 0;
-}
-
-static struct undef_hook ssbs_emulation_hook = {
- .instr_mask = ~(1U << PSTATE_Imm_shift),
- .instr_val = 0xd500401f | PSTATE_SSBS,
- .fn = ssbs_emulation_handler,
-};
-
-static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused)
-{
- static bool undef_hook_registered = false;
- static DEFINE_RAW_SPINLOCK(hook_lock);
-
- raw_spin_lock(&hook_lock);
- if (!undef_hook_registered) {
- register_undef_hook(&ssbs_emulation_hook);
- undef_hook_registered = true;
- }
- raw_spin_unlock(&hook_lock);
-
- if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
- sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
- arm64_set_ssbd_mitigation(false);
- } else {
- arm64_set_ssbd_mitigation(true);
- }
-}
-#endif /* CONFIG_ARM64_SSBD */
-
#ifdef CONFIG_ARM64_PAN
static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
{
@@ -1976,19 +1934,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.field_pos = ID_AA64ISAR0_CRC32_SHIFT,
.min_field_value = 1,
},
-#ifdef CONFIG_ARM64_SSBD
{
.desc = "Speculative Store Bypassing Safe (SSBS)",
.capability = ARM64_SSBS,
- .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
.matches = has_cpuid_feature,
.sys_reg = SYS_ID_AA64PFR1_EL1,
.field_pos = ID_AA64PFR1_SSBS_SHIFT,
.sign = FTR_UNSIGNED,
.min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY,
- .cpu_enable = cpu_enable_ssbs,
},
-#endif
#ifdef CONFIG_ARM64_CNP
{
.desc = "Common not Private translations",
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 55af8b504b65..aeb337029d56 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -132,9 +132,8 @@ alternative_else_nop_endif
* them if required.
*/
.macro apply_ssbd, state, tmp1, tmp2
-#ifdef CONFIG_ARM64_SSBD
-alternative_cb arm64_enable_wa2_handling
- b .L__asm_ssbd_skip\@
+alternative_cb spectre_v4_patch_fw_mitigation_enable
+ b .L__asm_ssbd_skip\@ // Patched to NOP
alternative_cb_end
ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1
cbz \tmp2, .L__asm_ssbd_skip\@
@@ -142,11 +141,10 @@ alternative_cb_end
tbnz \tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@
mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2
mov w1, #\state
-alternative_cb arm64_update_smccc_conduit
+alternative_cb spectre_v4_patch_fw_mitigation_conduit
nop // Patched to SMC/HVC #0
alternative_cb_end
.L__asm_ssbd_skip\@:
-#endif
.endm
.macro kernel_entry, el, regsize = 64
@@ -697,11 +695,9 @@ el0_irq_naked:
bl trace_hardirqs_off
#endif
-#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
tbz x22, #55, 1f
bl do_el0_irq_bp_hardening
1:
-#endif
irq_handler
#ifdef CONFIG_TRACE_IRQFLAGS
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 68e14152d6e9..c7b00120dc3e 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -332,11 +332,7 @@ int swsusp_arch_suspend(void)
* mitigation off behind our back, let's set the state
* to what we expect it to be.
*/
- switch (arm64_get_ssbd_state()) {
- case ARM64_SSBD_FORCE_ENABLE:
- case ARM64_SSBD_KERNEL:
- arm64_set_ssbd_mitigation(true);
- }
+ spectre_v4_enable_mitigation(NULL);
}
local_daif_restore(flags);
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 8982b68289b7..fbd4b6b1fde5 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -61,16 +61,11 @@ __efistub__ctype = _ctype;
* memory mappings.
*/
-#define KVM_NVHE_ALIAS(sym) __kvm_nvhe_##sym = sym;
-
/* Alternative callbacks for init-time patching of nVHE hyp code. */
-KVM_NVHE_ALIAS(arm64_enable_wa2_handling);
KVM_NVHE_ALIAS(kvm_patch_vector_branch);
KVM_NVHE_ALIAS(kvm_update_va_mask);
/* Global kernel state accessed by nVHE hyp code. */
-KVM_NVHE_ALIAS(arm64_ssbd_callback_required);
-KVM_NVHE_ALIAS(kvm_host_data);
KVM_NVHE_ALIAS(kvm_vgic_global_state);
/* Kernel constant needed to compute idmap addresses. */
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index f1804496b935..085d8ca39e47 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -21,6 +21,7 @@
#include <linux/lockdep.h>
#include <linux/mman.h>
#include <linux/mm.h>
+#include <linux/nospec.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/unistd.h>
@@ -421,8 +422,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
cpus_have_const_cap(ARM64_HAS_UAO))
childregs->pstate |= PSR_UAO_BIT;
- if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
- set_ssbs_bit(childregs);
+ spectre_v4_enable_task_mitigation(p);
if (system_uses_irq_prio_masking())
childregs->pmr_save = GIC_PRIO_IRQON;
@@ -472,8 +472,6 @@ void uao_thread_switch(struct task_struct *next)
*/
static void ssbs_thread_switch(struct task_struct *next)
{
- struct pt_regs *regs = task_pt_regs(next);
-
/*
* Nothing to do for kernel threads, but 'regs' may be junk
* (e.g. idle task) so check the flags and bail early.
@@ -485,18 +483,10 @@ static void ssbs_thread_switch(struct task_struct *next)
* If all CPUs implement the SSBS extension, then we just need to
* context-switch the PSTATE field.
*/
- if (cpu_have_feature(cpu_feature(SSBS)))
- return;
-
- /* If the mitigation is enabled, then we leave SSBS clear. */
- if ((arm64_get_ssbd_state() == ARM64_SSBD_FORCE_ENABLE) ||
- test_tsk_thread_flag(next, TIF_SSBD))
+ if (cpus_have_const_cap(ARM64_SSBS))
return;
- if (compat_user_mode(regs))
- set_compat_ssbs_bit(regs);
- else if (user_mode(regs))
- set_ssbs_bit(regs);
+ spectre_v4_enable_task_mitigation(next);
}
/*
@@ -620,6 +610,11 @@ void arch_setup_new_exec(void)
current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
ptrauth_thread_init_user(current);
+
+ if (task_spec_ssb_noexec(current)) {
+ arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS,
+ PR_SPEC_ENABLE);
+ }
}
#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
new file mode 100644
index 000000000000..68b710f1b43f
--- /dev/null
+++ b/arch/arm64/kernel/proton-pack.c
@@ -0,0 +1,792 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Handle detection, reporting and mitigation of Spectre v1, v2 and v4, as
+ * detailed at:
+ *
+ * https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability
+ *
+ * This code was originally written hastily under an awful lot of stress and so
+ * aspects of it are somewhat hacky. Unfortunately, changing anything in here
+ * instantly makes me feel ill. Thanks, Jann. Thann.
+ *
+ * Copyright (C) 2018 ARM Ltd, All Rights Reserved.
+ * Copyright (C) 2020 Google LLC
+ *
+ * "If there's something strange in your neighbourhood, who you gonna call?"
+ *
+ * Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org>
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
+#include <linux/sched/task_stack.h>
+
+#include <asm/spectre.h>
+#include <asm/traps.h>
+
+/*
+ * We try to ensure that the mitigation state can never change as the result of
+ * onlining a late CPU.
+ */
+static void update_mitigation_state(enum mitigation_state *oldp,
+ enum mitigation_state new)
+{
+ enum mitigation_state state;
+
+ do {
+ state = READ_ONCE(*oldp);
+ if (new <= state)
+ break;
+
+ /* Userspace almost certainly can't deal with this. */
+ if (WARN_ON(system_capabilities_finalized()))
+ break;
+ } while (cmpxchg_relaxed(oldp, state, new) != state);
+}
+
+/*
+ * Spectre v1.
+ *
+ * The kernel can't protect userspace for this one: it's each person for
+ * themselves. Advertise what we're doing and be done with it.
+ */
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+}
+
+/*
+ * Spectre v2.
+ *
+ * This one sucks. A CPU is either:
+ *
+ * - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2.
+ * - Mitigated in hardware and listed in our "safe list".
+ * - Mitigated in software by firmware.
+ * - Mitigated in software by a CPU-specific dance in the kernel.
+ * - Vulnerable.
+ *
+ * It's not unlikely for different CPUs in a big.LITTLE system to fall into
+ * different camps.
+ */
+static enum mitigation_state spectre_v2_state;
+
+static bool __read_mostly __nospectre_v2;
+static int __init parse_spectre_v2_param(char *str)
+{
+ __nospectre_v2 = true;
+ return 0;
+}
+early_param("nospectre_v2", parse_spectre_v2_param);
+
+static bool spectre_v2_mitigations_off(void)
+{
+ bool ret = __nospectre_v2 || cpu_mitigations_off();
+
+ if (ret)
+ pr_info_once("spectre-v2 mitigation disabled by command line option\n");
+
+ return ret;
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ switch (spectre_v2_state) {
+ case SPECTRE_UNAFFECTED:
+ return sprintf(buf, "Not affected\n");
+ case SPECTRE_MITIGATED:
+ return sprintf(buf, "Mitigation: Branch predictor hardening\n");
+ case SPECTRE_VULNERABLE:
+ fallthrough;
+ default:
+ return sprintf(buf, "Vulnerable\n");
+ }
+}
+
+static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void)
+{
+ u64 pfr0;
+ static const struct midr_range spectre_v2_safe_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
+ MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
+ MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
+ { /* sentinel */ }
+ };
+
+ /* If the CPU has CSV2 set, we're safe */
+ pfr0 = read_cpuid(ID_AA64PFR0_EL1);
+ if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT))
+ return SPECTRE_UNAFFECTED;
+
+ /* Alternatively, we have a list of unaffected CPUs */
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list))
+ return SPECTRE_UNAFFECTED;
+
+ return SPECTRE_VULNERABLE;
+}
+
+#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED (1)
+
+static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void)
+{
+ int ret;
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_ARCH_WORKAROUND_1, &res);
+
+ ret = res.a0;
+ switch (ret) {
+ case SMCCC_RET_SUCCESS:
+ return SPECTRE_MITIGATED;
+ case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
+ return SPECTRE_UNAFFECTED;
+ default:
+ fallthrough;
+ case SMCCC_RET_NOT_SUPPORTED:
+ return SPECTRE_VULNERABLE;
+ }
+}
+
+bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope)
+{
+ WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+
+ if (spectre_v2_get_cpu_hw_mitigation_state() == SPECTRE_UNAFFECTED)
+ return false;
+
+ if (spectre_v2_get_cpu_fw_mitigation_state() == SPECTRE_UNAFFECTED)
+ return false;
+
+ return true;
+}
+
+DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
+
+enum mitigation_state arm64_get_spectre_v2_state(void)
+{
+ return spectre_v2_state;
+}
+
+#ifdef CONFIG_KVM
+#include <asm/cacheflush.h>
+#include <asm/kvm_asm.h>
+
+atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
+
+static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
+ const char *hyp_vecs_end)
+{
+ void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K);
+ int i;
+
+ for (i = 0; i < SZ_2K; i += 0x80)
+ memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
+
+ __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
+}
+
+static void install_bp_hardening_cb(bp_hardening_cb_t fn)
+{
+ static DEFINE_RAW_SPINLOCK(bp_lock);
+ int cpu, slot = -1;
+ const char *hyp_vecs_start = __smccc_workaround_1_smc;
+ const char *hyp_vecs_end = __smccc_workaround_1_smc +
+ __SMCCC_WORKAROUND_1_SMC_SZ;
+
+ /*
+ * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if
+ * we're a guest. Skip the hyp-vectors work.
+ */
+ if (!is_hyp_mode_available()) {
+ __this_cpu_write(bp_hardening_data.fn, fn);
+ return;
+ }
+
+ raw_spin_lock(&bp_lock);
+ for_each_possible_cpu(cpu) {
+ if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
+ slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
+ break;
+ }
+ }
+
+ if (slot == -1) {
+ slot = atomic_inc_return(&arm64_el2_vector_last_slot);
+ BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
+ __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
+ }
+
+ __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
+ __this_cpu_write(bp_hardening_data.fn, fn);
+ raw_spin_unlock(&bp_lock);
+}
+#else
+static void install_bp_hardening_cb(bp_hardening_cb_t fn)
+{
+ __this_cpu_write(bp_hardening_data.fn, fn);
+}
+#endif /* CONFIG_KVM */
+
+static void call_smc_arch_workaround_1(void)
+{
+ arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
+}
+
+static void call_hvc_arch_workaround_1(void)
+{
+ arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
+}
+
+static void qcom_link_stack_sanitisation(void)
+{
+ u64 tmp;
+
+ asm volatile("mov %0, x30 \n"
+ ".rept 16 \n"
+ "bl . + 4 \n"
+ ".endr \n"
+ "mov x30, %0 \n"
+ : "=&r" (tmp));
+}
+
+static enum mitigation_state spectre_v2_enable_fw_mitigation(void)
+{
+ bp_hardening_cb_t cb;
+ enum mitigation_state state;
+
+ state = spectre_v2_get_cpu_fw_mitigation_state();
+ if (state != SPECTRE_MITIGATED)
+ return state;
+
+ if (spectre_v2_mitigations_off())
+ return SPECTRE_VULNERABLE;
+
+ switch (arm_smccc_1_1_get_conduit()) {
+ case SMCCC_CONDUIT_HVC:
+ cb = call_hvc_arch_workaround_1;
+ break;
+
+ case SMCCC_CONDUIT_SMC:
+ cb = call_smc_arch_workaround_1;
+ break;
+
+ default:
+ return SPECTRE_VULNERABLE;
+ }
+
+ install_bp_hardening_cb(cb);
+ return SPECTRE_MITIGATED;
+}
+
+static enum mitigation_state spectre_v2_enable_sw_mitigation(void)
+{
+ u32 midr;
+
+ if (spectre_v2_mitigations_off())
+ return SPECTRE_VULNERABLE;
+
+ midr = read_cpuid_id();
+ if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) &&
+ ((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1))
+ return SPECTRE_VULNERABLE;
+
+ install_bp_hardening_cb(qcom_link_stack_sanitisation);
+ return SPECTRE_MITIGATED;
+}
+
+void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
+{
+ enum mitigation_state state;
+
+ WARN_ON(preemptible());
+
+ state = spectre_v2_get_cpu_hw_mitigation_state();
+ if (state == SPECTRE_VULNERABLE)
+ state = spectre_v2_enable_fw_mitigation();
+ if (state == SPECTRE_VULNERABLE)
+ state = spectre_v2_enable_sw_mitigation();
+
+ update_mitigation_state(&spectre_v2_state, state);
+}
+
+/*
+ * Spectre v4.
+ *
+ * If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is
+ * either:
+ *
+ * - Mitigated in hardware and listed in our "safe list".
+ * - Mitigated in hardware via PSTATE.SSBS.
+ * - Mitigated in software by firmware (sometimes referred to as SSBD).
+ *
+ * Wait, that doesn't sound so bad, does it? Keep reading...
+ *
+ * A major source of headaches is that the software mitigation is enabled both
+ * on a per-task basis, but can also be forced on for the kernel, necessitating
+ * both context-switch *and* entry/exit hooks. To make it even worse, some CPUs
+ * allow EL0 to toggle SSBS directly, which can end up with the prctl() state
+ * being stale when re-entering the kernel. The usual big.LITTLE caveats apply,
+ * so you can have systems that have both firmware and SSBS mitigations. This
+ * means we actually have to reject late onlining of CPUs with mitigations if
+ * all of the currently onlined CPUs are safelisted, as the mitigation tends to
+ * be opt-in for userspace. Yes, really, the cure is worse than the disease.
+ *
+ * The only good part is that if the firmware mitigation is present, then it is
+ * present for all CPUs, meaning we don't have to worry about late onlining of a
+ * vulnerable CPU if one of the boot CPUs is using the firmware mitigation.
+ *
+ * Give me a VAX-11/780 any day of the week...
+ */
+static enum mitigation_state spectre_v4_state;
+
+/* This is the per-cpu state tracking whether we need to talk to firmware */
+DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
+
+enum spectre_v4_policy {
+ SPECTRE_V4_POLICY_MITIGATION_DYNAMIC,
+ SPECTRE_V4_POLICY_MITIGATION_ENABLED,
+ SPECTRE_V4_POLICY_MITIGATION_DISABLED,
+};
+
+static enum spectre_v4_policy __read_mostly __spectre_v4_policy;
+
+static const struct spectre_v4_param {
+ const char *str;
+ enum spectre_v4_policy policy;
+} spectre_v4_params[] = {
+ { "force-on", SPECTRE_V4_POLICY_MITIGATION_ENABLED, },
+ { "force-off", SPECTRE_V4_POLICY_MITIGATION_DISABLED, },
+ { "kernel", SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, },
+};
+static int __init parse_spectre_v4_param(char *str)
+{
+ int i;
+
+ if (!str || !str[0])
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(spectre_v4_params); i++) {
+ const struct spectre_v4_param *param = &spectre_v4_params[i];
+
+ if (strncmp(str, param->str, strlen(param->str)))
+ continue;
+
+ __spectre_v4_policy = param->policy;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+early_param("ssbd", parse_spectre_v4_param);
+
+/*
+ * Because this was all written in a rush by people working in different silos,
+ * we've ended up with multiple command line options to control the same thing.
+ * Wrap these up in some helpers, which prefer disabling the mitigation if faced
+ * with contradictory parameters. The mitigation is always either "off",
+ * "dynamic" or "on".
+ */
+static bool spectre_v4_mitigations_off(void)
+{
+ bool ret = cpu_mitigations_off() ||
+ __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
+
+ if (ret)
+ pr_info_once("spectre-v4 mitigation disabled by command-line option\n");
+
+ return ret;
+}
+
+/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */
+static bool spectre_v4_mitigations_dynamic(void)
+{
+ return !spectre_v4_mitigations_off() &&
+ __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC;
+}
+
+static bool spectre_v4_mitigations_on(void)
+{
+ return !spectre_v4_mitigations_off() &&
+ __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED;
+}
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ switch (spectre_v4_state) {
+ case SPECTRE_UNAFFECTED:
+ return sprintf(buf, "Not affected\n");
+ case SPECTRE_MITIGATED:
+ return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n");
+ case SPECTRE_VULNERABLE:
+ fallthrough;
+ default:
+ return sprintf(buf, "Vulnerable\n");
+ }
+}
+
+enum mitigation_state arm64_get_spectre_v4_state(void)
+{
+ return spectre_v4_state;
+}
+
+static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void)
+{
+ static const struct midr_range spectre_v4_safe_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
+ MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
+ { /* sentinel */ },
+ };
+
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list))
+ return SPECTRE_UNAFFECTED;
+
+ /* CPU features are detected first */
+ if (this_cpu_has_cap(ARM64_SSBS))
+ return SPECTRE_MITIGATED;
+
+ return SPECTRE_VULNERABLE;
+}
+
+static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void)
+{
+ int ret;
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_ARCH_WORKAROUND_2, &res);
+
+ ret = res.a0;
+ switch (ret) {
+ case SMCCC_RET_SUCCESS:
+ return SPECTRE_MITIGATED;
+ case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
+ fallthrough;
+ case SMCCC_RET_NOT_REQUIRED:
+ return SPECTRE_UNAFFECTED;
+ default:
+ fallthrough;
+ case SMCCC_RET_NOT_SUPPORTED:
+ return SPECTRE_VULNERABLE;
+ }
+}
+
+bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope)
+{
+ enum mitigation_state state;
+
+ WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+
+ state = spectre_v4_get_cpu_hw_mitigation_state();
+ if (state == SPECTRE_VULNERABLE)
+ state = spectre_v4_get_cpu_fw_mitigation_state();
+
+ return state != SPECTRE_UNAFFECTED;
+}
+
+static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr)
+{
+ if (user_mode(regs))
+ return 1;
+
+ if (instr & BIT(PSTATE_Imm_shift))
+ regs->pstate |= PSR_SSBS_BIT;
+ else
+ regs->pstate &= ~PSR_SSBS_BIT;
+
+ arm64_skip_faulting_instruction(regs, 4);
+ return 0;
+}
+
+static struct undef_hook ssbs_emulation_hook = {
+ .instr_mask = ~(1U << PSTATE_Imm_shift),
+ .instr_val = 0xd500401f | PSTATE_SSBS,
+ .fn = ssbs_emulation_handler,
+};
+
+static enum mitigation_state spectre_v4_enable_hw_mitigation(void)
+{
+ static bool undef_hook_registered = false;
+ static DEFINE_RAW_SPINLOCK(hook_lock);
+ enum mitigation_state state;
+
+ /*
+ * If the system is mitigated but this CPU doesn't have SSBS, then
+ * we must be on the safelist and there's nothing more to do.
+ */
+ state = spectre_v4_get_cpu_hw_mitigation_state();
+ if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS))
+ return state;
+
+ raw_spin_lock(&hook_lock);
+ if (!undef_hook_registered) {
+ register_undef_hook(&ssbs_emulation_hook);
+ undef_hook_registered = true;
+ }
+ raw_spin_unlock(&hook_lock);
+
+ if (spectre_v4_mitigations_off()) {
+ sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
+ asm volatile(SET_PSTATE_SSBS(1));
+ return SPECTRE_VULNERABLE;
+ }
+
+ /* SCTLR_EL1.DSSBS was initialised to 0 during boot */
+ asm volatile(SET_PSTATE_SSBS(0));
+ return SPECTRE_MITIGATED;
+}
+
+/*
+ * Patch a branch over the Spectre-v4 mitigation code with a NOP so that
+ * we fallthrough and check whether firmware needs to be called on this CPU.
+ */
+void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt,
+ __le32 *origptr,
+ __le32 *updptr, int nr_inst)
+{
+ BUG_ON(nr_inst != 1); /* Branch -> NOP */
+
+ if (spectre_v4_mitigations_off())
+ return;
+
+ if (cpus_have_final_cap(ARM64_SSBS))
+ return;
+
+ if (spectre_v4_mitigations_dynamic())
+ *updptr = cpu_to_le32(aarch64_insn_gen_nop());
+}
+
+/*
+ * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction
+ * to call into firmware to adjust the mitigation state.
+ */
+void __init spectre_v4_patch_fw_mitigation_conduit(struct alt_instr *alt,
+ __le32 *origptr,
+ __le32 *updptr, int nr_inst)
+{
+ u32 insn;
+
+ BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */
+
+ switch (arm_smccc_1_1_get_conduit()) {
+ case SMCCC_CONDUIT_HVC:
+ insn = aarch64_insn_get_hvc_value();
+ break;
+ case SMCCC_CONDUIT_SMC:
+ insn = aarch64_insn_get_smc_value();
+ break;
+ default:
+ return;
+ }
+
+ *updptr = cpu_to_le32(insn);
+}
+
+static enum mitigation_state spectre_v4_enable_fw_mitigation(void)
+{
+ enum mitigation_state state;
+
+ state = spectre_v4_get_cpu_fw_mitigation_state();
+ if (state != SPECTRE_MITIGATED)
+ return state;
+
+ if (spectre_v4_mitigations_off()) {
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL);
+ return SPECTRE_VULNERABLE;
+ }
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL);
+
+ if (spectre_v4_mitigations_dynamic())
+ __this_cpu_write(arm64_ssbd_callback_required, 1);
+
+ return SPECTRE_MITIGATED;
+}
+
+void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
+{
+ enum mitigation_state state;
+
+ WARN_ON(preemptible());
+
+ state = spectre_v4_enable_hw_mitigation();
+ if (state == SPECTRE_VULNERABLE)
+ state = spectre_v4_enable_fw_mitigation();
+
+ update_mitigation_state(&spectre_v4_state, state);
+}
+
+static void __update_pstate_ssbs(struct pt_regs *regs, bool state)
+{
+ u64 bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
+
+ if (state)
+ regs->pstate |= bit;
+ else
+ regs->pstate &= ~bit;
+}
+
+void spectre_v4_enable_task_mitigation(struct task_struct *tsk)
+{
+ struct pt_regs *regs = task_pt_regs(tsk);
+ bool ssbs = false, kthread = tsk->flags & PF_KTHREAD;
+
+ if (spectre_v4_mitigations_off())
+ ssbs = true;
+ else if (spectre_v4_mitigations_dynamic() && !kthread)
+ ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD);
+
+ __update_pstate_ssbs(regs, ssbs);
+}
+
+/*
+ * The Spectre-v4 mitigation can be controlled via a prctl() from userspace.
+ * This is interesting because the "speculation disabled" behaviour can be
+ * configured so that it is preserved across exec(), which means that the
+ * prctl() may be necessary even when PSTATE.SSBS can be toggled directly
+ * from userspace.
+ */
+static void ssbd_prctl_enable_mitigation(struct task_struct *task)
+{
+ task_clear_spec_ssb_noexec(task);
+ task_set_spec_ssb_disable(task);
+ set_tsk_thread_flag(task, TIF_SSBD);
+}
+
+static void ssbd_prctl_disable_mitigation(struct task_struct *task)
+{
+ task_clear_spec_ssb_noexec(task);
+ task_clear_spec_ssb_disable(task);
+ clear_tsk_thread_flag(task, TIF_SSBD);
+}
+
+static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ /* Enable speculation: disable mitigation */
+ /*
+ * Force disabled speculation prevents it from being
+ * re-enabled.
+ */
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+
+ /*
+ * If the mitigation is forced on, then speculation is forced
+ * off and we again prevent it from being re-enabled.
+ */
+ if (spectre_v4_mitigations_on())
+ return -EPERM;
+
+ ssbd_prctl_disable_mitigation(task);
+ break;
+ case PR_SPEC_FORCE_DISABLE:
+ /* Force disable speculation: force enable mitigation */
+ /*
+ * If the mitigation is forced off, then speculation is forced
+ * on and we prevent it from being disabled.
+ */
+ if (spectre_v4_mitigations_off())
+ return -EPERM;
+
+ task_set_spec_ssb_force_disable(task);
+ fallthrough;
+ case PR_SPEC_DISABLE:
+ /* Disable speculation: enable mitigation */
+ /* Same as PR_SPEC_FORCE_DISABLE */
+ if (spectre_v4_mitigations_off())
+ return -EPERM;
+
+ ssbd_prctl_enable_mitigation(task);
+ break;
+ case PR_SPEC_DISABLE_NOEXEC:
+ /* Disable speculation until execve(): enable mitigation */
+ /*
+ * If the mitigation state is forced one way or the other, then
+ * we must fail now before we try to toggle it on execve().
+ */
+ if (task_spec_ssb_force_disable(task) ||
+ spectre_v4_mitigations_off() ||
+ spectre_v4_mitigations_on()) {
+ return -EPERM;
+ }
+
+ ssbd_prctl_enable_mitigation(task);
+ task_set_spec_ssb_noexec(task);
+ break;
+ default:
+ return -ERANGE;
+ }
+
+ spectre_v4_enable_task_mitigation(task);
+ return 0;
+}
+
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+ unsigned long ctrl)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssbd_prctl_set(task, ctrl);
+ default:
+ return -ENODEV;
+ }
+}
+
+static int ssbd_prctl_get(struct task_struct *task)
+{
+ switch (spectre_v4_state) {
+ case SPECTRE_UNAFFECTED:
+ return PR_SPEC_NOT_AFFECTED;
+ case SPECTRE_MITIGATED:
+ if (spectre_v4_mitigations_on())
+ return PR_SPEC_NOT_AFFECTED;
+
+ if (spectre_v4_mitigations_dynamic())
+ break;
+
+ /* Mitigations are disabled, so we're vulnerable. */
+ fallthrough;
+ case SPECTRE_VULNERABLE:
+ fallthrough;
+ default:
+ return PR_SPEC_ENABLE;
+ }
+
+ /* Check the mitigation state for this task */
+ if (task_spec_ssb_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+
+ if (task_spec_ssb_noexec(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;
+
+ if (task_spec_ssb_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+}
+
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssbd_prctl_get(task);
+ default:
+ return -ENODEV;
+ }
+}
diff --git a/arch/arm64/kernel/ssbd.c b/arch/arm64/kernel/ssbd.c
deleted file mode 100644
index b26955f56750..000000000000
--- a/arch/arm64/kernel/ssbd.c
+++ /dev/null
@@ -1,129 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2018 ARM Ltd, All Rights Reserved.
- */
-
-#include <linux/compat.h>
-#include <linux/errno.h>
-#include <linux/prctl.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/thread_info.h>
-
-#include <asm/cpufeature.h>
-
-static void ssbd_ssbs_enable(struct task_struct *task)
-{
- u64 val = is_compat_thread(task_thread_info(task)) ?
- PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
-
- task_pt_regs(task)->pstate |= val;
-}
-
-static void ssbd_ssbs_disable(struct task_struct *task)
-{
- u64 val = is_compat_thread(task_thread_info(task)) ?
- PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
-
- task_pt_regs(task)->pstate &= ~val;
-}
-
-/*
- * prctl interface for SSBD
- */
-static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
-{
- int state = arm64_get_ssbd_state();
-
- /* Unsupported */
- if (state == ARM64_SSBD_UNKNOWN)
- return -ENODEV;
-
- /* Treat the unaffected/mitigated state separately */
- if (state == ARM64_SSBD_MITIGATED) {
- switch (ctrl) {
- case PR_SPEC_ENABLE:
- return -EPERM;
- case PR_SPEC_DISABLE:
- case PR_SPEC_FORCE_DISABLE:
- return 0;
- }
- }
-
- /*
- * Things are a bit backward here: the arm64 internal API
- * *enables the mitigation* when the userspace API *disables
- * speculation*. So much fun.
- */
- switch (ctrl) {
- case PR_SPEC_ENABLE:
- /* If speculation is force disabled, enable is not allowed */
- if (state == ARM64_SSBD_FORCE_ENABLE ||
- task_spec_ssb_force_disable(task))
- return -EPERM;
- task_clear_spec_ssb_disable(task);
- clear_tsk_thread_flag(task, TIF_SSBD);
- ssbd_ssbs_enable(task);
- break;
- case PR_SPEC_DISABLE:
- if (state == ARM64_SSBD_FORCE_DISABLE)
- return -EPERM;
- task_set_spec_ssb_disable(task);
- set_tsk_thread_flag(task, TIF_SSBD);
- ssbd_ssbs_disable(task);
- break;
- case PR_SPEC_FORCE_DISABLE:
- if (state == ARM64_SSBD_FORCE_DISABLE)
- return -EPERM;
- task_set_spec_ssb_disable(task);
- task_set_spec_ssb_force_disable(task);
- set_tsk_thread_flag(task, TIF_SSBD);
- ssbd_ssbs_disable(task);
- break;
- default:
- return -ERANGE;
- }
-
- return 0;
-}
-
-int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
- unsigned long ctrl)
-{
- switch (which) {
- case PR_SPEC_STORE_BYPASS:
- return ssbd_prctl_set(task, ctrl);
- default:
- return -ENODEV;
- }
-}
-
-static int ssbd_prctl_get(struct task_struct *task)
-{
- switch (arm64_get_ssbd_state()) {
- case ARM64_SSBD_UNKNOWN:
- return -ENODEV;
- case ARM64_SSBD_FORCE_ENABLE:
- return PR_SPEC_DISABLE;
- case ARM64_SSBD_KERNEL:
- if (task_spec_ssb_force_disable(task))
- return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
- if (task_spec_ssb_disable(task))
- return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
- return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- case ARM64_SSBD_FORCE_DISABLE:
- return PR_SPEC_ENABLE;
- default:
- return PR_SPEC_NOT_AFFECTED;
- }
-}
-
-int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
-{
- switch (which) {
- case PR_SPEC_STORE_BYPASS:
- return ssbd_prctl_get(task);
- default:
- return -ENODEV;
- }
-}
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index c1dee9066ff9..584c14ce3c86 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -72,8 +72,7 @@ void notrace __cpu_suspend_exit(void)
* have turned the mitigation on. If the user has forcefully
* disabled it, make sure their wishes are obeyed.
*/
- if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
- arm64_set_ssbd_mitigation(false);
+ spectre_v4_enable_mitigation(NULL);
}
/*
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 7cba7623fcec..d52e6b5dbfd3 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -9,6 +9,7 @@
#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>
+#include <asm/hyp_image.h>
#include <asm/kernel-pgtable.h>
#include <asm/memory.h>
#include <asm/page.h>
@@ -21,12 +22,23 @@ ENTRY(_text)
jiffies = jiffies_64;
+#ifdef CONFIG_KVM
#define HYPERVISOR_EXTABLE \
. = ALIGN(SZ_8); \
__start___kvm_ex_table = .; \
*(__kvm_ex_table) \
__stop___kvm_ex_table = .;
+#define HYPERVISOR_PERCPU_SECTION \
+ . = ALIGN(PAGE_SIZE); \
+ HYP_SECTION_NAME(.data..percpu) : { \
+ *(HYP_SECTION_NAME(.data..percpu)) \
+ }
+#else /* CONFIG_KVM */
+#define HYPERVISOR_EXTABLE
+#define HYPERVISOR_PERCPU_SECTION
+#endif
+
#define HYPERVISOR_TEXT \
/* \
* Align to 4 KB so that \
@@ -190,6 +202,7 @@ SECTIONS
}
PERCPU_SECTION(L1_CACHE_BYTES)
+ HYPERVISOR_PERCPU_SECTION
.rela.dyn : ALIGN(8) {
*(.rela .rela*)
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 318c8f2df245..043756db8f6e 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -57,9 +57,6 @@ config KVM_ARM_PMU
Adds support for a virtual Performance Monitoring Unit (PMU) in
virtual machines.
-config KVM_INDIRECT_VECTORS
- def_bool HARDEN_BRANCH_PREDICTOR || RANDOMIZE_BASE
-
endif # KVM
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 99977c1972cc..1504c81fbf5d 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/
kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
$(KVM)/vfio.o $(KVM)/irqchip.o \
arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
- inject_fault.o regmap.o va_layout.o hyp.o handle_exit.o \
+ inject_fault.o regmap.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o \
vgic-sys-reg-v3.o fpsimd.o pmu.o \
aarch32.o arch_timer.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b588c3b5c2f0..f56122eedffc 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -46,8 +46,10 @@
__asm__(".arch_extension virt");
#endif
-DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
+
static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
/* The VMID used in the VTTBR */
static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
@@ -145,6 +147,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
{
int i;
+ bitmap_free(kvm->arch.pmu_filter);
+
kvm_vgic_destroy(kvm);
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
@@ -286,7 +290,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
static_branch_dec(&userspace_irqchip_in_use);
- kvm_mmu_free_memory_caches(vcpu);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
kvm_timer_vcpu_terminate(vcpu);
kvm_pmu_vcpu_destroy(vcpu);
@@ -1259,12 +1263,60 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
}
+static unsigned long nvhe_percpu_size(void)
+{
+ return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
+ (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
+}
+
+static unsigned long nvhe_percpu_order(void)
+{
+ unsigned long size = nvhe_percpu_size();
+
+ return size ? get_order(size) : 0;
+}
+
+static int kvm_map_vectors(void)
+{
+ /*
+ * SV2 = ARM64_SPECTRE_V2
+ * HEL2 = ARM64_HARDEN_EL2_VECTORS
+ *
+ * !SV2 + !HEL2 -> use direct vectors
+ * SV2 + !HEL2 -> use hardened vectors in place
+ * !SV2 + HEL2 -> allocate one vector slot and use exec mapping
+ * SV2 + HEL2 -> use hardened vectors and use exec mapping
+ */
+ if (cpus_have_const_cap(ARM64_SPECTRE_V2)) {
+ __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
+ __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
+ }
+
+ if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
+ phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs);
+ unsigned long size = __BP_HARDEN_HYP_VECS_SZ;
+
+ /*
+ * Always allocate a spare vector slot, as we don't
+ * know yet which CPUs have a BP hardening slot that
+ * we can reuse.
+ */
+ __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
+ BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
+ return create_hyp_exec_mappings(vect_pa, size,
+ &__kvm_bp_vect_base);
+ }
+
+ return 0;
+}
+
static void cpu_init_hyp_mode(void)
{
phys_addr_t pgd_ptr;
unsigned long hyp_stack_ptr;
unsigned long vector_ptr;
unsigned long tpidr_el2;
+ struct arm_smccc_res res;
/* Switch from the HYP stub to our own HYP init vector */
__hyp_set_vectors(kvm_get_idmap_vector());
@@ -1274,12 +1326,13 @@ static void cpu_init_hyp_mode(void)
* kernel's mapping to the linear mapping, and store it in tpidr_el2
* so that we can use adr_l to access per-cpu variables in EL2.
*/
- tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) -
- (unsigned long)kvm_ksym_ref(&kvm_host_data));
+ tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
+ (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
pgd_ptr = kvm_mmu_get_httbr();
hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
- vector_ptr = (unsigned long)kvm_get_hyp_vector();
+ hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr);
+ vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
/*
* Call initialization code, and switch to the full blown HYP code.
@@ -1288,14 +1341,16 @@ static void cpu_init_hyp_mode(void)
* cpus_have_const_cap() wrapper.
*/
BUG_ON(!system_capabilities_finalized());
- __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
+ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init),
+ pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res);
+ WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
/*
* Disabling SSBD on a non-VHE system requires us to enable SSBS
* at EL2.
*/
if (this_cpu_has_cap(ARM64_SSBS) &&
- arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
+ arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
kvm_call_hyp_nvhe(__kvm_enable_ssbs);
}
}
@@ -1308,10 +1363,12 @@ static void cpu_hyp_reset(void)
static void cpu_hyp_reinit(void)
{
- kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
+ kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
cpu_hyp_reset();
+ *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector();
+
if (is_kernel_in_hyp_mode())
kvm_timer_init_vhe();
else
@@ -1462,8 +1519,10 @@ static void teardown_hyp_mode(void)
int cpu;
free_hyp_pgds();
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+ free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
+ }
}
/**
@@ -1497,6 +1556,24 @@ static int init_hyp_mode(void)
}
/*
+ * Allocate and initialize pages for Hypervisor-mode percpu regions.
+ */
+ for_each_possible_cpu(cpu) {
+ struct page *page;
+ void *page_addr;
+
+ page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
+ if (!page) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ page_addr = page_address(page);
+ memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
+ kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
+ }
+
+ /*
* Map the Hyp-code called directly from the host
*/
err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
@@ -1540,22 +1617,21 @@ static int init_hyp_mode(void)
}
}
+ /*
+ * Map Hyp percpu pages
+ */
for_each_possible_cpu(cpu) {
- kvm_host_data_t *cpu_data;
+ char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
+ char *percpu_end = percpu_begin + nvhe_percpu_size();
- cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
- err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
+ err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
if (err) {
- kvm_err("Cannot map host CPU state: %d\n", err);
+ kvm_err("Cannot map hyp percpu region\n");
goto out_err;
}
}
- err = hyp_map_aux_data();
- if (err)
- kvm_err("Cannot map host auxiliary data: %d\n", err);
-
return 0;
out_err:
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
deleted file mode 100644
index 3c79a1124af2..000000000000
--- a/arch/arm64/kvm/hyp.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/linkage.h>
-
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-
-/*
- * u64 __kvm_call_hyp(void *hypfn, ...);
- *
- * This is not really a variadic function in the classic C-way and care must
- * be taken when calling this to ensure parameters are passed in registers
- * only, since the stack will change between the caller and the callee.
- *
- * Call the function with the first argument containing a pointer to the
- * function you wish to call in Hyp mode, and subsequent arguments will be
- * passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the
- * function pointer can be passed). The function being called must be mapped
- * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are
- * passed in x0.
- *
- * A function pointer with a value less than 0xfff has a special meaning,
- * and is used to implement hyp stubs in the same way as in
- * arch/arm64/kernel/hyp_stub.S.
- */
-SYM_FUNC_START(__kvm_call_hyp)
- hvc #0
- ret
-SYM_FUNC_END(__kvm_call_hyp)
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index f54f0e89a71c..4a81eddabcd8 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -10,5 +10,4 @@ subdir-ccflags-y := -I$(incdir) \
-DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_KVM) += vhe/ nvhe/
-obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o smccc_wa.o
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 76e7eaf4675e..b0afad7a99c6 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <asm/alternative.h>
-#include <asm/asm-offsets.h>
#include <asm/assembler.h>
#include <asm/fpsimdmacros.h>
#include <asm/kvm.h>
@@ -16,66 +15,28 @@
#include <asm/kvm_mmu.h>
#include <asm/kvm_ptrauth.h>
-#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
-#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8)
-
.text
/*
- * We treat x18 as callee-saved as the host may use it as a platform
- * register (e.g. for shadow call stack).
- */
-.macro save_callee_saved_regs ctxt
- str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
- stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
- stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
- stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
- stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
- stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
- stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
-.endm
-
-.macro restore_callee_saved_regs ctxt
- // We require \ctxt is not x18-x28
- ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
- ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
- ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
- ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
- ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
- ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
- ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
-.endm
-
-.macro save_sp_el0 ctxt, tmp
- mrs \tmp, sp_el0
- str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
-.endm
-
-.macro restore_sp_el0 ctxt, tmp
- ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
- msr sp_el0, \tmp
-.endm
-
-/*
- * u64 __guest_enter(struct kvm_vcpu *vcpu,
- * struct kvm_cpu_context *host_ctxt);
+ * u64 __guest_enter(struct kvm_vcpu *vcpu);
*/
SYM_FUNC_START(__guest_enter)
// x0: vcpu
- // x1: host context
- // x2-x17: clobbered by macros
+ // x1-x17: clobbered by macros
// x29: guest context
- // Store the host regs
+ adr_this_cpu x1, kvm_hyp_ctxt, x2
+
+ // Store the hyp regs
save_callee_saved_regs x1
- // Save the host's sp_el0
+ // Save hyp's sp_el0
save_sp_el0 x1, x2
- // Now the host state is stored if we have a pending RAS SError it must
- // affect the host. If any asynchronous exception is pending we defer
- // the guest entry. The DSB isn't necessary before v8.2 as any SError
- // would be fatal.
+ // Now the hyp state is stored if we have a pending RAS SError it must
+ // affect the host or hyp. If any asynchronous exception is pending we
+ // defer the guest entry. The DSB isn't necessary before v8.2 as any
+ // SError would be fatal.
alternative_if ARM64_HAS_RAS_EXTN
dsb nshst
isb
@@ -86,6 +47,8 @@ alternative_else_nop_endif
ret
1:
+ set_loaded_vcpu x0, x1, x2
+
add x29, x0, #VCPU_CONTEXT
// Macro ptrauth_switch_to_guest format:
@@ -116,6 +79,26 @@ alternative_else_nop_endif
eret
sb
+SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
+ // x2-x29,lr: vcpu regs
+ // vcpu x0-x1 on the stack
+
+ // If the hyp context is loaded, go straight to hyp_panic
+ get_loaded_vcpu x0, x1
+ cbz x0, hyp_panic
+
+ // The hyp context is saved so make sure it is restored to allow
+ // hyp_panic to run at hyp and, subsequently, panic to run in the host.
+ // This makes use of __guest_exit to avoid duplication but sets the
+ // return address to tail call into hyp_panic. As a side effect, the
+ // current state is saved to the guest context but it will only be
+ // accurate if the guest had been completely restored.
+ adr_this_cpu x0, kvm_hyp_ctxt, x1
+ adr x1, hyp_panic
+ str x1, [x0, #CPU_XREG_OFFSET(30)]
+
+ get_vcpu_ptr x1, x0
+
SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// x0: return code
// x1: vcpu
@@ -148,21 +131,23 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// Store the guest's sp_el0
save_sp_el0 x1, x2
- get_host_ctxt x2, x3
+ adr_this_cpu x2, kvm_hyp_ctxt, x3
- // Macro ptrauth_switch_to_guest format:
- // ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3)
+ // Macro ptrauth_switch_to_hyp format:
+ // ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3)
// The below macro to save/restore keys is not implemented in C code
// as it may cause Pointer Authentication key signing mismatch errors
// when this feature is enabled for kernel code.
- ptrauth_switch_to_host x1, x2, x3, x4, x5
+ ptrauth_switch_to_hyp x1, x2, x3, x4, x5
- // Restore the hosts's sp_el0
+ // Restore hyp's sp_el0
restore_sp_el0 x2, x3
- // Now restore the host regs
+ // Now restore the hyp regs
restore_callee_saved_regs x2
+ set_loaded_vcpu xzr, x1, x2
+
alternative_if ARM64_HAS_RAS_EXTN
// If we have the RAS extensions we can consume a pending error
// without an unmask-SError and isb. The ESB-instruction consumed any
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 46b4dab933d0..0a5b36eb54b3 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -12,7 +12,6 @@
#include <asm/cpufeature.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
#include <asm/mmu.h>
.macro save_caller_saved_regs_vect
@@ -41,20 +40,6 @@
.text
-.macro do_el2_call
- /*
- * Shuffle the parameters before calling the function
- * pointed to in x0. Assumes parameters in x[1,2,3].
- */
- str lr, [sp, #-16]!
- mov lr, x0
- mov x0, x1
- mov x1, x2
- mov x2, x3
- blr lr
- ldr lr, [sp], #16
-.endm
-
el1_sync: // Guest trapped into EL2
mrs x0, esr_el2
@@ -63,44 +48,6 @@ el1_sync: // Guest trapped into EL2
ccmp x0, #ESR_ELx_EC_HVC32, #4, ne
b.ne el1_trap
-#ifdef __KVM_NVHE_HYPERVISOR__
- mrs x1, vttbr_el2 // If vttbr is valid, the guest
- cbnz x1, el1_hvc_guest // called HVC
-
- /* Here, we're pretty sure the host called HVC. */
- ldp x0, x1, [sp], #16
-
- /* Check for a stub HVC call */
- cmp x0, #HVC_STUB_HCALL_NR
- b.hs 1f
-
- /*
- * Compute the idmap address of __kvm_handle_stub_hvc and
- * jump there. Since we use kimage_voffset, do not use the
- * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
- * (by loading it from the constant pool).
- *
- * Preserve x0-x4, which may contain stub parameters.
- */
- ldr x5, =__kvm_handle_stub_hvc
- ldr_l x6, kimage_voffset
-
- /* x5 = __pa(x5) */
- sub x5, x5, x6
- br x5
-
-1:
- /*
- * Perform the EL2 call
- */
- kern_hyp_va x0
- do_el2_call
-
- eret
- sb
-#endif /* __KVM_NVHE_HYPERVISOR__ */
-
-el1_hvc_guest:
/*
* Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1.
* The workaround has already been applied on the host,
@@ -116,35 +63,6 @@ el1_hvc_guest:
ARM_SMCCC_ARCH_WORKAROUND_2)
cbnz w1, el1_trap
-#ifdef CONFIG_ARM64_SSBD
-alternative_cb arm64_enable_wa2_handling
- b wa2_end
-alternative_cb_end
- get_vcpu_ptr x2, x0
- ldr x0, [x2, #VCPU_WORKAROUND_FLAGS]
-
- // Sanitize the argument and update the guest flags
- ldr x1, [sp, #8] // Guest's x1
- clz w1, w1 // Murphy's device:
- lsr w1, w1, #5 // w1 = !!w1 without using
- eor w1, w1, #1 // the flags...
- bfi x0, x1, #VCPU_WORKAROUND_2_FLAG_SHIFT, #1
- str x0, [x2, #VCPU_WORKAROUND_FLAGS]
-
- /* Check that we actually need to perform the call */
- hyp_ldr_this_cpu x0, arm64_ssbd_callback_required, x2
- cbz x0, wa2_end
-
- mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2
- smc #0
-
- /* Don't leak data from the SMC call */
- mov x3, xzr
-wa2_end:
- mov x2, xzr
- mov x1, xzr
-#endif
-
wa_epilogue:
mov x0, xzr
add sp, sp, #16
@@ -198,24 +116,7 @@ el2_error:
eret
sb
-#ifdef __KVM_NVHE_HYPERVISOR__
-SYM_FUNC_START(__hyp_do_panic)
- mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
- PSR_MODE_EL1h)
- msr spsr_el2, lr
- ldr lr, =panic
- msr elr_el2, lr
- eret
- sb
-SYM_FUNC_END(__hyp_do_panic)
-#endif
-
-SYM_CODE_START(__hyp_panic)
- get_host_ctxt x0, x1
- b hyp_panic
-SYM_CODE_END(__hyp_panic)
-
-.macro invalid_vector label, target = __hyp_panic
+.macro invalid_vector label, target = __guest_exit_panic
.align 2
SYM_CODE_START(\label)
b \target
@@ -227,7 +128,6 @@ SYM_CODE_END(\label)
invalid_vector el2t_irq_invalid
invalid_vector el2t_fiq_invalid
invalid_vector el2t_error_invalid
- invalid_vector el2h_sync_invalid
invalid_vector el2h_irq_invalid
invalid_vector el2h_fiq_invalid
invalid_vector el1_fiq_invalid
@@ -257,10 +157,9 @@ check_preamble_length 661b, 662b
.macro invalid_vect target
.align 7
661:
- b \target
nop
+ stp x0, x1, [sp, #-16]!
662:
- ldp x0, x1, [sp], #16
b \target
check_preamble_length 661b, 662b
@@ -288,7 +187,6 @@ SYM_CODE_START(__kvm_hyp_vector)
valid_vect el1_error // Error 32-bit EL1
SYM_CODE_END(__kvm_hyp_vector)
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
.macro hyp_ventry
.align 7
1: esb
@@ -338,4 +236,3 @@ SYM_CODE_START(__bp_harden_hyp_vecs)
1: .org __bp_harden_hyp_vecs + __BP_HARDEN_HYP_VECS_SZ
.org 1b
SYM_CODE_END(__bp_harden_hyp_vecs)
-#endif
diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
index 5e28ea6aa097..4ebe9f558f3a 100644
--- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
@@ -135,7 +135,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
return;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
guest_ctxt = &vcpu->arch.ctxt;
host_dbg = &vcpu->arch.host_debug_state.regs;
guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
@@ -154,7 +154,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
return;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
guest_ctxt = &vcpu->arch.ctxt;
host_dbg = &vcpu->arch.host_debug_state.regs;
guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 0261308bf944..313a8fa3c721 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -126,11 +126,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
}
}
-static inline void __activate_vm(struct kvm_s2_mmu *mmu)
-{
- __load_guest_stage2(mmu);
-}
-
static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
{
u64 par, tmp;
@@ -377,6 +372,8 @@ static inline bool esr_is_ptrauth_trap(u32 esr)
ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \
} while(0)
+DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+
static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *ctxt;
@@ -386,7 +383,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
!esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
return false;
- ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ ctxt = this_cpu_ptr(&kvm_hyp_ctxt);
__ptrauth_save_key(ctxt, APIA);
__ptrauth_save_key(ctxt, APIB);
__ptrauth_save_key(ctxt, APDA);
@@ -479,49 +476,15 @@ exit:
return false;
}
-static inline bool __needs_ssbd_off(struct kvm_vcpu *vcpu)
-{
- if (!cpus_have_final_cap(ARM64_SSBD))
- return false;
-
- return !(vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG);
-}
-
-static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_ARM64_SSBD
- /*
- * The host runs with the workaround always present. If the
- * guest wants it disabled, so be it...
- */
- if (__needs_ssbd_off(vcpu) &&
- __hyp_this_cpu_read(arm64_ssbd_callback_required))
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL);
-#endif
-}
-
-static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_ARM64_SSBD
- /*
- * If the guest has disabled the workaround, bring it back on.
- */
- if (__needs_ssbd_off(vcpu) &&
- __hyp_this_cpu_read(arm64_ssbd_callback_required))
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL);
-#endif
-}
-
static inline void __kvm_unexpected_el2_exception(void)
{
+ extern char __guest_exit_panic[];
unsigned long addr, fixup;
- struct kvm_cpu_context *host_ctxt;
struct exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
entry = hyp_symbol_addr(__start___kvm_ex_table);
end = hyp_symbol_addr(__stop___kvm_ex_table);
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
while (entry < end) {
addr = (unsigned long)&entry->insn + entry->insn;
@@ -536,7 +499,8 @@ static inline void __kvm_unexpected_el2_exception(void)
return;
}
- hyp_panic(host_ctxt);
+ /* Trigger a panic after restoring the hyp context. */
+ write_sysreg(__guest_exit_panic, elr_el2);
}
#endif /* __ARM64_KVM_HYP_SWITCH_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore
new file mode 100644
index 000000000000..695d73d0249e
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+hyp.lds
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index aef76487edc2..ddde15fe85f2 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -6,44 +6,50 @@
asflags-y := -D__KVM_NVHE_HYPERVISOR__
ccflags-y := -D__KVM_NVHE_HYPERVISOR__
-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o
+obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o
-obj-y := $(patsubst %.o,%.hyp.o,$(obj-y))
-extra-y := $(patsubst %.hyp.o,%.hyp.tmp.o,$(obj-y))
+##
+## Build rules for compiling nVHE hyp code
+## Output of this folder is `kvm_nvhe.o`, a partially linked object
+## file containing all nVHE hyp code and data.
+##
-$(obj)/%.hyp.tmp.o: $(src)/%.c FORCE
+hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y))
+obj-y := kvm_nvhe.o
+extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds
+
+# 1) Compile all source files to `.nvhe.o` object files. The file extension
+# avoids file name clashes for files shared with VHE.
+$(obj)/%.nvhe.o: $(src)/%.c FORCE
$(call if_changed_rule,cc_o_c)
-$(obj)/%.hyp.tmp.o: $(src)/%.S FORCE
+$(obj)/%.nvhe.o: $(src)/%.S FORCE
$(call if_changed_rule,as_o_S)
-$(obj)/%.hyp.o: $(obj)/%.hyp.tmp.o FORCE
- $(call if_changed,hypcopy)
-# Disable reordering functions by GCC (enabled at -O2).
-# This pass puts functions into '.text.*' sections to aid the linker
-# in optimizing ELF layout. See HYPCOPY comment below for more info.
-ccflags-y += $(call cc-option,-fno-reorder-functions)
+# 2) Compile linker script.
+$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE
+ $(call if_changed_dep,cpp_lds_S)
+
+# 3) Partially link all '.nvhe.o' files and apply the linker script.
+# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'.
+# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before
+# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to
+# keep the dependency on the target while avoiding an error from
+# GNU ld if the linker script is passed to it twice.
+LDFLAGS_kvm_nvhe.tmp.o := -r -T
+$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
+ $(call if_changed,ld)
+
+# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
+# Prefixes names of ELF symbols with '__kvm_nvhe_'.
+$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE
+ $(call if_changed,hypcopy)
# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
-# and relevant ELF section names to avoid clashes with VHE code/data.
-#
-# Hyp code is assumed to be in the '.text' section of the input object
-# files (with the exception of specialized sections such as
-# '.hyp.idmap.text'). This assumption may be broken by a compiler that
-# divides code into sections like '.text.unlikely' so as to optimize
-# ELF layout. HYPCOPY checks that no such sections exist in the input
-# using `objdump`, otherwise they would be linked together with other
-# kernel code and not memory-mapped correctly at runtime.
+# to avoid clashes with VHE code/data.
quiet_cmd_hypcopy = HYPCOPY $@
- cmd_hypcopy = \
- if $(OBJDUMP) -h $< | grep -F '.text.'; then \
- echo "$@: function reordering not supported in nVHE hyp code" >&2; \
- /bin/false; \
- fi; \
- $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ \
- --rename-section=.text=.hyp.text \
- $< $@
+ cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
# Remove ftrace and Shadow Call Stack CFLAGS.
# This is equivalent to the 'notrace' and '__noscs' annotations.
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
new file mode 100644
index 000000000000..ff9a0f547b9f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/assembler.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+ .text
+
+SYM_FUNC_START(__host_exit)
+ stp x0, x1, [sp, #-16]!
+
+ get_host_ctxt x0, x1
+
+ ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
+
+ /* Store the host regs x2 and x3 */
+ stp x2, x3, [x0, #CPU_XREG_OFFSET(2)]
+
+ /* Retrieve the host regs x0-x1 from the stack */
+ ldp x2, x3, [sp], #16 // x0, x1
+
+ /* Store the host regs x0-x1 and x4-x17 */
+ stp x2, x3, [x0, #CPU_XREG_OFFSET(0)]
+ stp x4, x5, [x0, #CPU_XREG_OFFSET(4)]
+ stp x6, x7, [x0, #CPU_XREG_OFFSET(6)]
+ stp x8, x9, [x0, #CPU_XREG_OFFSET(8)]
+ stp x10, x11, [x0, #CPU_XREG_OFFSET(10)]
+ stp x12, x13, [x0, #CPU_XREG_OFFSET(12)]
+ stp x14, x15, [x0, #CPU_XREG_OFFSET(14)]
+ stp x16, x17, [x0, #CPU_XREG_OFFSET(16)]
+
+ /* Store the host regs x18-x29, lr */
+ save_callee_saved_regs x0
+
+ /* Save the host context pointer in x29 across the function call */
+ mov x29, x0
+ bl handle_trap
+
+ /* Restore host regs x0-x17 */
+ ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
+ ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
+ ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
+ ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)]
+
+ /* x0-7 are use for panic arguments */
+__host_enter_for_panic:
+ ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)]
+ ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)]
+ ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)]
+ ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)]
+ ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)]
+
+ /* Restore host regs x18-x29, lr */
+ restore_callee_saved_regs x29
+
+ /* Do not touch any register after this! */
+__host_enter_without_restoring:
+ eret
+ sb
+SYM_FUNC_END(__host_exit)
+
+/*
+ * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
+ */
+SYM_FUNC_START(__hyp_do_panic)
+ /* Load the format arguments into x1-7 */
+ mov x6, x3
+ get_vcpu_ptr x7, x3
+
+ mrs x3, esr_el2
+ mrs x4, far_el2
+ mrs x5, hpfar_el2
+
+ /* Prepare and exit to the host's panic funciton. */
+ mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
+ PSR_MODE_EL1h)
+ msr spsr_el2, lr
+ ldr lr, =panic
+ msr elr_el2, lr
+
+ /*
+ * Set the panic format string and enter the host, conditionally
+ * restoring the host context.
+ */
+ cmp x0, xzr
+ ldr x0, =__hyp_panic_string
+ b.eq __host_enter_without_restoring
+ b __host_enter_for_panic
+SYM_FUNC_END(__hyp_do_panic)
+
+.macro host_el1_sync_vect
+ .align 7
+.L__vect_start\@:
+ stp x0, x1, [sp, #-16]!
+ mrs x0, esr_el2
+ lsr x0, x0, #ESR_ELx_EC_SHIFT
+ cmp x0, #ESR_ELx_EC_HVC64
+ ldp x0, x1, [sp], #16
+ b.ne __host_exit
+
+ /* Check for a stub HVC call */
+ cmp x0, #HVC_STUB_HCALL_NR
+ b.hs __host_exit
+
+ /*
+ * Compute the idmap address of __kvm_handle_stub_hvc and
+ * jump there. Since we use kimage_voffset, do not use the
+ * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
+ * (by loading it from the constant pool).
+ *
+ * Preserve x0-x4, which may contain stub parameters.
+ */
+ ldr x5, =__kvm_handle_stub_hvc
+ ldr_l x6, kimage_voffset
+
+ /* x5 = __pa(x5) */
+ sub x5, x5, x6
+ br x5
+.L__vect_end\@:
+.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
+ .error "host_el1_sync_vect larger than vector entry"
+.endif
+.endm
+
+.macro invalid_host_el2_vect
+ .align 7
+ /* If a guest is loaded, panic out of it. */
+ stp x0, x1, [sp, #-16]!
+ get_loaded_vcpu x0, x1
+ cbnz x0, __guest_exit_panic
+ add sp, sp, #16
+
+ /*
+ * The panic may not be clean if the exception is taken before the host
+ * context has been saved by __host_exit or after the hyp context has
+ * been partially clobbered by __host_enter.
+ */
+ b hyp_panic
+.endm
+
+.macro invalid_host_el1_vect
+ .align 7
+ mov x0, xzr /* restore_host = false */
+ mrs x1, spsr_el2
+ mrs x2, elr_el2
+ mrs x3, par_el1
+ b __hyp_do_panic
+.endm
+
+/*
+ * The host vector does not use an ESB instruction in order to avoid consuming
+ * SErrors that should only be consumed by the host. Guest entry is deferred by
+ * __guest_enter if there are any pending asynchronous exceptions so hyp will
+ * always return to the host without having consumerd host SErrors.
+ *
+ * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the
+ * host knows about the EL2 vectors already, and there is no point in hiding
+ * them.
+ */
+ .align 11
+SYM_CODE_START(__kvm_hyp_host_vector)
+ invalid_host_el2_vect // Synchronous EL2t
+ invalid_host_el2_vect // IRQ EL2t
+ invalid_host_el2_vect // FIQ EL2t
+ invalid_host_el2_vect // Error EL2t
+
+ invalid_host_el2_vect // Synchronous EL2h
+ invalid_host_el2_vect // IRQ EL2h
+ invalid_host_el2_vect // FIQ EL2h
+ invalid_host_el2_vect // Error EL2h
+
+ host_el1_sync_vect // Synchronous 64-bit EL1
+ invalid_host_el1_vect // IRQ 64-bit EL1
+ invalid_host_el1_vect // FIQ 64-bit EL1
+ invalid_host_el1_vect // Error 64-bit EL1
+
+ invalid_host_el1_vect // Synchronous 32-bit EL1
+ invalid_host_el1_vect // IRQ 32-bit EL1
+ invalid_host_el1_vect // FIQ 32-bit EL1
+ invalid_host_el1_vect // Error 32-bit EL1
+SYM_CODE_END(__kvm_hyp_host_vector)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index d9434e90c06d..47224dc62c51 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -4,11 +4,13 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <linux/arm-smccc.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/assembler.h>
#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/pgtable-hwdef.h>
#include <asm/sysreg.h>
@@ -44,27 +46,37 @@ __invalid:
b .
/*
- * x0: HYP pgd
- * x1: HYP stack
- * x2: HYP vectors
- * x3: per-CPU offset
+ * x0: SMCCC function ID
+ * x1: HYP pgd
+ * x2: per-CPU offset
+ * x3: HYP stack
+ * x4: HYP vectors
*/
__do_hyp_init:
/* Check for a stub HVC call */
cmp x0, #HVC_STUB_HCALL_NR
b.lo __kvm_handle_stub_hvc
- phys_to_ttbr x4, x0
+ /* Set tpidr_el2 for use by HYP to free a register */
+ msr tpidr_el2, x2
+
+ mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
+ cmp x0, x2
+ b.eq 1f
+ mov x0, #SMCCC_RET_NOT_SUPPORTED
+ eret
+
+1: phys_to_ttbr x0, x1
alternative_if ARM64_HAS_CNP
- orr x4, x4, #TTBR_CNP_BIT
+ orr x0, x0, #TTBR_CNP_BIT
alternative_else_nop_endif
- msr ttbr0_el2, x4
+ msr ttbr0_el2, x0
- mrs x4, tcr_el1
- mov_q x5, TCR_EL2_MASK
- and x4, x4, x5
- mov x5, #TCR_EL2_RES1
- orr x4, x4, x5
+ mrs x0, tcr_el1
+ mov_q x1, TCR_EL2_MASK
+ and x0, x0, x1
+ mov x1, #TCR_EL2_RES1
+ orr x0, x0, x1
/*
* The ID map may be configured to use an extended virtual address
@@ -80,18 +92,18 @@ alternative_else_nop_endif
*
* So use the same T0SZ value we use for the ID map.
*/
- ldr_l x5, idmap_t0sz
- bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
+ ldr_l x1, idmap_t0sz
+ bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
/*
* Set the PS bits in TCR_EL2.
*/
- tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6
+ tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
- msr tcr_el2, x4
+ msr tcr_el2, x0
- mrs x4, mair_el1
- msr mair_el2, x4
+ mrs x0, mair_el1
+ msr mair_el2, x0
isb
/* Invalidate the stale TLBs from Bootloader */
@@ -103,25 +115,22 @@ alternative_else_nop_endif
* as well as the EE bit on BE. Drop the A flag since the compiler
* is allowed to generate unaligned accesses.
*/
- mov_q x4, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
-CPU_BE( orr x4, x4, #SCTLR_ELx_EE)
+ mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
+CPU_BE( orr x0, x0, #SCTLR_ELx_EE)
alternative_if ARM64_HAS_ADDRESS_AUTH
- mov_q x5, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
+ mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
- orr x4, x4, x5
+ orr x0, x0, x1
alternative_else_nop_endif
- msr sctlr_el2, x4
+ msr sctlr_el2, x0
isb
/* Set the stack and new vectors */
- kern_hyp_va x1
- mov sp, x1
- msr vbar_el2, x2
-
- /* Set tpidr_el2 for use by HYP */
- msr tpidr_el2, x3
+ mov sp, x3
+ msr vbar_el2, x4
/* Hello, World! */
+ mov x0, #SMCCC_RET_SUCCESS
eret
SYM_CODE_END(__kvm_hyp_init)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
new file mode 100644
index 000000000000..e2eafe2c93af
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <hyp/switch.h>
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+#include <kvm/arm_hypercalls.h>
+
+static void handle_host_hcall(unsigned long func_id,
+ struct kvm_cpu_context *host_ctxt)
+{
+ unsigned long ret = 0;
+
+ switch (func_id) {
+ case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1;
+
+ ret = __kvm_vcpu_run(kern_hyp_va(vcpu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context):
+ __kvm_flush_vm_context();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+ phys_addr_t ipa = host_ctxt->regs.regs[2];
+ int level = host_ctxt->regs.regs[3];
+
+ __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+
+ __kvm_tlb_flush_vmid(kern_hyp_va(mmu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+
+ __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): {
+ u64 cntvoff = host_ctxt->regs.regs[1];
+
+ __kvm_timer_set_cntvoff(cntvoff);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs):
+ __kvm_enable_ssbs();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2):
+ ret = __vgic_v3_get_ich_vtr_el2();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr):
+ ret = __vgic_v3_read_vmcr();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): {
+ u32 vmcr = host_ctxt->regs.regs[1];
+
+ __vgic_v3_write_vmcr(vmcr);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs):
+ __vgic_v3_init_lrs();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2):
+ ret = __kvm_get_mdcr_el2();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+
+ __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+
+ __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
+ break;
+ }
+ default:
+ /* Invalid host HVC. */
+ host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED;
+ return;
+ }
+
+ host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS;
+ host_ctxt->regs.regs[1] = ret;
+}
+
+void handle_trap(struct kvm_cpu_context *host_ctxt)
+{
+ u64 esr = read_sysreg_el2(SYS_ESR);
+ unsigned long func_id;
+
+ if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)
+ hyp_panic();
+
+ func_id = host_ctxt->regs.regs[0];
+ handle_host_hcall(func_id, host_ctxt);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
new file mode 100644
index 000000000000..bb2d986ff696
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 Google LLC.
+ * Written by David Brazdil <dbrazdil@google.com>
+ *
+ * Linker script used for partial linking of nVHE EL2 object files.
+ */
+
+#include <asm/hyp_image.h>
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/memory.h>
+
+SECTIONS {
+ HYP_SECTION(.text)
+ HYP_SECTION_NAME(.data..percpu) : {
+ PERCPU_INPUT(L1_CACHE_BYTES)
+ }
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 0970442d2dbc..a457a0306e03 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -27,6 +27,11 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
+/* Non-VHE specific context */
+DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
+
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -42,6 +47,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
}
write_sysreg(val, cptr_el2);
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
@@ -60,6 +66,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
+ extern char __kvm_hyp_host_vector[];
u64 mdcr_el2;
___deactivate_traps(vcpu);
@@ -91,9 +98,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(mdcr_el2, mdcr_el2);
write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
+ write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
-static void __deactivate_vm(struct kvm_vcpu *vcpu)
+static void __load_host_stage2(void)
{
write_sysreg(0, vttbr_el2);
}
@@ -173,9 +181,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
pmr_sync();
}
- vcpu = kern_hyp_va(vcpu);
-
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
@@ -194,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__sysreg32_restore_state(vcpu);
__sysreg_restore_state_nvhe(guest_ctxt);
- __activate_vm(kern_hyp_va(vcpu->arch.hw_mmu));
+ __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu));
__activate_traps(vcpu);
__hyp_vgic_restore_state(vcpu);
@@ -202,24 +208,20 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__debug_switch_to_guest(vcpu);
- __set_guest_arch_workaround_state(vcpu);
-
do {
/* Jump in the fire! */
- exit_code = __guest_enter(vcpu, host_ctxt);
+ exit_code = __guest_enter(vcpu);
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
- __set_host_arch_workaround_state(vcpu);
-
__sysreg_save_state_nvhe(guest_ctxt);
__sysreg32_save_state(vcpu);
__timer_disable_traps(vcpu);
__hyp_vgic_save_state(vcpu);
__deactivate_traps(vcpu);
- __deactivate_vm(vcpu);
+ __load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
@@ -239,35 +241,31 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
if (system_uses_irq_prio_masking())
gic_write_pmr(GIC_PRIO_IRQOFF);
+ host_ctxt->__hyp_running_vcpu = NULL;
+
return exit_code;
}
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
+void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
- struct kvm_vcpu *vcpu = host_ctxt->__hyp_running_vcpu;
- unsigned long str_va;
+ bool restore_host = true;
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ vcpu = host_ctxt->__hyp_running_vcpu;
- if (read_sysreg(vttbr_el2)) {
+ if (vcpu) {
__timer_disable_traps(vcpu);
__deactivate_traps(vcpu);
- __deactivate_vm(vcpu);
+ __load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
}
- /*
- * Force the panic string to be loaded from the literal pool,
- * making sure it is a kernel address and not a PC-relative
- * reference.
- */
- asm volatile("ldr %0, =%1" : "=r" (str_va) : "S" (__hyp_panic_string));
-
- __hyp_do_panic(str_va,
- spsr, elr,
- read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR),
- read_sysreg(hpfar_el2), par, vcpu);
+ __hyp_do_panic(restore_host, spsr, elr, par);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 69eae608d670..544bca3072b7 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -54,7 +54,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ishst);
/* Switch to requested VMID */
- mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt);
/*
@@ -108,7 +107,6 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
dsb(ishst);
/* Switch to requested VMID */
- mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt);
__tlbi(vmalls12e1is);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
new file mode 100644
index 000000000000..0cdf6e461cbd
--- /dev/null
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -0,0 +1,892 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
+ * No bombay mix was harmed in the writing of this file.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#include <linux/bitfield.h>
+#include <asm/kvm_pgtable.h>
+
+#define KVM_PGTABLE_MAX_LEVELS 4U
+
+#define KVM_PTE_VALID BIT(0)
+
+#define KVM_PTE_TYPE BIT(1)
+#define KVM_PTE_TYPE_BLOCK 0
+#define KVM_PTE_TYPE_PAGE 1
+#define KVM_PTE_TYPE_TABLE 1
+
+#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
+#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
+
+#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
+
+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
+#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
+
+#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
+
+struct kvm_pgtable_walk_data {
+ struct kvm_pgtable *pgt;
+ struct kvm_pgtable_walker *walker;
+
+ u64 addr;
+ u64 end;
+};
+
+static u64 kvm_granule_shift(u32 level)
+{
+ /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
+ return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
+}
+
+static u64 kvm_granule_size(u32 level)
+{
+ return BIT(kvm_granule_shift(level));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+ u64 granule = kvm_granule_size(level);
+
+ /*
+ * Reject invalid block mappings and don't bother with 4TB mappings for
+ * 52-bit PAs.
+ */
+ if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+ return false;
+
+ if (granule > (end - addr))
+ return false;
+
+ return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+}
+
+static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
+{
+ u64 shift = kvm_granule_shift(level);
+ u64 mask = BIT(PAGE_SHIFT - 3) - 1;
+
+ return (data->addr >> shift) & mask;
+}
+
+static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
+{
+ u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
+ u64 mask = BIT(pgt->ia_bits) - 1;
+
+ return (addr & mask) >> shift;
+}
+
+static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
+{
+ return __kvm_pgd_page_idx(data->pgt, data->addr);
+}
+
+static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
+{
+ struct kvm_pgtable pgt = {
+ .ia_bits = ia_bits,
+ .start_level = start_level,
+ };
+
+ return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
+}
+
+static bool kvm_pte_valid(kvm_pte_t pte)
+{
+ return pte & KVM_PTE_VALID;
+}
+
+static bool kvm_pte_table(kvm_pte_t pte, u32 level)
+{
+ if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+ return false;
+
+ if (!kvm_pte_valid(pte))
+ return false;
+
+ return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
+}
+
+static u64 kvm_pte_to_phys(kvm_pte_t pte)
+{
+ u64 pa = pte & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
+
+ return pa;
+}
+
+static kvm_pte_t kvm_phys_to_pte(u64 pa)
+{
+ kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
+
+ return pte;
+}
+
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+{
+ return __va(kvm_pte_to_phys(pte));
+}
+
+static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+{
+ kvm_pte_t pte = *ptep;
+ WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+}
+
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+{
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+
+ pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
+ pte |= KVM_PTE_VALID;
+
+ WARN_ON(kvm_pte_valid(old));
+ smp_store_release(ptep, pte);
+}
+
+static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
+ u32 level)
+{
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+ u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
+ KVM_PTE_TYPE_BLOCK;
+
+ pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
+ pte |= FIELD_PREP(KVM_PTE_TYPE, type);
+ pte |= KVM_PTE_VALID;
+
+ /* Tolerate KVM recreating the exact same mapping. */
+ if (kvm_pte_valid(old))
+ return old == pte;
+
+ smp_store_release(ptep, pte);
+ return true;
+}
+
+static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
+ u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag)
+{
+ struct kvm_pgtable_walker *walker = data->walker;
+ return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *pgtable, u32 level);
+
+static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *ptep, u32 level)
+{
+ int ret = 0;
+ u64 addr = data->addr;
+ kvm_pte_t *childp, pte = *ptep;
+ bool table = kvm_pte_table(pte, level);
+ enum kvm_pgtable_walk_flags flags = data->walker->flags;
+
+ if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_TABLE_PRE);
+ }
+
+ if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_LEAF);
+ pte = *ptep;
+ table = kvm_pte_table(pte, level);
+ }
+
+ if (ret)
+ goto out;
+
+ if (!table) {
+ data->addr += kvm_granule_size(level);
+ goto out;
+ }
+
+ childp = kvm_pte_follow(pte);
+ ret = __kvm_pgtable_walk(data, childp, level + 1);
+ if (ret)
+ goto out;
+
+ if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_TABLE_POST);
+ }
+
+out:
+ return ret;
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *pgtable, u32 level)
+{
+ u32 idx;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
+ return -EINVAL;
+
+ for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
+ kvm_pte_t *ptep = &pgtable[idx];
+
+ if (data->addr >= data->end)
+ break;
+
+ ret = __kvm_pgtable_visit(data, ptep, level);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
+{
+ u32 idx;
+ int ret = 0;
+ struct kvm_pgtable *pgt = data->pgt;
+ u64 limit = BIT(pgt->ia_bits);
+
+ if (data->addr > limit || data->end > limit)
+ return -ERANGE;
+
+ if (!pgt->pgd)
+ return -EINVAL;
+
+ for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
+ kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
+
+ ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_pgtable_walker *walker)
+{
+ struct kvm_pgtable_walk_data walk_data = {
+ .pgt = pgt,
+ .addr = ALIGN_DOWN(addr, PAGE_SIZE),
+ .end = PAGE_ALIGN(walk_data.addr + size),
+ .walker = walker,
+ };
+
+ return _kvm_pgtable_walk(&walk_data);
+}
+
+struct hyp_map_data {
+ u64 phys;
+ kvm_pte_t attr;
+};
+
+static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
+ struct hyp_map_data *data)
+{
+ bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+ u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
+ kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
+ u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
+ u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
+ KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
+
+ if (!(prot & KVM_PGTABLE_PROT_R))
+ return -EINVAL;
+
+ if (prot & KVM_PGTABLE_PROT_X) {
+ if (prot & KVM_PGTABLE_PROT_W)
+ return -EINVAL;
+
+ if (device)
+ return -EINVAL;
+ } else {
+ attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
+ }
+
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
+ attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
+ data->attr = attr;
+ return 0;
+}
+
+static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep, struct hyp_map_data *data)
+{
+ u64 granule = kvm_granule_size(level), phys = data->phys;
+
+ if (!kvm_block_mapping_supported(addr, end, phys, level))
+ return false;
+
+ WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+ data->phys += granule;
+ return true;
+}
+
+static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ kvm_pte_t *childp;
+
+ if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
+ return 0;
+
+ if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+ return -EINVAL;
+
+ childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!childp)
+ return -ENOMEM;
+
+ kvm_set_table_pte(ptep, childp);
+ return 0;
+}
+
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+ enum kvm_pgtable_prot prot)
+{
+ int ret;
+ struct hyp_map_data map_data = {
+ .phys = ALIGN_DOWN(phys, PAGE_SIZE),
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = hyp_map_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = &map_data,
+ };
+
+ ret = hyp_map_set_prot_attr(prot, &map_data);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ dsb(ishst);
+ isb();
+ return ret;
+}
+
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
+{
+ u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
+
+ pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pgt->pgd)
+ return -ENOMEM;
+
+ pgt->ia_bits = va_bits;
+ pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
+ pgt->mmu = NULL;
+ return 0;
+}
+
+static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ free_page((unsigned long)kvm_pte_follow(*ptep));
+ return 0;
+}
+
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = hyp_free_walker,
+ .flags = KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ free_page((unsigned long)pgt->pgd);
+ pgt->pgd = NULL;
+}
+
+struct stage2_map_data {
+ u64 phys;
+ kvm_pte_t attr;
+
+ kvm_pte_t *anchor;
+
+ struct kvm_s2_mmu *mmu;
+ struct kvm_mmu_memory_cache *memcache;
+};
+
+static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
+ struct stage2_map_data *data)
+{
+ bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+ kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
+ PAGE_S2_MEMATTR(NORMAL);
+ u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+
+ if (!(prot & KVM_PGTABLE_PROT_X))
+ attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ else if (device)
+ return -EINVAL;
+
+ if (prot & KVM_PGTABLE_PROT_R)
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+ if (prot & KVM_PGTABLE_PROT_W)
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ data->attr = attr;
+ return 0;
+}
+
+static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ u64 granule = kvm_granule_size(level), phys = data->phys;
+
+ if (!kvm_block_mapping_supported(addr, end, phys, level))
+ return false;
+
+ if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
+ goto out;
+
+ /* There's an existing valid leaf entry, so perform break-before-make */
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+ kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
+out:
+ data->phys += granule;
+ return true;
+}
+
+static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ if (data->anchor)
+ return 0;
+
+ if (!kvm_block_mapping_supported(addr, end, data->phys, level))
+ return 0;
+
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
+ data->anchor = ptep;
+ return 0;
+}
+
+static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ kvm_pte_t *childp, pte = *ptep;
+ struct page *page = virt_to_page(ptep);
+
+ if (data->anchor) {
+ if (kvm_pte_valid(pte))
+ put_page(page);
+
+ return 0;
+ }
+
+ if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
+ goto out_get_page;
+
+ if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+ return -EINVAL;
+
+ if (!data->memcache)
+ return -ENOMEM;
+
+ childp = kvm_mmu_memory_cache_alloc(data->memcache);
+ if (!childp)
+ return -ENOMEM;
+
+ /*
+ * If we've run into an existing block mapping then replace it with
+ * a table. Accesses beyond 'end' that fall within the new table
+ * will be mapped lazily.
+ */
+ if (kvm_pte_valid(pte)) {
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+ put_page(page);
+ }
+
+ kvm_set_table_pte(ptep, childp);
+
+out_get_page:
+ get_page(page);
+ return 0;
+}
+
+static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ int ret = 0;
+
+ if (!data->anchor)
+ return 0;
+
+ free_page((unsigned long)kvm_pte_follow(*ptep));
+ put_page(virt_to_page(ptep));
+
+ if (data->anchor == ptep) {
+ data->anchor = NULL;
+ ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
+ }
+
+ return ret;
+}
+
+/*
+ * This is a little fiddly, as we use all three of the walk flags. The idea
+ * is that the TABLE_PRE callback runs for table entries on the way down,
+ * looking for table entries which we could conceivably replace with a
+ * block entry for this mapping. If it finds one, then it sets the 'anchor'
+ * field in 'struct stage2_map_data' to point at the table entry, before
+ * clearing the entry to zero and descending into the now detached table.
+ *
+ * The behaviour of the LEAF callback then depends on whether or not the
+ * anchor has been set. If not, then we're not using a block mapping higher
+ * up the table and we perform the mapping at the existing leaves instead.
+ * If, on the other hand, the anchor _is_ set, then we drop references to
+ * all valid leaves so that the pages beneath the anchor can be freed.
+ *
+ * Finally, the TABLE_POST callback does nothing if the anchor has not
+ * been set, but otherwise frees the page-table pages while walking back up
+ * the page-table, installing the block entry when it revisits the anchor
+ * pointer and clearing the anchor to NULL.
+ */
+static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ struct stage2_map_data *data = arg;
+
+ switch (flag) {
+ case KVM_PGTABLE_WALK_TABLE_PRE:
+ return stage2_map_walk_table_pre(addr, end, level, ptep, data);
+ case KVM_PGTABLE_WALK_LEAF:
+ return stage2_map_walk_leaf(addr, end, level, ptep, data);
+ case KVM_PGTABLE_WALK_TABLE_POST:
+ return stage2_map_walk_table_post(addr, end, level, ptep, data);
+ }
+
+ return -EINVAL;
+}
+
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ struct kvm_mmu_memory_cache *mc)
+{
+ int ret;
+ struct stage2_map_data map_data = {
+ .phys = ALIGN_DOWN(phys, PAGE_SIZE),
+ .mmu = pgt->mmu,
+ .memcache = mc,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_map_walker,
+ .flags = KVM_PGTABLE_WALK_TABLE_PRE |
+ KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_TABLE_POST,
+ .arg = &map_data,
+ };
+
+ ret = stage2_map_set_prot_attr(prot, &map_data);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ dsb(ishst);
+ return ret;
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+ if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ return;
+
+ __flush_dcache_area(addr, size);
+}
+
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+ u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte);
+ return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ struct kvm_s2_mmu *mmu = arg;
+ kvm_pte_t pte = *ptep, *childp = NULL;
+ bool need_flush = false;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ if (kvm_pte_table(pte, level)) {
+ childp = kvm_pte_follow(pte);
+
+ if (page_count(virt_to_page(childp)) != 1)
+ return 0;
+ } else if (stage2_pte_cacheable(pte)) {
+ need_flush = true;
+ }
+
+ /*
+ * This is similar to the map() path in that we unmap the entire
+ * block entry and rely on the remaining portions being faulted
+ * back lazily.
+ */
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+ put_page(virt_to_page(ptep));
+
+ if (need_flush) {
+ stage2_flush_dcache(kvm_pte_follow(pte),
+ kvm_granule_size(level));
+ }
+
+ if (childp)
+ free_page((unsigned long)childp);
+
+ return 0;
+}
+
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_unmap_walker,
+ .arg = pgt->mmu,
+ .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
+struct stage2_attr_data {
+ kvm_pte_t attr_set;
+ kvm_pte_t attr_clr;
+ kvm_pte_t pte;
+ u32 level;
+};
+
+static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+ struct stage2_attr_data *data = arg;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ data->level = level;
+ data->pte = pte;
+ pte &= ~data->attr_clr;
+ pte |= data->attr_set;
+
+ /*
+ * We may race with the CPU trying to set the access flag here,
+ * but worst-case the access flag update gets lost and will be
+ * set on the next access instead.
+ */
+ if (data->pte != pte)
+ WRITE_ONCE(*ptep, pte);
+
+ return 0;
+}
+
+static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, kvm_pte_t attr_set,
+ kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
+ u32 *level)
+{
+ int ret;
+ kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
+ struct stage2_attr_data data = {
+ .attr_set = attr_set & attr_mask,
+ .attr_clr = attr_clr & attr_mask,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_attr_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ if (ret)
+ return ret;
+
+ if (orig_pte)
+ *orig_pte = data.pte;
+
+ if (level)
+ *level = data.level;
+ return 0;
+}
+
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ return stage2_update_leaf_attrs(pgt, addr, size, 0,
+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
+ NULL, NULL);
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
+ &pte, NULL);
+ dsb(ishst);
+ return pte;
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
+ &pte, NULL);
+ /*
+ * "But where's the TLBI?!", you scream.
+ * "Over in the core code", I sigh.
+ *
+ * See the '->clear_flush_young()' callback on the KVM mmu notifier.
+ */
+ return pte;
+}
+
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
+ return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+}
+
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_prot prot)
+{
+ int ret;
+ u32 level;
+ kvm_pte_t set = 0, clr = 0;
+
+ if (prot & KVM_PGTABLE_PROT_R)
+ set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+ if (prot & KVM_PGTABLE_PROT_W)
+ set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+ if (prot & KVM_PGTABLE_PROT_X)
+ clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+
+ ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
+ if (!ret)
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
+ return ret;
+}
+
+static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+
+ if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
+ return 0;
+
+ stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
+ return 0;
+}
+
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_flush_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ return 0;
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
+{
+ size_t pgd_sz;
+ u64 vtcr = kvm->arch.vtcr;
+ u32 ia_bits = VTCR_EL2_IPA(vtcr);
+ u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+ u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+
+ pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
+ pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO);
+ if (!pgt->pgd)
+ return -ENOMEM;
+
+ pgt->ia_bits = ia_bits;
+ pgt->start_level = start_level;
+ pgt->mmu = &kvm->arch.mmu;
+
+ /* Ensure zeroed PGD pages are visible to the hardware walker */
+ dsb(ishst);
+ return 0;
+}
+
+static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ put_page(virt_to_page(ptep));
+
+ if (kvm_pte_table(pte, level))
+ free_page((unsigned long)kvm_pte_follow(pte));
+
+ return 0;
+}
+
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ size_t pgd_sz;
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_free_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
+ free_pages_exact(pgt->pgd, pgd_sz);
+ pgt->pgd = NULL;
+}
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index c1da4f86ccac..fe69de16dadc 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -28,6 +28,11 @@
const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
+/* VHE specific context */
+DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
+
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -59,7 +64,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
write_sysreg(val, cpacr_el1);
- write_sysreg(kvm_get_hyp_vector(), vbar_el1);
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
}
NOKPROBE_SYMBOL(__activate_traps);
@@ -108,7 +113,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt;
u64 exit_code;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
@@ -120,28 +125,24 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
* HCR_EL2.TGE.
*
* We have already configured the guest's stage 1 translation in
- * kvm_vcpu_load_sysregs_vhe above. We must now call __activate_vm
- * before __activate_traps, because __activate_vm configures
- * stage 2 translation, and __activate_traps clear HCR_EL2.TGE
- * (among other things).
+ * kvm_vcpu_load_sysregs_vhe above. We must now call
+ * __load_guest_stage2 before __activate_traps, because
+ * __load_guest_stage2 configures stage 2 translation, and
+ * __activate_traps clear HCR_EL2.TGE (among other things).
*/
- __activate_vm(vcpu->arch.hw_mmu);
+ __load_guest_stage2(vcpu->arch.hw_mmu);
__activate_traps(vcpu);
sysreg_restore_guest_state_vhe(guest_ctxt);
__debug_switch_to_guest(vcpu);
- __set_guest_arch_workaround_state(vcpu);
-
do {
/* Jump in the fire! */
- exit_code = __guest_enter(vcpu, host_ctxt);
+ exit_code = __guest_enter(vcpu);
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
- __set_host_arch_workaround_state(vcpu);
-
sysreg_save_guest_state_vhe(guest_ctxt);
__deactivate_traps(vcpu);
@@ -192,10 +193,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
return ret;
}
-static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
- struct kvm_cpu_context *host_ctxt)
+static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
{
+ struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
vcpu = host_ctxt->__hyp_running_vcpu;
__deactivate_traps(vcpu);
@@ -208,13 +211,13 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
}
NOKPROBE_SYMBOL(__hyp_call_panic);
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
+void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
- __hyp_call_panic(spsr, elr, par, host_ctxt);
+ __hyp_call_panic(spsr, elr, par);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 996471e4c138..2a0b8c88d74f 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -66,7 +66,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
__sysreg_save_user_state(host_ctxt);
/*
@@ -100,7 +100,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
deactivate_traps_vhe_put();
__sysreg_save_el1_state(guest_ctxt);
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 550dfa3e53cd..9824025ccc5c 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -24,27 +24,36 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
feature = smccc_get_arg1(vcpu);
switch (feature) {
case ARM_SMCCC_ARCH_WORKAROUND_1:
- switch (kvm_arm_harden_branch_predictor()) {
- case KVM_BP_HARDEN_UNKNOWN:
+ switch (arm64_get_spectre_v2_state()) {
+ case SPECTRE_VULNERABLE:
break;
- case KVM_BP_HARDEN_WA_NEEDED:
+ case SPECTRE_MITIGATED:
val = SMCCC_RET_SUCCESS;
break;
- case KVM_BP_HARDEN_NOT_REQUIRED:
+ case SPECTRE_UNAFFECTED:
val = SMCCC_RET_NOT_REQUIRED;
break;
}
break;
case ARM_SMCCC_ARCH_WORKAROUND_2:
- switch (kvm_arm_have_ssbd()) {
- case KVM_SSBD_FORCE_DISABLE:
- case KVM_SSBD_UNKNOWN:
+ switch (arm64_get_spectre_v4_state()) {
+ case SPECTRE_VULNERABLE:
break;
- case KVM_SSBD_KERNEL:
- val = SMCCC_RET_SUCCESS;
- break;
- case KVM_SSBD_FORCE_ENABLE:
- case KVM_SSBD_MITIGATED:
+ case SPECTRE_MITIGATED:
+ /*
+ * SSBS everywhere: Indicate no firmware
+ * support, as the SSBS support will be
+ * indicated to the guest and the default is
+ * safe.
+ *
+ * Otherwise, expose a permanent mitigation
+ * to the guest, and hide SSBS so that the
+ * guest stays protected.
+ */
+ if (cpus_have_final_cap(ARM64_SSBS))
+ break;
+ fallthrough;
+ case SPECTRE_UNAFFECTED:
val = SMCCC_RET_NOT_REQUIRED;
break;
}
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index ebfdfc27b2bd..34a96ab244fa 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -202,6 +202,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
/**
* kvm_inject_undefined - inject an undefined instruction into the guest
+ * @vcpu: The vCPU in which to inject the exception
*
* It is assumed that this code is called from the VCPU thread and that the
* VCPU therefore is not currently executing guest code.
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3d26b47a1343..19aacc7d64de 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -14,6 +14,7 @@
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
@@ -21,9 +22,7 @@
#include "trace.h"
-static pgd_t *boot_hyp_pgd;
-static pgd_t *hyp_pgd;
-static pgd_t *merged_hyp_pgd;
+static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
static unsigned long hyp_idmap_start;
@@ -32,16 +31,42 @@ static phys_addr_t hyp_idmap_vector;
static unsigned long io_map_base;
-#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
-#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
-#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
-
-static bool is_iomap(unsigned long flags)
+/*
+ * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
+ * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
+ * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
+ * long will also starve other vCPUs. We have to also make sure that the page
+ * tables are not freed while we released the lock.
+ */
+static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
+ phys_addr_t end,
+ int (*fn)(struct kvm_pgtable *, u64, u64),
+ bool resched)
{
- return flags & KVM_S2PTE_FLAG_IS_IOMAP;
+ int ret;
+ u64 next;
+
+ do {
+ struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ if (!pgt)
+ return -EINVAL;
+
+ next = stage2_pgd_addr_end(kvm, addr, end);
+ ret = fn(pgt, addr, next - addr);
+ if (ret)
+ break;
+
+ if (resched && next != end)
+ cond_resched_lock(&kvm->mmu_lock);
+ } while (addr = next, addr != end);
+
+ return ret;
}
+#define stage2_apply_range_resched(kvm, addr, end, fn) \
+ stage2_apply_range(kvm, addr, end, fn, true)
+
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -58,154 +83,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
}
-static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
- int level)
-{
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
-}
-
-/*
- * D-Cache management functions. They take the page table entries by
- * value, as they are flushing the cache using the kernel mapping (or
- * kmap on 32bit).
- */
-static void kvm_flush_dcache_pte(pte_t pte)
-{
- __kvm_flush_dcache_pte(pte);
-}
-
-static void kvm_flush_dcache_pmd(pmd_t pmd)
-{
- __kvm_flush_dcache_pmd(pmd);
-}
-
-static void kvm_flush_dcache_pud(pud_t pud)
-{
- __kvm_flush_dcache_pud(pud);
-}
-
static bool kvm_is_device_pfn(unsigned long pfn)
{
return !pfn_valid(pfn);
}
-/**
- * stage2_dissolve_pmd() - clear and flush huge PMD entry
- * @mmu: pointer to mmu structure to operate on
- * @addr: IPA
- * @pmd: pmd pointer for IPA
- *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
-{
- if (!pmd_thp_or_huge(*pmd))
- return;
-
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
- put_page(virt_to_page(pmd));
-}
-
-/**
- * stage2_dissolve_pud() - clear and flush huge PUD entry
- * @mmu: pointer to mmu structure to operate on
- * @addr: IPA
- * @pud: pud pointer for IPA
- *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
-{
- struct kvm *kvm = mmu->kvm;
-
- if (!stage2_pud_huge(kvm, *pudp))
- return;
-
- stage2_pud_clear(kvm, pudp);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- put_page(virt_to_page(pudp));
-}
-
-static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
- stage2_pgd_clear(kvm, pgd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_p4d_free(kvm, p4d_table);
- put_page(virt_to_page(pgd));
-}
-
-static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
- stage2_p4d_clear(kvm, p4d);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_pud_free(kvm, pud_table);
- put_page(virt_to_page(p4d));
-}
-
-static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
-
- VM_BUG_ON(stage2_pud_huge(kvm, *pud));
- stage2_pud_clear(kvm, pud);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_pmd_free(kvm, pmd_table);
- put_page(virt_to_page(pud));
-}
-
-static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
-{
- pte_t *pte_table = pte_offset_kernel(pmd, 0);
- VM_BUG_ON(pmd_thp_or_huge(*pmd));
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- free_page((unsigned long)pte_table);
- put_page(virt_to_page(pmd));
-}
-
-static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
-{
- WRITE_ONCE(*ptep, new_pte);
- dsb(ishst);
-}
-
-static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
-{
- WRITE_ONCE(*pmdp, new_pmd);
- dsb(ishst);
-}
-
-static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
-{
- kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
-}
-
-static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
-{
- WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
- dsb(ishst);
-}
-
-static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
-{
- WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
- dsb(ishst);
-}
-
-static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
-{
-#ifndef __PAGETABLE_P4D_FOLDED
- WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
- dsb(ishst);
-#endif
-}
-
/*
* Unmapping vs dcache management:
*
@@ -223,120 +105,19 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
* end up writing old data to disk.
*
* This is why right after unmapping a page/section and invalidating
- * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
- * the IO subsystem will never hit in the cache.
+ * the corresponding TLBs, we flush to make sure the IO subsystem will
+ * never hit in the cache.
*
* This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
* we then fully enforce cacheability of RAM, no matter what the guest
* does.
*/
-static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
- phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t start_addr = addr;
- pte_t *pte, *start_pte;
-
- start_pte = pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- pte_t old_pte = *pte;
-
- kvm_set_pte(pte, __pte(0));
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
-
- /* No need to invalidate the cache for device mappings */
- if (!kvm_is_device_pfn(pte_pfn(old_pte)))
- kvm_flush_dcache_pte(old_pte);
-
- put_page(virt_to_page(pte));
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-
- if (stage2_pte_table_empty(mmu->kvm, start_pte))
- clear_stage2_pmd_entry(mmu, pmd, start_addr);
-}
-
-static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- pmd_t *pmd, *start_pmd;
-
- start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd)) {
- pmd_t old_pmd = *pmd;
-
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
-
- kvm_flush_dcache_pmd(old_pmd);
-
- put_page(virt_to_page(pmd));
- } else {
- unmap_stage2_ptes(mmu, pmd, addr, next);
- }
- }
- } while (pmd++, addr = next, addr != end);
-
- if (stage2_pmd_table_empty(kvm, start_pmd))
- clear_stage2_pud_entry(mmu, pud, start_addr);
-}
-
-static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- pud_t *pud, *start_pud;
-
- start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud)) {
- pud_t old_pud = *pud;
-
- stage2_pud_clear(kvm, pud);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- kvm_flush_dcache_pud(old_pud);
- put_page(virt_to_page(pud));
- } else {
- unmap_stage2_pmds(mmu, pud, addr, next);
- }
- }
- } while (pud++, addr = next, addr != end);
-
- if (stage2_pud_table_empty(kvm, start_pud))
- clear_stage2_p4d_entry(mmu, p4d, start_addr);
-}
-
-static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- p4d_t *p4d, *start_p4d;
-
- start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- unmap_stage2_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-
- if (stage2_p4d_table_empty(kvm, start_p4d))
- clear_stage2_pgd_entry(mmu, pgd, start_addr);
-}
-
/**
* unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm: The VM pointer
+ * @mmu: The KVM stage-2 MMU pointer
* @start: The intermediate physical base address of the range to unmap
* @size: The size of the area to unmap
+ * @may_block: Whether or not we are permitted to block
*
* Clear a range of stage-2 mappings, lowering the various ref-counts. Must
* be called while holding mmu_lock (unless for freeing the stage2 pgd before
@@ -347,32 +128,12 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
bool may_block)
{
struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- phys_addr_t addr = start, end = start + size;
- phys_addr_t next;
+ phys_addr_t end = start + size;
assert_spin_locked(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- /*
- * Make sure the page table is still active, as another thread
- * could have possibly freed the page table, while we released
- * the lock.
- */
- if (!READ_ONCE(mmu->pgd))
- break;
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (!stage2_pgd_none(kvm, *pgd))
- unmap_stage2_p4ds(mmu, pgd, addr, next);
- /*
- * If the range is too large, release the kvm->mmu_lock
- * to prevent starvation and lockup detector warnings.
- */
- if (may_block && next != end)
- cond_resched_lock(&kvm->mmu_lock);
- } while (pgd++, addr = next, addr != end);
+ WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
+ may_block));
}
static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
@@ -380,89 +141,13 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
__unmap_stage2_range(mmu, start, size, true);
}
-static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
- phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte;
-
- pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
- kvm_flush_dcache_pte(*pte);
- } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd;
- phys_addr_t next;
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd))
- kvm_flush_dcache_pmd(*pmd);
- else
- stage2_flush_ptes(mmu, pmd, addr, next);
- }
- } while (pmd++, addr = next, addr != end);
-}
-
-static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- phys_addr_t next;
-
- pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud))
- kvm_flush_dcache_pud(*pud);
- else
- stage2_flush_pmds(mmu, pud, addr, next);
- }
- } while (pud++, addr = next, addr != end);
-}
-
-static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- phys_addr_t next;
-
- p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- stage2_flush_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-}
-
static void stage2_flush_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- phys_addr_t next;
- pgd_t *pgd;
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (!stage2_pgd_none(kvm, *pgd))
- stage2_flush_p4ds(mmu, pgd, addr, next);
-
- if (next != end)
- cond_resched_lock(&kvm->mmu_lock);
- } while (pgd++, addr = next, addr != end);
+ stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
}
/**
@@ -489,338 +174,28 @@ static void stage2_flush_vm(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
}
-static void clear_hyp_pgd_entry(pgd_t *pgd)
-{
- p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);
- pgd_clear(pgd);
- p4d_free(NULL, p4d_table);
- put_page(virt_to_page(pgd));
-}
-
-static void clear_hyp_p4d_entry(p4d_t *p4d)
-{
- pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);
- VM_BUG_ON(p4d_huge(*p4d));
- p4d_clear(p4d);
- pud_free(NULL, pud_table);
- put_page(virt_to_page(p4d));
-}
-
-static void clear_hyp_pud_entry(pud_t *pud)
-{
- pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
- VM_BUG_ON(pud_huge(*pud));
- pud_clear(pud);
- pmd_free(NULL, pmd_table);
- put_page(virt_to_page(pud));
-}
-
-static void clear_hyp_pmd_entry(pmd_t *pmd)
-{
- pte_t *pte_table = pte_offset_kernel(pmd, 0);
- VM_BUG_ON(pmd_thp_or_huge(*pmd));
- pmd_clear(pmd);
- pte_free_kernel(NULL, pte_table);
- put_page(virt_to_page(pmd));
-}
-
-static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte, *start_pte;
-
- start_pte = pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- kvm_set_pte(pte, __pte(0));
- put_page(virt_to_page(pte));
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-
- if (hyp_pte_table_empty(start_pte))
- clear_hyp_pmd_entry(pmd);
-}
-
-static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- pmd_t *pmd, *start_pmd;
-
- start_pmd = pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- /* Hyp doesn't use huge pmds */
- if (!pmd_none(*pmd))
- unmap_hyp_ptes(pmd, addr, next);
- } while (pmd++, addr = next, addr != end);
-
- if (hyp_pmd_table_empty(start_pmd))
- clear_hyp_pud_entry(pud);
-}
-
-static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- pud_t *pud, *start_pud;
-
- start_pud = pud = pud_offset(p4d, addr);
- do {
- next = pud_addr_end(addr, end);
- /* Hyp doesn't use huge puds */
- if (!pud_none(*pud))
- unmap_hyp_pmds(pud, addr, next);
- } while (pud++, addr = next, addr != end);
-
- if (hyp_pud_table_empty(start_pud))
- clear_hyp_p4d_entry(p4d);
-}
-
-static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- p4d_t *p4d, *start_p4d;
-
- start_p4d = p4d = p4d_offset(pgd, addr);
- do {
- next = p4d_addr_end(addr, end);
- /* Hyp doesn't use huge p4ds */
- if (!p4d_none(*p4d))
- unmap_hyp_puds(p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-
- if (hyp_p4d_table_empty(start_p4d))
- clear_hyp_pgd_entry(pgd);
-}
-
-static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
-{
- return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
-}
-
-static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
- phys_addr_t start, u64 size)
-{
- pgd_t *pgd;
- phys_addr_t addr = start, end = start + size;
- phys_addr_t next;
-
- /*
- * We don't unmap anything from HYP, except at the hyp tear down.
- * Hence, we don't have to invalidate the TLBs here.
- */
- pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
- do {
- next = pgd_addr_end(addr, end);
- if (!pgd_none(*pgd))
- unmap_hyp_p4ds(pgd, addr, next);
- } while (pgd++, addr = next, addr != end);
-}
-
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
- __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
-}
-
-static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
- __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
-}
-
/**
* free_hyp_pgds - free Hyp-mode page tables
- *
- * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
- * therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the idmap range.
- *
- * boot_hyp_pgd should only map the idmap range, and is only used in
- * the extended idmap case.
*/
void free_hyp_pgds(void)
{
- pgd_t *id_pgd;
-
mutex_lock(&kvm_hyp_pgd_mutex);
-
- id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
-
- if (id_pgd) {
- /* In case we never called hyp_mmu_init() */
- if (!io_map_base)
- io_map_base = hyp_idmap_start;
- unmap_hyp_idmap_range(id_pgd, io_map_base,
- hyp_idmap_start + PAGE_SIZE - io_map_base);
+ if (hyp_pgtable) {
+ kvm_pgtable_hyp_destroy(hyp_pgtable);
+ kfree(hyp_pgtable);
}
-
- if (boot_hyp_pgd) {
- free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
- boot_hyp_pgd = NULL;
- }
-
- if (hyp_pgd) {
- unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
- (uintptr_t)high_memory - PAGE_OFFSET);
-
- free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
- hyp_pgd = NULL;
- }
- if (merged_hyp_pgd) {
- clear_page(merged_hyp_pgd);
- free_page((unsigned long)merged_hyp_pgd);
- merged_hyp_pgd = NULL;
- }
-
mutex_unlock(&kvm_hyp_pgd_mutex);
}
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pte_t *pte;
- unsigned long addr;
-
- addr = start;
- do {
- pte = pte_offset_kernel(pmd, addr);
- kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
- get_page(virt_to_page(pte));
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
-}
-
-static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pmd_t *pmd;
- pte_t *pte;
- unsigned long addr, next;
-
- addr = start;
- do {
- pmd = pmd_offset(pud, addr);
-
- BUG_ON(pmd_sect(*pmd));
-
- if (pmd_none(*pmd)) {
- pte = pte_alloc_one_kernel(NULL);
- if (!pte) {
- kvm_err("Cannot allocate Hyp pte\n");
- return -ENOMEM;
- }
- kvm_pmd_populate(pmd, pte);
- get_page(virt_to_page(pmd));
- }
-
- next = pmd_addr_end(addr, end);
-
- create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pud_t *pud;
- pmd_t *pmd;
- unsigned long addr, next;
- int ret;
-
- addr = start;
- do {
- pud = pud_offset(p4d, addr);
-
- if (pud_none_or_clear_bad(pud)) {
- pmd = pmd_alloc_one(NULL, addr);
- if (!pmd) {
- kvm_err("Cannot allocate Hyp pmd\n");
- return -ENOMEM;
- }
- kvm_pud_populate(pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- next = pud_addr_end(addr, end);
- ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
- if (ret)
- return ret;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
+static int __create_hyp_mappings(unsigned long start, unsigned long size,
+ unsigned long phys, enum kvm_pgtable_prot prot)
{
- p4d_t *p4d;
- pud_t *pud;
- unsigned long addr, next;
- int ret;
-
- addr = start;
- do {
- p4d = p4d_offset(pgd, addr);
-
- if (p4d_none(*p4d)) {
- pud = pud_alloc_one(NULL, addr);
- if (!pud) {
- kvm_err("Cannot allocate Hyp pud\n");
- return -ENOMEM;
- }
- kvm_p4d_populate(p4d, pud);
- get_page(virt_to_page(p4d));
- }
-
- next = p4d_addr_end(addr, end);
- ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
- if (ret)
- return ret;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
- unsigned long start, unsigned long end,
- unsigned long pfn, pgprot_t prot)
-{
- pgd_t *pgd;
- p4d_t *p4d;
- unsigned long addr, next;
- int err = 0;
+ int err;
mutex_lock(&kvm_hyp_pgd_mutex);
- addr = start & PAGE_MASK;
- end = PAGE_ALIGN(end);
- do {
- pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-
- if (pgd_none(*pgd)) {
- p4d = p4d_alloc_one(NULL, addr);
- if (!p4d) {
- kvm_err("Cannot allocate Hyp p4d\n");
- err = -ENOMEM;
- goto out;
- }
- kvm_pgd_populate(pgd, p4d);
- get_page(virt_to_page(pgd));
- }
-
- next = pgd_addr_end(addr, end);
- err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
- if (err)
- goto out;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-out:
+ err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
mutex_unlock(&kvm_hyp_pgd_mutex);
+
return err;
}
@@ -845,7 +220,7 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
* in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
* physical pages.
*/
-int create_hyp_mappings(void *from, void *to, pgprot_t prot)
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
phys_addr_t phys_addr;
unsigned long virt_addr;
@@ -862,9 +237,7 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
int err;
phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
- err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
- virt_addr, virt_addr + PAGE_SIZE,
- __phys_to_pfn(phys_addr),
+ err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
prot);
if (err)
return err;
@@ -874,9 +247,9 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
}
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
- unsigned long *haddr, pgprot_t prot)
+ unsigned long *haddr,
+ enum kvm_pgtable_prot prot)
{
- pgd_t *pgd = hyp_pgd;
unsigned long base;
int ret = 0;
@@ -908,17 +281,11 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
if (ret)
goto out;
- if (__kvm_cpu_uses_extended_idmap())
- pgd = boot_hyp_pgd;
-
- ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
- base, base + size,
- __phys_to_pfn(phys_addr), prot);
+ ret = __create_hyp_mappings(base, size, phys_addr, prot);
if (ret)
goto out;
*haddr = base + offset_in_page(phys_addr);
-
out:
return ret;
}
@@ -989,47 +356,48 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
* @kvm: The pointer to the KVM structure
* @mmu: The pointer to the s2 MMU structure
*
- * Allocates only the stage-2 HW PGD level table(s) of size defined by
- * stage2_pgd_size(mmu->kvm).
- *
+ * Allocates only the stage-2 HW PGD level table(s).
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
*/
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
- phys_addr_t pgd_phys;
- pgd_t *pgd;
- int cpu;
+ int cpu, err;
+ struct kvm_pgtable *pgt;
- if (mmu->pgd != NULL) {
+ if (mmu->pgt != NULL) {
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
- /* Allocate the HW PGD, making sure that each page gets its own refcount */
- pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
- if (!pgd)
+ pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
+ if (!pgt)
return -ENOMEM;
- pgd_phys = virt_to_phys(pgd);
- if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
- return -EINVAL;
+ err = kvm_pgtable_stage2_init(pgt, kvm);
+ if (err)
+ goto out_free_pgtable;
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
if (!mmu->last_vcpu_ran) {
- free_pages_exact(pgd, stage2_pgd_size(kvm));
- return -ENOMEM;
+ err = -ENOMEM;
+ goto out_destroy_pgtable;
}
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
mmu->kvm = kvm;
- mmu->pgd = pgd;
- mmu->pgd_phys = pgd_phys;
+ mmu->pgt = pgt;
+ mmu->pgd_phys = __pa(pgt->pgd);
mmu->vmid.vmid_gen = 0;
-
return 0;
+
+out_destroy_pgtable:
+ kvm_pgtable_stage2_destroy(pgt);
+out_free_pgtable:
+ kfree(pgt);
+ return err;
}
static void stage2_unmap_memslot(struct kvm *kvm,
@@ -1102,363 +470,21 @@ void stage2_unmap_vm(struct kvm *kvm)
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
struct kvm *kvm = mmu->kvm;
- void *pgd = NULL;
+ struct kvm_pgtable *pgt = NULL;
spin_lock(&kvm->mmu_lock);
- if (mmu->pgd) {
- unmap_stage2_range(mmu, 0, kvm_phys_size(kvm));
- pgd = READ_ONCE(mmu->pgd);
- mmu->pgd = NULL;
- }
- spin_unlock(&kvm->mmu_lock);
-
- /* Free the HW pgd, one page at a time */
- if (pgd) {
- free_pages_exact(pgd, stage2_pgd_size(kvm));
+ pgt = mmu->pgt;
+ if (pgt) {
+ mmu->pgd_phys = 0;
+ mmu->pgt = NULL;
free_percpu(mmu->last_vcpu_ran);
}
-}
-
-static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- p4d_t *p4d;
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- if (stage2_pgd_none(kvm, *pgd)) {
- if (!cache)
- return NULL;
- p4d = kvm_mmu_memory_cache_alloc(cache);
- stage2_pgd_populate(kvm, pgd, p4d);
- get_page(virt_to_page(pgd));
- }
-
- return stage2_p4d_offset(kvm, pgd, addr);
-}
-
-static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- pud_t *pud;
-
- p4d = stage2_get_p4d(mmu, cache, addr);
- if (stage2_p4d_none(kvm, *p4d)) {
- if (!cache)
- return NULL;
- pud = kvm_mmu_memory_cache_alloc(cache);
- stage2_p4d_populate(kvm, p4d, pud);
- get_page(virt_to_page(p4d));
- }
-
- return stage2_pud_offset(kvm, p4d, addr);
-}
-
-static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- pmd_t *pmd;
-
- pud = stage2_get_pud(mmu, cache, addr);
- if (!pud || stage2_pud_huge(kvm, *pud))
- return NULL;
-
- if (stage2_pud_none(kvm, *pud)) {
- if (!cache)
- return NULL;
- pmd = kvm_mmu_memory_cache_alloc(cache);
- stage2_pud_populate(kvm, pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- return stage2_pmd_offset(kvm, pud, addr);
-}
-
-static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pmd_t *new_pmd)
-{
- pmd_t *pmd, old_pmd;
-
-retry:
- pmd = stage2_get_pmd(mmu, cache, addr);
- VM_BUG_ON(!pmd);
-
- old_pmd = *pmd;
- /*
- * Multiple vcpus faulting on the same PMD entry, can
- * lead to them sequentially updating the PMD with the
- * same value. Following the break-before-make
- * (pmd_clear() followed by tlb_flush()) process can
- * hinder forward progress due to refaults generated
- * on missing translations.
- *
- * Skip updating the page table if the entry is
- * unchanged.
- */
- if (pmd_val(old_pmd) == pmd_val(*new_pmd))
- return 0;
-
- if (pmd_present(old_pmd)) {
- /*
- * If we already have PTE level mapping for this block,
- * we must unmap it to avoid inconsistent TLB state and
- * leaking the table page. We could end up in this situation
- * if the memory slot was marked for dirty logging and was
- * reverted, leaving PTE level mappings for the pages accessed
- * during the period. So, unmap the PTE level mapping for this
- * block and retry, as we could have released the upper level
- * table in the process.
- *
- * Normal THP split/merge follows mmu_notifier callbacks and do
- * get handled accordingly.
- */
- if (!pmd_thp_or_huge(old_pmd)) {
- unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
- goto retry;
- }
- /*
- * Mapping in huge pages should only happen through a
- * fault. If a page is merged into a transparent huge
- * page, the individual subpages of that huge page
- * should be unmapped through MMU notifiers before we
- * get here.
- *
- * Merging of CompoundPages is not supported; they
- * should become splitting first, unmapped, merged,
- * and mapped back in on-demand.
- */
- WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
- } else {
- get_page(virt_to_page(pmd));
- }
-
- kvm_set_pmd(pmd, *new_pmd);
- return 0;
-}
-
-static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pud_t *new_pudp)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pudp, old_pud;
-
-retry:
- pudp = stage2_get_pud(mmu, cache, addr);
- VM_BUG_ON(!pudp);
-
- old_pud = *pudp;
-
- /*
- * A large number of vcpus faulting on the same stage 2 entry,
- * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
- * Skip updating the page tables if there is no change.
- */
- if (pud_val(old_pud) == pud_val(*new_pudp))
- return 0;
-
- if (stage2_pud_present(kvm, old_pud)) {
- /*
- * If we already have table level mapping for this block, unmap
- * the range for this block and retry.
- */
- if (!stage2_pud_huge(kvm, old_pud)) {
- unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
- goto retry;
- }
-
- WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
- stage2_pud_clear(kvm, pudp);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- } else {
- get_page(virt_to_page(pudp));
- }
-
- kvm_set_pud(pudp, *new_pudp);
- return 0;
-}
-
-/*
- * stage2_get_leaf_entry - walk the stage2 VM page tables and return
- * true if a valid and present leaf-entry is found. A pointer to the
- * leaf-entry is returned in the appropriate level variable - pudpp,
- * pmdpp, ptepp.
- */
-static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
- pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- *pudpp = NULL;
- *pmdpp = NULL;
- *ptepp = NULL;
-
- pudp = stage2_get_pud(mmu, NULL, addr);
- if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
- return false;
-
- if (stage2_pud_huge(kvm, *pudp)) {
- *pudpp = pudp;
- return true;
- }
-
- pmdp = stage2_pmd_offset(kvm, pudp, addr);
- if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
- return false;
-
- if (pmd_thp_or_huge(*pmdp)) {
- *pmdpp = pmdp;
- return true;
- }
-
- ptep = pte_offset_kernel(pmdp, addr);
- if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
- return false;
-
- *ptepp = ptep;
- return true;
-}
-
-static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
-{
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
- bool found;
-
- found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
- if (!found)
- return false;
-
- if (pudp)
- return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
- else if (pmdp)
- return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
- else
- return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
-}
-
-static int stage2_set_pte(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pte_t *new_pte,
- unsigned long flags)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte, old_pte;
- bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
- bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
-
- VM_BUG_ON(logging_active && !cache);
-
- /* Create stage-2 page table mapping - Levels 0 and 1 */
- pud = stage2_get_pud(mmu, cache, addr);
- if (!pud) {
- /*
- * Ignore calls from kvm_set_spte_hva for unallocated
- * address ranges.
- */
- return 0;
- }
-
- /*
- * While dirty page logging - dissolve huge PUD, then continue
- * on to allocate page.
- */
- if (logging_active)
- stage2_dissolve_pud(mmu, addr, pud);
-
- if (stage2_pud_none(kvm, *pud)) {
- if (!cache)
- return 0; /* ignore calls from kvm_set_spte_hva */
- pmd = kvm_mmu_memory_cache_alloc(cache);
- stage2_pud_populate(kvm, pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
- if (!pmd) {
- /*
- * Ignore calls from kvm_set_spte_hva for unallocated
- * address ranges.
- */
- return 0;
- }
-
- /*
- * While dirty page logging - dissolve huge PMD, then continue on to
- * allocate page.
- */
- if (logging_active)
- stage2_dissolve_pmd(mmu, addr, pmd);
-
- /* Create stage-2 page mappings - Level 2 */
- if (pmd_none(*pmd)) {
- if (!cache)
- return 0; /* ignore calls from kvm_set_spte_hva */
- pte = kvm_mmu_memory_cache_alloc(cache);
- kvm_pmd_populate(pmd, pte);
- get_page(virt_to_page(pmd));
- }
-
- pte = pte_offset_kernel(pmd, addr);
-
- if (iomap && pte_present(*pte))
- return -EFAULT;
-
- /* Create 2nd stage page table mapping - Level 3 */
- old_pte = *pte;
- if (pte_present(old_pte)) {
- /* Skip page table update if there is no change */
- if (pte_val(old_pte) == pte_val(*new_pte))
- return 0;
-
- kvm_set_pte(pte, __pte(0));
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
- } else {
- get_page(virt_to_page(pte));
- }
-
- kvm_set_pte(pte, *new_pte);
- return 0;
-}
+ spin_unlock(&kvm->mmu_lock);
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
- if (pte_young(*pte)) {
- *pte = pte_mkold(*pte);
- return 1;
+ if (pgt) {
+ kvm_pgtable_stage2_destroy(pgt);
+ kfree(pgt);
}
- return 0;
-}
-#else
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
- return __ptep_test_and_clear_young(pte);
-}
-#endif
-
-static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
-{
- return stage2_ptep_test_and_clear_young((pte_t *)pmd);
-}
-
-static int stage2_pudp_test_and_clear_young(pud_t *pud)
-{
- return stage2_ptep_test_and_clear_young((pte_t *)pud);
}
/**
@@ -1468,169 +494,52 @@ static int stage2_pudp_test_and_clear_young(pud_t *pud)
* @guest_ipa: The IPA at which to insert the mapping
* @pa: The physical address of the device
* @size: The size of the mapping
+ * @writable: Whether or not to create a writable mapping
*/
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable)
{
- phys_addr_t addr, end;
+ phys_addr_t addr;
int ret = 0;
- unsigned long pfn;
struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
+ struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
+ KVM_PGTABLE_PROT_R |
+ (writable ? KVM_PGTABLE_PROT_W : 0);
- end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
- pfn = __phys_to_pfn(pa);
-
- for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
- pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
-
- if (writable)
- pte = kvm_s2pte_mkwrite(pte);
+ size += offset_in_page(guest_ipa);
+ guest_ipa &= PAGE_MASK;
+ for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
ret = kvm_mmu_topup_memory_cache(&cache,
kvm_mmu_cache_min_pages(kvm));
if (ret)
- goto out;
+ break;
+
spin_lock(&kvm->mmu_lock);
- ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte,
- KVM_S2PTE_FLAG_IS_IOMAP);
+ ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
+ &cache);
spin_unlock(&kvm->mmu_lock);
if (ret)
- goto out;
+ break;
- pfn++;
+ pa += PAGE_SIZE;
}
-out:
kvm_mmu_free_memory_cache(&cache);
return ret;
}
/**
- * stage2_wp_ptes - write protect PMD range
- * @pmd: pointer to pmd entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte;
-
- pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- if (!kvm_s2pte_readonly(pte))
- kvm_set_s2pte_readonly(pte);
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-/**
- * stage2_wp_pmds - write protect PUD range
- * kvm: kvm instance for the VM
- * @pud: pointer to pud entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd;
- phys_addr_t next;
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
-
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd)) {
- if (!kvm_s2pmd_readonly(pmd))
- kvm_set_s2pmd_readonly(pmd);
- } else {
- stage2_wp_ptes(pmd, addr, next);
- }
- }
- } while (pmd++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_puds - write protect P4D range
- * @p4d: pointer to p4d entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- phys_addr_t next;
-
- pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud)) {
- if (!kvm_s2pud_readonly(pud))
- kvm_set_s2pud_readonly(pud);
- } else {
- stage2_wp_pmds(mmu, pud, addr, next);
- }
- }
- } while (pud++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_p4ds - write protect PGD range
- * @pgd: pointer to pgd entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- phys_addr_t next;
-
- p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- stage2_wp_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-}
-
-/**
* stage2_wp_range() - write protect stage2 memory region range
- * @kvm: The KVM pointer
+ * @mmu: The KVM stage-2 MMU pointer
* @addr: Start address of range
* @end: End address of range
*/
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- phys_addr_t next;
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- /*
- * Release kvm_mmu_lock periodically if the memory region is
- * large. Otherwise, we may see kernel panics with
- * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
- * CONFIG_LOCKDEP. Additionally, holding the lock too long
- * will also starve other vCPUs. We have to also make sure
- * that the page tables are not freed while we released
- * the lock.
- */
- cond_resched_lock(&kvm->mmu_lock);
- if (!READ_ONCE(mmu->pgd))
- break;
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (stage2_pgd_present(kvm, *pgd))
- stage2_wp_p4ds(mmu, pgd, addr, next);
- } while (pgd++, addr = next, addr != end);
+ stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
}
/**
@@ -1833,20 +742,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
{
- int ret;
+ int ret = 0;
bool write_fault, writable, force_pte = false;
- bool exec_fault, needs_exec;
+ bool exec_fault;
+ bool device = false;
unsigned long mmu_seq;
- gfn_t gfn = fault_ipa >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
short vma_shift;
+ gfn_t gfn;
kvm_pfn_t pfn;
- pgprot_t mem_type = PAGE_S2;
bool logging_active = memslot_is_logging(memslot);
- unsigned long vma_pagesize, flags = 0;
- struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
+ unsigned long vma_pagesize;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+ struct kvm_pgtable *pgt;
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
@@ -1871,31 +781,41 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else
vma_shift = PAGE_SHIFT;
- vma_pagesize = 1ULL << vma_shift;
if (logging_active ||
- (vma->vm_flags & VM_PFNMAP) ||
- !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+ (vma->vm_flags & VM_PFNMAP)) {
force_pte = true;
- vma_pagesize = PAGE_SIZE;
vma_shift = PAGE_SHIFT;
}
- /*
- * The stage2 has a minimum of 2 level table (For arm64 see
- * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
- * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
- * As for PUD huge maps, we must make sure that we have at least
- * 3 levels, i.e, PMD is not folded.
- */
- if (vma_pagesize == PMD_SIZE ||
- (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
- gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
+ if (vma_shift == PUD_SHIFT &&
+ !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+ vma_shift = PMD_SHIFT;
+
+ if (vma_shift == PMD_SHIFT &&
+ !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+ force_pte = true;
+ vma_shift = PAGE_SHIFT;
+ }
+
+ vma_pagesize = 1UL << vma_shift;
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
+ fault_ipa &= ~(vma_pagesize - 1);
+
+ gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
- /* We need minimum second+third level pages */
- ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
- if (ret)
- return ret;
+ /*
+ * Permission faults just need to update the existing leaf entry,
+ * and so normally don't require allocations from the memcache. The
+ * only exception to this is when dirty logging is enabled at runtime
+ * and a write fault needs to collapse a block entry into a table.
+ */
+ if (fault_status != FSC_PERM || (logging_active && write_fault)) {
+ ret = kvm_mmu_topup_memory_cache(memcache,
+ kvm_mmu_cache_min_pages(kvm));
+ if (ret)
+ return ret;
+ }
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
@@ -1918,28 +838,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
if (kvm_is_device_pfn(pfn)) {
- mem_type = PAGE_S2_DEVICE;
- flags |= KVM_S2PTE_FLAG_IS_IOMAP;
- } else if (logging_active) {
- /*
- * Faults on pages in a memslot with logging enabled
- * should not be mapped with huge pages (it introduces churn
- * and performance degradation), so force a pte mapping.
- */
- flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
-
+ device = true;
+ } else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
* fault.
*/
- if (!write_fault)
- writable = false;
+ writable = false;
}
- if (exec_fault && is_iomap(flags))
+ if (exec_fault && device)
return -ENOEXEC;
spin_lock(&kvm->mmu_lock);
+ pgt = vcpu->arch.hw_mmu->pgt;
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
@@ -1950,67 +862,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (vma_pagesize == PAGE_SIZE && !force_pte)
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
&pfn, &fault_ipa);
- if (writable)
+ if (writable) {
+ prot |= KVM_PGTABLE_PROT_W;
kvm_set_pfn_dirty(pfn);
+ mark_page_dirty(kvm, gfn);
+ }
- if (fault_status != FSC_PERM && !is_iomap(flags))
+ if (fault_status != FSC_PERM && !device)
clean_dcache_guest_page(pfn, vma_pagesize);
- if (exec_fault)
+ if (exec_fault) {
+ prot |= KVM_PGTABLE_PROT_X;
invalidate_icache_guest_page(pfn, vma_pagesize);
+ }
- /*
- * If we took an execution fault we have made the
- * icache/dcache coherent above and should now let the s2
- * mapping be executable.
- *
- * Write faults (!exec_fault && FSC_PERM) are orthogonal to
- * execute permissions, and we preserve whatever we have.
- */
- needs_exec = exec_fault ||
- (fault_status == FSC_PERM &&
- stage2_is_exec(mmu, fault_ipa, vma_pagesize));
-
- /*
- * If PUD_SIZE == PMD_SIZE, there is no real PUD level, and
- * all we have is a 2-level page table. Trying to map a PUD in
- * this case would be fatally wrong.
- */
- if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) {
- pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
-
- new_pud = kvm_pud_mkhuge(new_pud);
- if (writable)
- new_pud = kvm_s2pud_mkwrite(new_pud);
-
- if (needs_exec)
- new_pud = kvm_s2pud_mkexec(new_pud);
-
- ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
- } else if (vma_pagesize == PMD_SIZE) {
- pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
-
- new_pmd = kvm_pmd_mkhuge(new_pmd);
-
- if (writable)
- new_pmd = kvm_s2pmd_mkwrite(new_pmd);
-
- if (needs_exec)
- new_pmd = kvm_s2pmd_mkexec(new_pmd);
+ if (device)
+ prot |= KVM_PGTABLE_PROT_DEVICE;
+ else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+ prot |= KVM_PGTABLE_PROT_X;
- ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
+ if (fault_status == FSC_PERM && !(logging_active && writable)) {
+ ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
} else {
- pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
-
- if (writable) {
- new_pte = kvm_s2pte_mkwrite(new_pte);
- mark_page_dirty(kvm, gfn);
- }
-
- if (needs_exec)
- new_pte = kvm_s2pte_mkexec(new_pte);
-
- ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
+ ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
+ __pfn_to_phys(pfn), prot,
+ memcache);
}
out_unlock:
@@ -2020,46 +896,23 @@ out_unlock:
return ret;
}
-/*
- * Resolve the access fault by making the page young again.
- * Note that because the faulting entry is guaranteed not to be
- * cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
- */
+/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- kvm_pfn_t pfn;
- bool pfn_valid = false;
+ pte_t pte;
+ kvm_pte_t kpte;
+ struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
spin_lock(&vcpu->kvm->mmu_lock);
-
- if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
- goto out;
-
- if (pud) { /* HugeTLB */
- *pud = kvm_s2pud_mkyoung(*pud);
- pfn = kvm_pud_pfn(*pud);
- pfn_valid = true;
- } else if (pmd) { /* THP, HugeTLB */
- *pmd = pmd_mkyoung(*pmd);
- pfn = pmd_pfn(*pmd);
- pfn_valid = true;
- } else {
- *pte = pte_mkyoung(*pte); /* Just a page... */
- pfn = pte_pfn(*pte);
- pfn_valid = true;
- }
-
-out:
+ mmu = vcpu->arch.hw_mmu;
+ kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
spin_unlock(&vcpu->kvm->mmu_lock);
- if (pfn_valid)
- kvm_set_pfn_accessed(pfn);
+
+ pte = __pte(kpte);
+ if (pte_valid(pte))
+ kvm_set_pfn_accessed(pte_pfn(pte));
}
/**
@@ -2230,7 +1083,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat
int kvm_unmap_hva_range(struct kvm *kvm,
unsigned long start, unsigned long end, unsigned flags)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_unmap_hva_range(start, end);
@@ -2240,28 +1093,27 @@ int kvm_unmap_hva_range(struct kvm *kvm,
static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pte_t *pte = (pte_t *)data;
+ kvm_pfn_t *pfn = (kvm_pfn_t *)data;
WARN_ON(size != PAGE_SIZE);
+
/*
- * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
- * flag clear because MMU notifiers will have unmapped a huge PMD before
- * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
- * therefore stage2_set_pte() never needs to clear out a huge PMD
- * through this calling path.
+ * The MMU notifiers will have unmapped a huge PMD before calling
+ * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+ * therefore we never need to clear out a huge PMD through this
+ * calling path and a memcache is not required.
*/
- stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0);
+ kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
+ __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
return 0;
}
-
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
unsigned long end = hva + PAGE_SIZE;
kvm_pfn_t pfn = pte_pfn(pte);
- pte_t stage2_pte;
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_set_spte_hva(hva);
@@ -2271,51 +1123,30 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
- stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
- handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
-
+ handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
return 0;
}
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ pte_t pte;
+ kvm_pte_t kpte;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
- return 0;
-
- if (pud)
- return stage2_pudp_test_and_clear_young(pud);
- else if (pmd)
- return stage2_pmdp_test_and_clear_young(pmd);
- else
- return stage2_ptep_test_and_clear_young(pte);
+ kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+ pte = __pte(kpte);
+ return pte_valid(pte) && pte_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
- return 0;
-
- if (pud)
- return kvm_s2pud_young(*pud);
- else if (pmd)
- return pmd_young(*pmd);
- else
- return pte_young(*pte);
+ return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
}
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_age_hva(start, end);
return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
@@ -2323,24 +1154,16 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_test_age_hva(hva);
return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
kvm_test_age_hva_handler, NULL);
}
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
- kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
-}
-
phys_addr_t kvm_mmu_get_httbr(void)
{
- if (__kvm_cpu_uses_extended_idmap())
- return virt_to_phys(merged_hyp_pgd);
- else
- return virt_to_phys(hyp_pgd);
+ return __pa(hyp_pgtable->pgd);
}
phys_addr_t kvm_get_idmap_vector(void)
@@ -2348,15 +1171,11 @@ phys_addr_t kvm_get_idmap_vector(void)
return hyp_idmap_vector;
}
-static int kvm_map_idmap_text(pgd_t *pgd)
+static int kvm_map_idmap_text(void)
{
- int err;
-
- /* Create the idmap in the boot page tables */
- err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
- hyp_idmap_start, hyp_idmap_end,
- __phys_to_pfn(hyp_idmap_start),
- PAGE_HYP_EXEC);
+ unsigned long size = hyp_idmap_end - hyp_idmap_start;
+ int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
+ PAGE_HYP_EXEC);
if (err)
kvm_err("Failed to idmap %lx-%lx\n",
hyp_idmap_start, hyp_idmap_end);
@@ -2367,6 +1186,7 @@ static int kvm_map_idmap_text(pgd_t *pgd)
int kvm_mmu_init(void)
{
int err;
+ u32 hyp_va_bits;
hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -2380,6 +1200,8 @@ int kvm_mmu_init(void)
*/
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
+ hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+ kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
kvm_debug("HYP VA range: %lx:%lx\n",
kern_hyp_va(PAGE_OFFSET),
@@ -2397,43 +1219,30 @@ int kvm_mmu_init(void)
goto out;
}
- hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
- if (!hyp_pgd) {
- kvm_err("Hyp mode PGD not allocated\n");
+ hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
+ if (!hyp_pgtable) {
+ kvm_err("Hyp mode page-table not allocated\n");
err = -ENOMEM;
goto out;
}
- if (__kvm_cpu_uses_extended_idmap()) {
- boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- hyp_pgd_order);
- if (!boot_hyp_pgd) {
- kvm_err("Hyp boot PGD not allocated\n");
- err = -ENOMEM;
- goto out;
- }
-
- err = kvm_map_idmap_text(boot_hyp_pgd);
- if (err)
- goto out;
+ err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
+ if (err)
+ goto out_free_pgtable;
- merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- if (!merged_hyp_pgd) {
- kvm_err("Failed to allocate extra HYP pgd\n");
- goto out;
- }
- __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
- hyp_idmap_start);
- } else {
- err = kvm_map_idmap_text(hyp_pgd);
- if (err)
- goto out;
- }
+ err = kvm_map_idmap_text();
+ if (err)
+ goto out_destroy_pgtable;
io_map_base = hyp_idmap_start;
return 0;
+
+out_destroy_pgtable:
+ kvm_pgtable_hyp_destroy(hyp_pgtable);
+out_free_pgtable:
+ kfree(hyp_pgtable);
+ hyp_pgtable = NULL;
out:
- free_hyp_pgds();
return err;
}
@@ -2537,7 +1346,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
if (ret)
unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
- else
+ else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
stage2_flush_memslot(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
out:
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index f0d0312c0a55..ee13c5eecd3d 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -20,6 +20,21 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
+static u32 kvm_pmu_event_mask(struct kvm *kvm)
+{
+ switch (kvm->arch.pmuver) {
+ case 1: /* ARMv8.0 */
+ return GENMASK(9, 0);
+ case 4: /* ARMv8.1 */
+ case 5: /* ARMv8.4 */
+ case 6: /* ARMv8.5 */
+ return GENMASK(15, 0);
+ default: /* Shouldn't be here, just for sanity */
+ WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver);
+ return 0;
+ }
+}
+
/**
* kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
* @vcpu: The vcpu pointer
@@ -100,7 +115,7 @@ static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
return false;
reg = PMEVTYPER0_EL0 + select_idx;
- eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
+ eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm);
return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
}
@@ -495,7 +510,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
/* PMSWINC only applies to ... SW_INC! */
type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
- type &= ARMV8_PMU_EVTYPE_EVENT;
+ type &= kvm_pmu_event_mask(vcpu->kvm);
if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
continue;
@@ -578,11 +593,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
data = __vcpu_sys_reg(vcpu, reg);
kvm_pmu_stop_counter(vcpu, pmc);
- eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
+ if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+ eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
+ else
+ eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
- /* Software increment event does't need to be backed by a perf event */
- if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
- pmc->idx != ARMV8_PMU_CYCLE_IDX)
+ /* Software increment event doesn't need to be backed by a perf event */
+ if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR)
+ return;
+
+ /*
+ * If we have a filter in place and that the event isn't allowed, do
+ * not install a perf event either.
+ */
+ if (vcpu->kvm->arch.pmu_filter &&
+ !test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
return;
memset(&attr, 0, sizeof(struct perf_event_attr));
@@ -594,8 +619,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
attr.exclude_hv = 1; /* Don't count EL2 events */
attr.exclude_host = 1; /* Don't count host events */
- attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
- ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
+ attr.config = eventsel;
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
@@ -679,17 +703,95 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
u64 select_idx)
{
- u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
+ u64 reg, mask;
+
+ mask = ARMV8_PMU_EVTYPE_MASK;
+ mask &= ~ARMV8_PMU_EVTYPE_EVENT;
+ mask |= kvm_pmu_event_mask(vcpu->kvm);
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
- __vcpu_sys_reg(vcpu, reg) = event_type;
+ __vcpu_sys_reg(vcpu, reg) = data & mask;
kvm_pmu_update_pmc_chained(vcpu, select_idx);
kvm_pmu_create_perf_event(vcpu, select_idx);
}
+static int kvm_pmu_probe_pmuver(void)
+{
+ struct perf_event_attr attr = { };
+ struct perf_event *event;
+ struct arm_pmu *pmu;
+ int pmuver = 0xf;
+
+ /*
+ * Create a dummy event that only counts user cycles. As we'll never
+ * leave this function with the event being live, it will never
+ * count anything. But it allows us to probe some of the PMU
+ * details. Yes, this is terrible.
+ */
+ attr.type = PERF_TYPE_RAW;
+ attr.size = sizeof(attr);
+ attr.pinned = 1;
+ attr.disabled = 0;
+ attr.exclude_user = 0;
+ attr.exclude_kernel = 1;
+ attr.exclude_hv = 1;
+ attr.exclude_host = 1;
+ attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
+ attr.sample_period = GENMASK(63, 0);
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ kvm_pmu_perf_overflow, &attr);
+
+ if (IS_ERR(event)) {
+ pr_err_once("kvm: pmu event creation failed %ld\n",
+ PTR_ERR(event));
+ return 0xf;
+ }
+
+ if (event->pmu) {
+ pmu = to_arm_pmu(event->pmu);
+ if (pmu->pmuver)
+ pmuver = pmu->pmuver;
+ }
+
+ perf_event_disable(event);
+ perf_event_release_kernel(event);
+
+ return pmuver;
+}
+
+u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
+{
+ unsigned long *bmap = vcpu->kvm->arch.pmu_filter;
+ u64 val, mask = 0;
+ int base, i;
+
+ if (!pmceid1) {
+ val = read_sysreg(pmceid0_el0);
+ base = 0;
+ } else {
+ val = read_sysreg(pmceid1_el0);
+ base = 32;
+ }
+
+ if (!bmap)
+ return val;
+
+ for (i = 0; i < 32; i += 8) {
+ u64 byte;
+
+ byte = bitmap_get_value8(bmap, base + i);
+ mask |= byte << i;
+ byte = bitmap_get_value8(bmap, 0x4000 + base + i);
+ mask |= byte << (32 + i);
+ }
+
+ return val & mask;
+}
+
bool kvm_arm_support_pmu_v3(void)
{
/*
@@ -735,15 +837,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
{
- if (!kvm_arm_support_pmu_v3())
- return -ENODEV;
-
- if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
- return -ENXIO;
-
- if (vcpu->arch.pmu.created)
- return -EBUSY;
-
if (irqchip_in_kernel(vcpu->kvm)) {
int ret;
@@ -796,6 +889,19 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
+ if (!kvm_arm_support_pmu_v3() ||
+ !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ return -ENODEV;
+
+ if (vcpu->arch.pmu.created)
+ return -EBUSY;
+
+ if (!vcpu->kvm->arch.pmuver)
+ vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver();
+
+ if (vcpu->kvm->arch.pmuver == 0xf)
+ return -ENODEV;
+
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ: {
int __user *uaddr = (int __user *)(long)attr->addr;
@@ -804,9 +910,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
- if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
- return -ENODEV;
-
if (get_user(irq, uaddr))
return -EFAULT;
@@ -824,6 +927,53 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
vcpu->arch.pmu.irq_num = irq;
return 0;
}
+ case KVM_ARM_VCPU_PMU_V3_FILTER: {
+ struct kvm_pmu_event_filter __user *uaddr;
+ struct kvm_pmu_event_filter filter;
+ int nr_events;
+
+ nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1;
+
+ uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr;
+
+ if (copy_from_user(&filter, uaddr, sizeof(filter)))
+ return -EFAULT;
+
+ if (((u32)filter.base_event + filter.nevents) > nr_events ||
+ (filter.action != KVM_PMU_EVENT_ALLOW &&
+ filter.action != KVM_PMU_EVENT_DENY))
+ return -EINVAL;
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ if (!vcpu->kvm->arch.pmu_filter) {
+ vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL);
+ if (!vcpu->kvm->arch.pmu_filter) {
+ mutex_unlock(&vcpu->kvm->lock);
+ return -ENOMEM;
+ }
+
+ /*
+ * The default depends on the first applied filter.
+ * If it allows events, the default is to deny.
+ * Conversely, if the first filter denies a set of
+ * events, the default is to allow.
+ */
+ if (filter.action == KVM_PMU_EVENT_ALLOW)
+ bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events);
+ else
+ bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events);
+ }
+
+ if (filter.action == KVM_PMU_EVENT_ALLOW)
+ bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
+ else
+ bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
+
+ mutex_unlock(&vcpu->kvm->lock);
+
+ return 0;
+ }
case KVM_ARM_VCPU_PMU_V3_INIT:
return kvm_arm_pmu_v3_init(vcpu);
}
@@ -860,6 +1010,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ:
case KVM_ARM_VCPU_PMU_V3_INIT:
+ case KVM_ARM_VCPU_PMU_V3_FILTER:
if (kvm_arm_support_pmu_v3() &&
test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
return 0;
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 3c224162b3dd..faf32a44ba04 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -31,9 +31,9 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr)
*/
void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
{
- struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
+ struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
- if (!kvm_pmu_switch_needed(attr))
+ if (!ctx || !kvm_pmu_switch_needed(attr))
return;
if (!attr->exclude_host)
@@ -47,7 +47,10 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
*/
void kvm_clr_pmu_events(u32 clr)
{
- struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
+ struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
+
+ if (!ctx)
+ return;
ctx->pmu_events.events_host &= ~clr;
ctx->pmu_events.events_guest &= ~clr;
@@ -173,7 +176,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
return;
preempt_disable();
- host = this_cpu_ptr(&kvm_host_data);
+ host = this_cpu_ptr_hyp_sym(kvm_host_data);
events_guest = host->pmu_events.events_guest;
events_host = host->pmu_events.events_host;
@@ -193,7 +196,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
if (!has_vhe())
return;
- host = this_cpu_ptr(&kvm_host_data);
+ host = this_cpu_ptr_hyp_sym(kvm_host_data);
events_guest = host->pmu_events.events_guest;
events_host = host->pmu_events.events_host;
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index 83415e96b589..db4056ecccfd 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -425,27 +425,30 @@ static int get_kernel_wa_level(u64 regid)
{
switch (regid) {
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
- switch (kvm_arm_harden_branch_predictor()) {
- case KVM_BP_HARDEN_UNKNOWN:
+ switch (arm64_get_spectre_v2_state()) {
+ case SPECTRE_VULNERABLE:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
- case KVM_BP_HARDEN_WA_NEEDED:
+ case SPECTRE_MITIGATED:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
- case KVM_BP_HARDEN_NOT_REQUIRED:
+ case SPECTRE_UNAFFECTED:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
}
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
- switch (kvm_arm_have_ssbd()) {
- case KVM_SSBD_FORCE_DISABLE:
- return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
- case KVM_SSBD_KERNEL:
- return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL;
- case KVM_SSBD_FORCE_ENABLE:
- case KVM_SSBD_MITIGATED:
+ switch (arm64_get_spectre_v4_state()) {
+ case SPECTRE_MITIGATED:
+ /*
+ * As for the hypercall discovery, we pretend we
+ * don't have any FW mitigation if SSBS is there at
+ * all times.
+ */
+ if (cpus_have_final_cap(ARM64_SSBS))
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
+ fallthrough;
+ case SPECTRE_UNAFFECTED:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
- case KVM_SSBD_UNKNOWN:
- default:
- return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN;
+ case SPECTRE_VULNERABLE:
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
}
}
@@ -462,14 +465,8 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
val = kvm_psci_version(vcpu, vcpu->kvm);
break;
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
- val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
- break;
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
-
- if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
- kvm_arm_get_vcpu_workaround_2_flag(vcpu))
- val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED;
break;
default:
return -ENOENT;
@@ -527,34 +524,35 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
return -EINVAL;
- wa_level = val & KVM_REG_FEATURE_LEVEL_MASK;
-
- if (get_kernel_wa_level(reg->id) < wa_level)
- return -EINVAL;
-
/* The enabled bit must not be set unless the level is AVAIL. */
- if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
- wa_level != val)
+ if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) &&
+ (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL)
return -EINVAL;
- /* Are we finished or do we need to check the enable bit ? */
- if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL)
- return 0;
-
/*
- * If this kernel supports the workaround to be switched on
- * or off, make sure it matches the requested setting.
+ * Map all the possible incoming states to the only two we
+ * really want to deal with.
*/
- switch (wa_level) {
- case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
- kvm_arm_set_vcpu_workaround_2_flag(vcpu,
- val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED);
+ switch (val & KVM_REG_FEATURE_LEVEL_MASK) {
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
+ wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
break;
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
- kvm_arm_set_vcpu_workaround_2_flag(vcpu, true);
+ wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
break;
+ default:
+ return -EINVAL;
}
+ /*
+ * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the
+ * other way around.
+ */
+ if (get_kernel_wa_level(reg->id) < wa_level)
+ return -EINVAL;
+
return 0;
default:
return -ENOENT;
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index ee33875c5c2a..f32490229a4c 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -319,10 +319,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
vcpu->arch.reset_state.reset = false;
}
- /* Default workaround setup is enabled (if supported) */
- if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL)
- vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
-
/* Reset timer */
ret = kvm_timer_vcpu_reset(vcpu);
out:
@@ -339,7 +335,7 @@ u32 get_kvm_ipa_limit(void)
int kvm_set_ipa_limit(void)
{
- unsigned int ipa_max, pa_max, va_max, parange, tgran_2;
+ unsigned int parange, tgran_2;
u64 mmfr0;
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
@@ -376,39 +372,11 @@ int kvm_set_ipa_limit(void)
break;
}
- pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
-
- /* Clamp the IPA limit to the PA size supported by the kernel */
- ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
- /*
- * Since our stage2 table is dependent on the stage1 page table code,
- * we must always honor the following condition:
- *
- * Number of levels in Stage1 >= Number of levels in Stage2.
- *
- * So clamp the ipa limit further down to limit the number of levels.
- * Since we can concatenate upto 16 tables at entry level, we could
- * go upto 4bits above the maximum VA addressable with the current
- * number of levels.
- */
- va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
- va_max += 4;
-
- if (va_max < ipa_max)
- ipa_max = va_max;
-
- /*
- * If the final limit is lower than the real physical address
- * limit of the CPUs, report the reason.
- */
- if (ipa_max < pa_max)
- pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
- (va_max < pa_max) ? "Virtual" : "Physical");
-
- WARN(ipa_max < KVM_PHYS_SHIFT,
- "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
- kvm_ipa_limit = ipa_max;
- kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+ kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
+ WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
+ "KVM IPA Size Limit (%d bits) is smaller than default size\n",
+ kvm_ipa_limit);
+ kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
return 0;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 077293b5115f..3c203cb8c103 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -769,10 +769,7 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (pmu_access_el0_disabled(vcpu))
return false;
- if (!(p->Op2 & 1))
- pmceid = read_sysreg(pmceid0_el0);
- else
- pmceid = read_sysreg(pmceid1_el0);
+ pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1));
p->regval = pmceid;
@@ -1131,6 +1128,9 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
if (!vcpu_has_sve(vcpu))
val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
val &= ~(0xfUL << ID_AA64PFR0_AMU_SHIFT);
+ if (!(val & (0xfUL << ID_AA64PFR0_CSV2_SHIFT)) &&
+ arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
+ val |= (1UL << ID_AA64PFR0_CSV2_SHIFT);
} else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) {
val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) |
(0xfUL << ID_AA64ISAR1_API_SHIFT) |
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index b13a9e3f99dd..f38c40a76251 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -260,34 +260,14 @@ static int vgic_debug_show(struct seq_file *s, void *v)
return 0;
}
-static const struct seq_operations vgic_debug_seq_ops = {
+static const struct seq_operations vgic_debug_sops = {
.start = vgic_debug_start,
.next = vgic_debug_next,
.stop = vgic_debug_stop,
.show = vgic_debug_show
};
-static int debug_open(struct inode *inode, struct file *file)
-{
- int ret;
- ret = seq_open(file, &vgic_debug_seq_ops);
- if (!ret) {
- struct seq_file *seq;
- /* seq_open will have modified file->private_data */
- seq = file->private_data;
- seq->private = inode->i_private;
- }
-
- return ret;
-};
-
-static const struct file_operations vgic_debug_fops = {
- .owner = THIS_MODULE,
- .open = debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release
-};
+DEFINE_SEQ_ATTRIBUTE(vgic_debug);
void vgic_debug_init(struct kvm *kvm)
{
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 76e2d85789ed..9cdf39a94a63 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -662,7 +662,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
if (likely(cpu_if->vgic_sre))
kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
- kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if));
+ kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if);
if (has_vhe())
__vgic_v3_activate_traps(cpu_if);
@@ -686,7 +686,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
vgic_v3_vmcr_sync(vcpu);
- kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if));
+ kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
if (has_vhe())
__vgic_v3_deactivate_traps(cpu_if);