Diffstat (limited to 'arch/arm64/kvm/hyp')
27 files changed, 1869 insertions, 500 deletions
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index b726332eec49..687598e41b21 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir) \ -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o reserved_mem.o +obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S index 3c635929771a..61e6f3ba7b7d 100644 --- a/arch/arm64/kvm/hyp/fpsimd.S +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -21,11 +21,7 @@ SYM_FUNC_START(__fpsimd_restore_state) SYM_FUNC_END(__fpsimd_restore_state) SYM_FUNC_START(__sve_restore_state) - __sve_load 0, x1, 2 + mov x2, #1 + sve_load 0, x1, x2, 3 ret SYM_FUNC_END(__sve_restore_state) - -SYM_FUNC_START(__sve_save_state) - sve_save 0, x1, 2 - ret -SYM_FUNC_END(__sve_save_state) diff --git a/arch/arm64/kvm/hyp/hyp-constants.c b/arch/arm64/kvm/hyp/hyp-constants.c new file mode 100644 index 000000000000..b3742a6691e8 --- /dev/null +++ b/arch/arm64/kvm/hyp/hyp-constants.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/kbuild.h> +#include <nvhe/memory.h> + +int main(void) +{ + DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page)); + return 0; +} diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 9aa9b73475c9..b6b6801d96d5 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -44,7 +44,7 @@ el1_sync: // Guest trapped into EL2 mrs x0, esr_el2 - lsr x0, x0, #ESR_ELx_EC_SHIFT + ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH cmp x0, #ESR_ELx_EC_HVC64 ccmp x0, #ESR_ELx_EC_HVC32, #4, ne b.ne el1_trap diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h new file mode 100644 index 000000000000..1b8a2dcd712f --- /dev/null +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#ifndef __ARM64_KVM_HYP_FAULT_H__ +#define __ARM64_KVM_HYP_FAULT_H__ + +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) +{ + u64 par, tmp; + + /* + * Resolve the IPA the hard way using the guest VA. + * + * Stage-1 translation already validated the memory access + * rights. As such, we can use the EL1 translation regime, and + * don't have to distinguish between EL0 and EL1 access. + * + * We do need to save/restore PAR_EL1 though, as we haven't + * saved the guest context yet, and we may return early... + */ + par = read_sysreg_par(); + if (!__kvm_at("s1e1r", far)) + tmp = read_sysreg_par(); + else + tmp = SYS_PAR_EL1_F; /* back to the guest */ + write_sysreg(par, par_el1); + + if (unlikely(tmp & SYS_PAR_EL1_F)) + return false; /* Translation failed, back to guest */ + + /* Convert PAR to HPFAR format */ + *hpfar = PAR_TO_HPFAR(tmp); + return true; +} + +static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +{ + u64 hpfar, far; + + far = read_sysreg_el2(SYS_FAR); + + /* + * The HPFAR can be invalid if the stage 2 fault did not + * happen during a stage 1 page table walk (the ESR_EL2.S1PTW + * bit is clear) and one of the two following cases are true: + * 1. The fault was due to a permission fault + * 2. 
The processor carries errata 834220 + * + * Therefore, for all non S1PTW faults where we either have a + * permission fault or the errata workaround is enabled, we + * resolve the IPA using the AT instruction. + */ + if (!(esr & ESR_ELx_S1PTW) && + (cpus_have_final_cap(ARM64_WORKAROUND_834220) || + (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { + if (!__translate_far_to_hpfar(far, &hpfar)) + return false; + } else { + hpfar = read_sysreg(hpfar_el2); + } + + fault->far_el2 = far; + fault->hpfar_el2 = hpfar; + return true; +} + +#endif diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index a0e78a6027be..58e14f8ead23 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -8,6 +8,7 @@ #define __ARM64_KVM_HYP_SWITCH_H__ #include <hyp/adjust_pc.h> +#include <hyp/fault.h> #include <linux/arm-smccc.h> #include <linux/kvm_host.h> @@ -28,10 +29,13 @@ #include <asm/fpsimd.h> #include <asm/debug-monitors.h> #include <asm/processor.h> -#include <asm/thread_info.h> -extern struct exception_table_entry __start___kvm_ex_table; -extern struct exception_table_entry __stop___kvm_ex_table; +struct kvm_exception_table_entry { + int insn, fixup; +}; + +extern struct kvm_exception_table_entry __start___kvm_ex_table; +extern struct kvm_exception_table_entry __stop___kvm_ex_table; /* Check whether the FP regs were dirtied while in the host-side run loop: */ static inline bool update_fp_enabled(struct kvm_vcpu *vcpu) @@ -44,7 +48,7 @@ static inline bool update_fp_enabled(struct kvm_vcpu *vcpu) * trap the accesses. */ if (!system_supports_fpsimd() || - vcpu->arch.host_thread_info->flags & _TIF_FOREIGN_FPSTATE) + vcpu->arch.flags & KVM_ARM64_FP_FOREIGN_FPSTATE) vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED | KVM_ARM64_FP_HOST); @@ -133,88 +137,9 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu) } } -static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) -{ - u64 par, tmp; - - /* - * Resolve the IPA the hard way using the guest VA. - * - * Stage-1 translation already validated the memory access - * rights. As such, we can use the EL1 translation regime, and - * don't have to distinguish between EL0 and EL1 access. - * - * We do need to save/restore PAR_EL1 though, as we haven't - * saved the guest context yet, and we may return early... - */ - par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) - tmp = read_sysreg_par(); - else - tmp = SYS_PAR_EL1_F; /* back to the guest */ - write_sysreg(par, par_el1); - - if (unlikely(tmp & SYS_PAR_EL1_F)) - return false; /* Translation failed, back to guest */ - - /* Convert PAR to HPFAR format */ - *hpfar = PAR_TO_HPFAR(tmp); - return true; -} - -static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) -{ - u64 hpfar, far; - - far = read_sysreg_el2(SYS_FAR); - - /* - * The HPFAR can be invalid if the stage 2 fault did not - * happen during a stage 1 page table walk (the ESR_EL2.S1PTW - * bit is clear) and one of the two following cases are true: - * 1. The fault was due to a permission fault - * 2. The processor carries errata 834220 - * - * Therefore, for all non S1PTW faults where we either have a - * permission fault or the errata workaround is enabled, we - * resolve the IPA using the AT instruction. 
- */ - if (!(esr & ESR_ELx_S1PTW) && - (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { - if (!__translate_far_to_hpfar(far, &hpfar)) - return false; - } else { - hpfar = read_sysreg(hpfar_el2); - } - - fault->far_el2 = far; - fault->hpfar_el2 = hpfar; - return true; -} - static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) { - u8 ec; - u64 esr; - - esr = vcpu->arch.fault.esr_el2; - ec = ESR_ELx_EC(esr); - - if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW) - return true; - - return __get_fault_info(esr, &vcpu->arch.fault); -} - -static inline void __hyp_sve_save_host(struct kvm_vcpu *vcpu) -{ - struct thread_struct *thread; - - thread = container_of(vcpu->arch.host_fpsimd_state, struct thread_struct, - uw.fpsimd_state); - - __sve_save_state(sve_pffr(thread), &vcpu->arch.host_fpsimd_state->fpsr); + return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); } static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) @@ -225,28 +150,23 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); } -/* Check for an FPSIMD/SVE trap and handle as appropriate */ -static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) +/* + * We trap the first access to the FP/SIMD to save the host context and + * restore the guest context lazily. + * If FP/SIMD is not implemented, handle the trap and inject an undefined + * instruction exception to the guest. Similarly for trapped SVE accesses. + */ +static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { - bool sve_guest, sve_host; + bool sve_guest; u8 esr_ec; u64 reg; if (!system_supports_fpsimd()) return false; - if (system_supports_sve()) { - sve_guest = vcpu_has_sve(vcpu); - sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE; - } else { - sve_guest = false; - sve_host = false; - } - + sve_guest = vcpu_has_sve(vcpu); esr_ec = kvm_vcpu_trap_get_class(vcpu); - if (esr_ec != ESR_ELx_EC_FP_ASIMD && - esr_ec != ESR_ELx_EC_SVE) - return false; /* Don't handle SVE traps for non-SVE vcpus here: */ if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD) @@ -269,11 +189,7 @@ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) isb(); if (vcpu->arch.flags & KVM_ARM64_FP_HOST) { - if (sve_host) - __hyp_sve_save_host(vcpu); - else - __fpsimd_save_state(vcpu->arch.host_fpsimd_state); - + __fpsimd_save_state(vcpu->arch.host_fpsimd_state); vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; } @@ -348,14 +264,6 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) static inline bool esr_is_ptrauth_trap(u32 esr) { - u32 ec = ESR_ELx_EC(esr); - - if (ec == ESR_ELx_EC_PAC) - return true; - - if (ec != ESR_ELx_EC_SYS64) - return false; - switch (esr_sys64_to_sysreg(esr)) { case SYS_APIAKEYLO_EL1: case SYS_APIAKEYHI_EL1: @@ -384,13 +292,12 @@ static inline bool esr_is_ptrauth_trap(u32 esr) DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); -static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) +static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code) { struct kvm_cpu_context *ctxt; u64 val; - if (!vcpu_has_ptrauth(vcpu) || - !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) + if (!vcpu_has_ptrauth(vcpu)) return false; ctxt = this_cpu_ptr(&kvm_hyp_ctxt); @@ -409,6 +316,92 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) return true; } +static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if 
(cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && + handle_tx2_tvm(vcpu)) + return true; + + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) + return true; + + if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) + return kvm_hyp_handle_ptrauth(vcpu, exit_code); + + return false; +} + +static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) + return true; + + return false; +} + +static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (!__populate_fault_info(vcpu)) + return true; + + return false; +} + +static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (!__populate_fault_info(vcpu)) + return true; + + if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { + bool valid; + + valid = kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && + kvm_vcpu_dabt_isvalid(vcpu) && + !kvm_vcpu_abt_issea(vcpu) && + !kvm_vcpu_abt_iss1tw(vcpu); + + if (valid) { + int ret = __vgic_v2_perform_cpuif_access(vcpu); + + if (ret == 1) + return true; + + /* Promote an illegal access to an SError.*/ + if (ret == -1) + *exit_code = ARM_EXCEPTION_EL1_SERROR; + } + } + + return false; +} + +typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); + +static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); + +/* + * Allow the hypervisor to handle the exit with an exit handler if it has one. + * + * Returns true if the hypervisor handled the exit, and control should go back + * to the guest, or false if it hasn't. + */ +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + exit_handler_fn fn; + + fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; + + if (fn) + return fn(vcpu, exit_code); + + return false; +} + /* * Return true when we were able to fixup the guest exit and should return to * the guest, false when we should restore the host state and return to the @@ -416,6 +409,18 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) */ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { + /* + * Save PSTATE early so that we can evaluate the vcpu mode + * early on. + */ + vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); + + /* + * Check whether we want to repaint the state one way or + * another. + */ + early_exit_filter(vcpu, exit_code); + if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); @@ -443,59 +448,9 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (*exit_code != ARM_EXCEPTION_TRAP) goto exit; - if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && - kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 && - handle_tx2_tvm(vcpu)) - goto guest; - - /* - * We trap the first access to the FP/SIMD to save the host context - * and restore the guest context lazily. - * If FP/SIMD is not implemented, handle the trap and inject an - * undefined instruction exception to the guest. - * Similarly for trapped SVE accesses. - */ - if (__hyp_handle_fpsimd(vcpu)) - goto guest; - - if (__hyp_handle_ptrauth(vcpu)) + /* Check if there's an exit handler and allow it to handle the exit. 
*/ + if (kvm_hyp_handle_exit(vcpu, exit_code)) goto guest; - - if (!__populate_fault_info(vcpu)) - goto guest; - - if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { - bool valid; - - valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW && - kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && - kvm_vcpu_dabt_isvalid(vcpu) && - !kvm_vcpu_abt_issea(vcpu) && - !kvm_vcpu_abt_iss1tw(vcpu); - - if (valid) { - int ret = __vgic_v2_perform_cpuif_access(vcpu); - - if (ret == 1) - goto guest; - - /* Promote an illegal access to an SError.*/ - if (ret == -1) - *exit_code = ARM_EXCEPTION_EL1_SERROR; - - goto exit; - } - } - - if (static_branch_unlikely(&vgic_v3_cpuif_trap) && - (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 || - kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { - int ret = __vgic_v3_perform_cpuif_access(vcpu); - - if (ret == 1) - goto guest; - } - exit: /* Return to the host kernel and handle the exit */ return false; @@ -510,7 +465,7 @@ static inline void __kvm_unexpected_el2_exception(void) { extern char __guest_exit_panic[]; unsigned long addr, fixup; - struct exception_table_entry *entry, *end; + struct kvm_exception_table_entry *entry, *end; unsigned long elr_el2 = read_sysreg(elr_el2); entry = &__start___kvm_ex_table; diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index de7e14c862e6..7ecca8b07851 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -70,7 +70,12 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) { ctxt->regs.pc = read_sysreg_el2(SYS_ELR); - ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR); + /* + * Guest PSTATE gets saved at guest fixup time in all + * cases. We still need to handle the nVHE host side here. + */ + if (!has_vhe() && ctxt->__hyp_running_vcpu) + ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR); if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2); diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h new file mode 100644 index 000000000000..eea1f6a53723 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#ifndef __ARM64_KVM_FIXED_CONFIG_H__ +#define __ARM64_KVM_FIXED_CONFIG_H__ + +#include <asm/sysreg.h> + +/* + * This file contains definitions for features to be allowed or restricted for + * guest virtual machines, depending on the mode KVM is running in and on the + * type of guest that is running. + * + * The ALLOW masks represent a bitmask of feature fields that are allowed + * without any restrictions as long as they are supported by the system. + * + * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for + * features that are restricted to support at most the specified feature. + * + * If a feature field is not present in either, than it is not supported. 
+ * + * The approach taken for protected VMs is to allow features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ + +/* + * Allow for protected VMs: + * - Floating-point and Advanced SIMD + * - Data Independent Timing + */ +#define PVM_ID_AA64PFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR0_FP) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_DIT) \ + ) + +/* + * Restrict to the following *unsigned* features for protected VMs: + * - AArch64 guests only (no support for AArch32 guests): + * AArch32 adds complexity in trap handling, emulation, condition codes, + * etc... + * - RAS (v1) + * Supported by KVM + */ +#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL0), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL2), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL3), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_RAS), ID_AA64PFR0_RAS_V1) \ + ) + +/* + * Allow for protected VMs: + * - Branch Target Identification + * - Speculative Store Bypassing + */ +#define PVM_ID_AA64PFR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR1_BT) | \ + ARM64_FEATURE_MASK(ID_AA64PFR1_SSBS) \ + ) + +/* + * Allow for protected VMs: + * - Mixed-endian + * - Distinction between Secure and Non-secure Memory + * - Mixed-endian at EL0 only + * - Non-context synchronizing exception entry and exit + */ +#define PVM_ID_AA64MMFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR0_BIGENDEL) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_SNSMEM) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_BIGENDEL0) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EXS) \ + ) + +/* + * Restrict to the following *unsigned* features for protected VMs: + * - 40-bit IPA + * - 16-bit ASID + */ +#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_PARANGE), ID_AA64MMFR0_PARANGE_40) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_ASID), ID_AA64MMFR0_ASID_16) \ + ) + +/* + * Allow for protected VMs: + * - Hardware translation table updates to Access flag and Dirty state + * - Number of VMID bits from CPU + * - Hierarchical Permission Disables + * - Privileged Access Never + * - SError interrupt exceptions from speculative reads + * - Enhanced Translation Synchronization + */ +#define PVM_ID_AA64MMFR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR1_HADBS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_VMIDBITS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_HPD) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_PAN) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_SPECSEI) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_ETS) \ + ) + +/* + * Allow for protected VMs: + * - Common not Private translations + * - User Access Override + * - IESB bit in the SCTLR_ELx registers + * - Unaligned single-copy atomicity and atomic functions + * - ESR_ELx.EC value on an exception by read access to feature ID space + * - TTL field in address operations. 
+ * - Break-before-make sequences when changing translation block size + * - E0PDx mechanism + */ +#define PVM_ID_AA64MMFR2_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR2_CNP) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_UAO) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_IESB) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_AT) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_IDS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_TTL) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_BBM) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_E0PD) \ + ) + +/* + * No support for Scalable Vectors for protected VMs: + * Requires additional support from KVM, e.g., context-switching and + * trapping at EL2 + */ +#define PVM_ID_AA64ZFR0_ALLOW (0ULL) + +/* + * No support for debug, including breakpoints, and watchpoints for protected + * VMs: + * The Arm architecture mandates support for at least the Armv8 debug + * architecture, which would include at least 2 hardware breakpoints and + * watchpoints. Providing that support to protected guests adds + * considerable state and complexity. Therefore, the reserved value of 0 is + * used for debug-related fields. + */ +#define PVM_ID_AA64DFR0_ALLOW (0ULL) +#define PVM_ID_AA64DFR1_ALLOW (0ULL) + +/* + * No support for implementation defined features. + */ +#define PVM_ID_AA64AFR0_ALLOW (0ULL) +#define PVM_ID_AA64AFR1_ALLOW (0ULL) + +/* + * No restrictions on instructions implemented in AArch64. + */ +#define PVM_ID_AA64ISAR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR0_AES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA1) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA2) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_CRC32) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_ATOMICS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_RDM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SM3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SM4) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_DP) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_FHM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_TS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_TLB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_RNDR) \ + ) + +#define PVM_ID_AA64ISAR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR1_DPB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_JSCVT) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_FCMA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_LRCPC) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_FRINTTS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_SB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_SPECRES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_BF16) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_DGH) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_I8MM) \ + ) + +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +int kvm_check_pvm_sysreg_table(void); + +#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index fb0f523d1492..0a048dc06a7d 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -24,6 +24,7 @@ struct hyp_pool { /* Allocation */ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +void hyp_split_page(struct hyp_page *page); void hyp_get_page(struct hyp_pool *pool, void *addr); void hyp_put_page(struct hyp_pool *pool, void *addr); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h 
b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index b58c910babaf..80e99836eac7 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -24,6 +24,11 @@ enum pkvm_page_state { PKVM_PAGE_OWNED = 0ULL, PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, + __PKVM_PAGE_RESERVED = KVM_PGTABLE_PROT_SW0 | + KVM_PGTABLE_PROT_SW1, + + /* Meta-states which aren't encoded directly in the PTE's SW bits */ + PKVM_NOPAGE, }; #define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) @@ -50,6 +55,7 @@ extern const u8 pkvm_hyp_id; int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); +int __pkvm_host_unshare_hyp(u64 pfn); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index c9a8f535212e..2d08510c6cc1 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -10,13 +10,8 @@ #include <nvhe/memory.h> #include <nvhe/spinlock.h> -#define HYP_MEMBLOCK_REGIONS 128 -extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; -extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); extern struct kvm_pgtable pkvm_pgtable; extern hyp_spinlock_t pkvm_pgd_lock; -extern struct hyp_pool hpool; -extern u64 __io_map_base; int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); @@ -39,58 +34,4 @@ static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size, *end = ALIGN(*end, PAGE_SIZE); } -static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) -{ - unsigned long total = 0, i; - - /* Provision the worst case scenario */ - for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) { - nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); - total += nr_pages; - } - - return total; -} - -static inline unsigned long __hyp_pgtable_total_pages(void) -{ - unsigned long res = 0, i; - - /* Cover all of memory with page-granularity */ - for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { - struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i]; - res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT); - } - - return res; -} - -static inline unsigned long hyp_s1_pgtable_pages(void) -{ - unsigned long res; - - res = __hyp_pgtable_total_pages(); - - /* Allow 1 GiB for private mappings */ - res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); - - return res; -} - -static inline unsigned long host_s2_pgtable_pages(void) -{ - unsigned long res; - - /* - * Include an extra 16 pages to safely upper-bound the worst case of - * concatenated pgds. 
- */ - res = __hyp_pgtable_total_pages() + 16; - - /* Allow 1 GiB for MMIO mappings */ - res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); - - return res; -} - #endif /* __KVM_HYP_MM_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 1e6d995968a1..45a84f0ade04 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -15,4 +15,6 @@ #define DECLARE_REG(type, name, ctxt, reg) \ type name = (type)cpu_reg(ctxt, (reg)) +void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu); + #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 8d741f71377f..24b2c2425b38 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -14,7 +14,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs)) obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o obj-y += $(lib-objs) @@ -89,6 +89,7 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI) # cause crashes. Just disable it. GCOV_PROFILE := n KASAN_SANITIZE := n +KCSAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c index 1306c430ab87..00de04153cc6 100644 --- a/arch/arm64/kvm/hyp/nvhe/early_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/early_alloc.c @@ -43,6 +43,9 @@ void *hyp_early_alloc_page(void *arg) return hyp_early_alloc_contig(1); } +static void hyp_early_alloc_get_page(void *addr) { } +static void hyp_early_alloc_put_page(void *addr) { } + void hyp_early_alloc_init(void *virt, unsigned long size) { base = cur = (unsigned long)virt; @@ -51,4 +54,6 @@ void hyp_early_alloc_init(void *virt, unsigned long size) hyp_early_alloc_mm_ops.zalloc_page = hyp_early_alloc_page; hyp_early_alloc_mm_ops.phys_to_virt = hyp_phys_to_virt; hyp_early_alloc_mm_ops.virt_to_phys = hyp_virt_to_phys; + hyp_early_alloc_mm_ops.get_page = hyp_early_alloc_get_page; + hyp_early_alloc_mm_ops.put_page = hyp_early_alloc_put_page; } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 4b652ffb591d..3d613e721a75 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -110,17 +110,14 @@ SYM_FUNC_START(__hyp_do_panic) b __host_enter_for_panic SYM_FUNC_END(__hyp_do_panic) -.macro host_el1_sync_vect - .align 7 -.L__vect_start\@: - stp x0, x1, [sp, #-16]! - mrs x0, esr_el2 - lsr x0, x0, #ESR_ELx_EC_SHIFT - cmp x0, #ESR_ELx_EC_HVC64 - b.ne __host_exit - +SYM_FUNC_START(__host_hvc) ldp x0, x1, [sp] // Don't fixup the stack yet + /* No stub for you, sonny Jim */ +alternative_if ARM64_KVM_PROTECTED_MODE + b __host_exit +alternative_else_nop_endif + /* Check for a stub HVC call */ cmp x0, #HVC_STUB_HCALL_NR b.hs __host_exit @@ -137,6 +134,17 @@ SYM_FUNC_END(__hyp_do_panic) ldr x5, =__kvm_handle_stub_hvc hyp_pa x5, x6 br x5 +SYM_FUNC_END(__host_hvc) + +.macro host_el1_sync_vect + .align 7 +.L__vect_start\@: + stp x0, x1, [sp, #-16]! 
+ mrs x0, esr_el2 + ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH + cmp x0, #ESR_ELx_EC_HVC64 + b.eq __host_hvc + b __host_exit .L__vect_end\@: .if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) .error "host_el1_sync_vect larger than vector entry" diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 2da6aa8da868..5e2197db0d32 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -4,7 +4,7 @@ * Author: Andrew Scull <ascull@google.com> */ -#include <hyp/switch.h> +#include <hyp/adjust_pc.h> #include <asm/pgtable-types.h> #include <asm/kvm_asm.h> @@ -147,6 +147,13 @@ static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn); } +static void handle___pkvm_host_unshare_hyp(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_host_unshare_hyp(pfn); +} + static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); @@ -160,41 +167,66 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) { cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } + +static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + + __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x static const hcall_t host_hcall[] = { - HANDLE_FUNC(__kvm_vcpu_run), + /* ___kvm_hyp_init */ + HANDLE_FUNC(__kvm_get_mdcr_el2), + HANDLE_FUNC(__pkvm_init), + HANDLE_FUNC(__pkvm_create_private_mapping), + HANDLE_FUNC(__pkvm_cpu_set_vector), + HANDLE_FUNC(__kvm_enable_ssbs), + HANDLE_FUNC(__vgic_v3_init_lrs), + HANDLE_FUNC(__vgic_v3_get_gic_config), + HANDLE_FUNC(__pkvm_prot_finalize), + + HANDLE_FUNC(__pkvm_host_share_hyp), + HANDLE_FUNC(__pkvm_host_unshare_hyp), HANDLE_FUNC(__kvm_adjust_pc), + HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__kvm_enable_ssbs), - HANDLE_FUNC(__vgic_v3_get_gic_config), HANDLE_FUNC(__vgic_v3_read_vmcr), HANDLE_FUNC(__vgic_v3_write_vmcr), - HANDLE_FUNC(__vgic_v3_init_lrs), - HANDLE_FUNC(__kvm_get_mdcr_el2), HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_aprs), - HANDLE_FUNC(__pkvm_init), - HANDLE_FUNC(__pkvm_cpu_set_vector), - HANDLE_FUNC(__pkvm_host_share_hyp), - HANDLE_FUNC(__pkvm_create_private_mapping), - HANDLE_FUNC(__pkvm_prot_finalize), + HANDLE_FUNC(__pkvm_vcpu_init_traps), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(unsigned long, id, host_ctxt, 0); + unsigned long hcall_min = 0; hcall_t hfn; + /* + * If pKVM has been initialised then reject any calls to the + * early "privileged" hypercalls. Note that we cannot reject + * calls to __pkvm_prot_finalize for two reasons: (1) The static + * key used to determine initialisation must be toggled prior to + * finalisation and (2) finalisation is performed on a per-CPU + * basis. This is all fine, however, since __pkvm_prot_finalize + * returns -EPERM after the first call for a given CPU. 
+ */ + if (static_branch_unlikely(&kvm_protected_mode_initialized)) + hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize; + id -= KVM_HOST_SMCCC_ID(0); - if (unlikely(id >= ARRAY_SIZE(host_hcall))) + if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall))) goto inval; hfn = host_hcall[id]; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index bacd493a4eac..674f10564373 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -9,9 +9,10 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> +#include <asm/kvm_pkvm.h> #include <asm/stage2_pgtable.h> -#include <hyp/switch.h> +#include <hyp/fault.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> @@ -25,17 +26,42 @@ struct host_kvm host_kvm; static struct hyp_pool host_s2_pool; -/* - * Copies of the host's CPU features registers holding sanitized values. - */ -u64 id_aa64mmfr0_el1_sys_val; -u64 id_aa64mmfr1_el1_sys_val; - const u8 pkvm_hyp_id = 1; +static void host_lock_component(void) +{ + hyp_spin_lock(&host_kvm.lock); +} + +static void host_unlock_component(void) +{ + hyp_spin_unlock(&host_kvm.lock); +} + +static void hyp_lock_component(void) +{ + hyp_spin_lock(&pkvm_pgd_lock); +} + +static void hyp_unlock_component(void) +{ + hyp_spin_unlock(&pkvm_pgd_lock); +} + static void *host_s2_zalloc_pages_exact(size_t size) { - return hyp_alloc_pages(&host_s2_pool, get_order(size)); + void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size)); + + hyp_split_page(hyp_virt_to_page(addr)); + + /* + * The size of concatenated PGDs is always a power of two of PAGE_SIZE, + * so there should be no need to free any of the tail pages to make the + * allocation exact. + */ + WARN_ON(size != (PAGE_SIZE << get_order(size))); + + return addr; } static void *host_s2_zalloc_page(void *pool) @@ -98,19 +124,19 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) prepare_host_vtcr(); hyp_spin_lock_init(&host_kvm.lock); + mmu->arch = &host_kvm.arch; ret = prepare_s2_pool(pgt_pool_base); if (ret) return ret; - ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch, + ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, mmu, &host_kvm.mm_ops, KVM_HOST_S2_FLAGS, host_stage2_force_pte_cb); if (ret) return ret; mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); - mmu->arch = &host_kvm.arch; mmu->pgt = &host_kvm.pgt; WRITE_ONCE(mmu->vmid.vmid_gen, 0); WRITE_ONCE(mmu->vmid.vmid, 0); @@ -123,6 +149,9 @@ int __pkvm_prot_finalize(void) struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + if (params->hcr_el2 & HCR_VM) + return -EPERM; + params->vttbr = kvm_get_vttbr(mmu); params->vtcr = host_kvm.arch.vtcr; params->hcr_el2 |= HCR_VM; @@ -330,116 +359,446 @@ static int host_stage2_idmap(u64 addr) prot = is_memory ? 
PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; - hyp_spin_lock(&host_kvm.lock); + host_lock_component(); ret = host_stage2_adjust_range(addr, &range); if (ret) goto unlock; ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot); unlock: - hyp_spin_unlock(&host_kvm.lock); + host_unlock_component(); return ret; } -static inline bool check_prot(enum kvm_pgtable_prot prot, - enum kvm_pgtable_prot required, - enum kvm_pgtable_prot denied) +void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) { - return (prot & (required | denied)) == required; + struct kvm_vcpu_fault_info fault; + u64 esr, addr; + int ret = 0; + + esr = read_sysreg_el2(SYS_ESR); + BUG_ON(!__get_fault_info(esr, &fault)); + + addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; + ret = host_stage2_idmap(addr); + BUG_ON(ret && ret != -EAGAIN); } -int __pkvm_host_share_hyp(u64 pfn) +/* This corresponds to locking order */ +enum pkvm_component_id { + PKVM_ID_HOST, + PKVM_ID_HYP, +}; + +struct pkvm_mem_transition { + u64 nr_pages; + + struct { + enum pkvm_component_id id; + /* Address in the initiator's address space */ + u64 addr; + + union { + struct { + /* Address in the completer's address space */ + u64 completer_addr; + } host; + }; + } initiator; + + struct { + enum pkvm_component_id id; + } completer; +}; + +struct pkvm_mem_share { + const struct pkvm_mem_transition tx; + const enum kvm_pgtable_prot completer_prot; +}; + +struct check_walk_data { + enum pkvm_page_state desired; + enum pkvm_page_state (*get_page_state)(kvm_pte_t pte); +}; + +static int __check_page_state_visitor(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) { - phys_addr_t addr = hyp_pfn_to_phys(pfn); - enum kvm_pgtable_prot prot, cur; - void *virt = __hyp_va(addr); - enum pkvm_page_state state; - kvm_pte_t pte; - int ret; + struct check_walk_data *d = arg; + kvm_pte_t pte = *ptep; - if (!addr_is_memory(addr)) + if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte))) return -EINVAL; - hyp_spin_lock(&host_kvm.lock); - hyp_spin_lock(&pkvm_pgd_lock); + return d->get_page_state(pte) == d->desired ? 
0 : -EPERM; +} + +static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct check_walk_data *data) +{ + struct kvm_pgtable_walker walker = { + .cb = __check_page_state_visitor, + .arg = data, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + +static enum pkvm_page_state host_get_page_state(kvm_pte_t pte) +{ + if (!kvm_pte_valid(pte) && pte) + return PKVM_NOPAGE; + + return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); +} + +static int __host_check_page_state_range(u64 addr, u64 size, + enum pkvm_page_state state) +{ + struct check_walk_data d = { + .desired = state, + .get_page_state = host_get_page_state, + }; + + hyp_assert_lock_held(&host_kvm.lock); + return check_page_state_range(&host_kvm.pgt, addr, size, &d); +} + +static int __host_set_page_state_range(u64 addr, u64 size, + enum pkvm_page_state state) +{ + enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state); + + return host_stage2_idmap_locked(addr, size, prot); +} + +static int host_request_owned_transition(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.host.completer_addr; + return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED); +} + +static int host_request_unshare(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.host.completer_addr; + return __host_check_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); +} + +static int host_initiate_share(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.host.completer_addr; + return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); +} + +static int host_initiate_unshare(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.host.completer_addr; + return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); +} + +static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte) +{ + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; + + return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); +} + +static int __hyp_check_page_state_range(u64 addr, u64 size, + enum pkvm_page_state state) +{ + struct check_walk_data d = { + .desired = state, + .get_page_state = hyp_get_page_state, + }; + + hyp_assert_lock_held(&pkvm_pgd_lock); + return check_page_state_range(&pkvm_pgtable, addr, size, &d); +} + +static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) +{ + return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || + tx->initiator.id != PKVM_ID_HOST); +} + +static int hyp_ack_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (perms != PAGE_HYP) + return -EPERM; + + if (__hyp_ack_skip_pgtable_check(tx)) + return 0; + + return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); +} + +static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (__hyp_ack_skip_pgtable_check(tx)) + return 0; + + return __hyp_check_page_state_range(addr, size, + PKVM_PAGE_SHARED_BORROWED); +} + +static int hyp_complete_share(u64 addr, const struct 
pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); + enum kvm_pgtable_prot prot; + + prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED); + return pkvm_create_mappings_locked(start, end, prot); +} + +static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + int ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, addr, size); + + return (ret != size) ? -EFAULT : 0; +} + +static int check_share(struct pkvm_mem_share *share) +{ + const struct pkvm_mem_transition *tx = &share->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_request_owned_transition(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } - ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, NULL); if (ret) - goto unlock; - if (!pte) - goto map_shared; + return ret; - /* - * Check attributes in the host stage-2 PTE. We need the page to be: - * - mapped RWX as we're sharing memory; - * - not borrowed, as that implies absence of ownership. - * Otherwise, we can't let it got through - */ - cur = kvm_pgtable_stage2_pte_prot(pte); - prot = pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED); - if (!check_prot(cur, PKVM_HOST_MEM_PROT, prot)) { - ret = -EPERM; - goto unlock; + switch (tx->completer.id) { + case PKVM_ID_HYP: + ret = hyp_ack_share(completer_addr, tx, share->completer_prot); + break; + default: + ret = -EINVAL; } - state = pkvm_getstate(cur); - if (state == PKVM_PAGE_OWNED) - goto map_shared; + return ret; +} - /* - * Tolerate double-sharing the same page, but this requires - * cross-checking the hypervisor stage-1. - */ - if (state != PKVM_PAGE_SHARED_OWNED) { - ret = -EPERM; - goto unlock; +static int __do_share(struct pkvm_mem_share *share) +{ + const struct pkvm_mem_transition *tx = &share->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_initiate_share(&completer_addr, tx); + break; + default: + ret = -EINVAL; } - ret = kvm_pgtable_get_leaf(&pkvm_pgtable, (u64)virt, &pte, NULL); if (ret) - goto unlock; + return ret; - /* - * If the page has been shared with the hypervisor, it must be - * already mapped as SHARED_BORROWED in its stage-1. - */ - cur = kvm_pgtable_hyp_pte_prot(pte); - prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); - if (!check_prot(cur, prot, ~prot)) - ret = -EPERM; - goto unlock; + switch (tx->completer.id) { + case PKVM_ID_HYP: + ret = hyp_complete_share(completer_addr, tx, share->completer_prot); + break; + default: + ret = -EINVAL; + } -map_shared: - /* - * If the page is not yet shared, adjust mappings in both page-tables - * while both locks are held. - */ - prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); - ret = pkvm_create_mappings_locked(virt, virt + PAGE_SIZE, prot); - BUG_ON(ret); + return ret; +} - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); - ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot); - BUG_ON(ret); +/* + * do_share(): + * + * The page owner grants access to another component with a given set + * of permissions. 
+ * + * Initiator: OWNED => SHARED_OWNED + * Completer: NOPAGE => SHARED_BORROWED + */ +static int do_share(struct pkvm_mem_share *share) +{ + int ret; -unlock: - hyp_spin_unlock(&pkvm_pgd_lock); - hyp_spin_unlock(&host_kvm.lock); + ret = check_share(share); + if (ret) + return ret; + + return WARN_ON(__do_share(share)); +} + +static int check_unshare(struct pkvm_mem_share *share) +{ + const struct pkvm_mem_transition *tx = &share->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_request_unshare(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HYP: + ret = hyp_ack_unshare(completer_addr, tx); + break; + default: + ret = -EINVAL; + } return ret; } -void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) +static int __do_unshare(struct pkvm_mem_share *share) { - struct kvm_vcpu_fault_info fault; - u64 esr, addr; - int ret = 0; + const struct pkvm_mem_transition *tx = &share->tx; + u64 completer_addr; + int ret; - esr = read_sysreg_el2(SYS_ESR); - BUG_ON(!__get_fault_info(esr, &fault)); + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_initiate_unshare(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } - addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; - ret = host_stage2_idmap(addr); - BUG_ON(ret && ret != -EAGAIN); + if (ret) + return ret; + + switch (tx->completer.id) { + case PKVM_ID_HYP: + ret = hyp_complete_unshare(completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/* + * do_unshare(): + * + * The page owner revokes access from another component for a range of + * pages which were previously shared using do_share(). + * + * Initiator: SHARED_OWNED => OWNED + * Completer: SHARED_BORROWED => NOPAGE + */ +static int do_unshare(struct pkvm_mem_share *share) +{ + int ret; + + ret = check_unshare(share); + if (ret) + return ret; + + return WARN_ON(__do_unshare(share)); +} + +int __pkvm_host_share_hyp(u64 pfn) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_share share = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = hyp_addr, + }, + }, + .completer = { + .id = PKVM_ID_HYP, + }, + }, + .completer_prot = PAGE_HYP, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_share(&share); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_unshare_hyp(u64 pfn) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_share share = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = hyp_addr, + }, + }, + .completer = { + .id = PKVM_ID_HYP, + }, + }, + .completer_prot = PAGE_HYP, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_unshare(&share); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; } diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 2fabeceb889a..526a7d6fa86f 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -8,6 +8,7 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> +#include <asm/kvm_pkvm.h> #include <asm/spectre.h> #include <nvhe/early_alloc.h> @@ -18,11 +19,12 @@ struct kvm_pgtable pkvm_pgtable; hyp_spinlock_t pkvm_pgd_lock; -u64 
__io_map_base; struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; unsigned int hyp_memblock_nr; +static u64 __io_map_base; + static int __pkvm_create_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot) { diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 41fc25bdfb34..543cad6c376a 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -152,6 +152,7 @@ static inline void hyp_page_ref_inc(struct hyp_page *p) static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) { + BUG_ON(!p->refcount); p->refcount--; return (p->refcount == 0); } @@ -193,6 +194,20 @@ void hyp_get_page(struct hyp_pool *pool, void *addr) hyp_spin_unlock(&pool->lock); } +void hyp_split_page(struct hyp_page *p) +{ + unsigned short order = p->order; + unsigned int i; + + p->order = 0; + for (i = 1; i < (1 << order); i++) { + struct hyp_page *tail = p + i; + + tail->order = 0; + hyp_set_page_refcounted(tail); + } +} + void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) { unsigned short i = order; @@ -226,7 +241,7 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, int i; hyp_spin_lock_init(&pool->lock); - pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT)); + pool->max_order = min(MAX_ORDER, get_order((nr_pages + 1) << PAGE_SHIFT)); for (i = 0; i < pool->max_order; i++) INIT_LIST_HEAD(&pool->free_area[i]); pool->range_start = phys; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c new file mode 100644 index 000000000000..99c8d8b73e70 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#include <linux/kvm_host.h> +#include <linux/mm.h> +#include <nvhe/fixed_config.h> +#include <nvhe/trap_handler.h> + +/* + * Set trap register values based on features in ID_AA64PFR0. + */ +static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); + u64 hcr_set = HCR_RW; + u64 hcr_clear = 0; + u64 cptr_set = 0; + + /* Protected KVM does not support AArch32 guests. */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL0), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_ELx_64BIT_ONLY); + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_ELx_64BIT_ONLY); + + /* + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. + */ + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_FP), + PVM_ID_AA64PFR0_ALLOW)); + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD), + PVM_ID_AA64PFR0_ALLOW)); + + /* Trap RAS unless all current versions are supported */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_RAS), feature_ids) < + ID_AA64PFR0_RAS_V1P1) { + hcr_set |= HCR_TERR | HCR_TEA; + hcr_clear |= HCR_FIEN; + } + + /* Trap AMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_AMU), feature_ids)) { + hcr_clear |= HCR_AMVOFFEN; + cptr_set |= CPTR_EL2_TAM; + } + + /* Trap SVE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_SVE), feature_ids)) + cptr_set |= CPTR_EL2_TZ; + + vcpu->arch.hcr_el2 |= hcr_set; + vcpu->arch.hcr_el2 &= ~hcr_clear; + vcpu->arch.cptr_el2 |= cptr_set; +} + +/* + * Set trap register values based on features in ID_AA64PFR1. 
+ */ +static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); + u64 hcr_set = 0; + u64 hcr_clear = 0; + + /* Memory Tagging: Trap and Treat as Untagged if not supported. */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), feature_ids)) { + hcr_set |= HCR_TID5; + hcr_clear |= HCR_DCT | HCR_ATA; + } + + vcpu->arch.hcr_el2 |= hcr_set; + vcpu->arch.hcr_el2 &= ~hcr_clear; +} + +/* + * Set trap register values based on features in ID_AA64DFR0. + */ +static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + u64 mdcr_set = 0; + u64 mdcr_clear = 0; + u64 cptr_set = 0; + + /* Trap/constrain PMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMUVER), feature_ids)) { + mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME | + MDCR_EL2_HPMN_MASK; + } + + /* Trap Debug */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), feature_ids)) + mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE; + + /* Trap OS Double Lock */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DOUBLELOCK), feature_ids)) + mdcr_set |= MDCR_EL2_TDOSA; + + /* Trap SPE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER), feature_ids)) { + mdcr_set |= MDCR_EL2_TPMS; + mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; + } + + /* Trap Trace Filter */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_TRACE_FILT), feature_ids)) + mdcr_set |= MDCR_EL2_TTRF; + + /* Trap Trace */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_TRACEVER), feature_ids)) + cptr_set |= CPTR_EL2_TTA; + + vcpu->arch.mdcr_el2 |= mdcr_set; + vcpu->arch.mdcr_el2 &= ~mdcr_clear; + vcpu->arch.cptr_el2 |= cptr_set; +} + +/* + * Set trap register values based on features in ID_AA64MMFR0. + */ +static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); + u64 mdcr_set = 0; + + /* Trap Debug Communications Channel registers */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_FGT), feature_ids)) + mdcr_set |= MDCR_EL2_TDCC; + + vcpu->arch.mdcr_el2 |= mdcr_set; +} + +/* + * Set trap register values based on features in ID_AA64MMFR1. + */ +static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); + u64 hcr_set = 0; + + /* Trap LOR */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_LOR), feature_ids)) + hcr_set |= HCR_TLOR; + + vcpu->arch.hcr_el2 |= hcr_set; +} + +/* + * Set baseline trap register values. + */ +static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) +{ + const u64 hcr_trap_feat_regs = HCR_TID3; + const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1; + + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef; + + /* Clear res0 and set res1 bits to trap potential new features. */ + vcpu->arch.hcr_el2 &= ~(HCR_RES0); + vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); + vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; + vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); +} + +/* + * Initialize trap register values for protected VMs. 
+ */ +void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +{ + pvm_init_trap_regs(vcpu); + pvm_init_traps_aa64pfr0(vcpu); + pvm_init_traps_aa64pfr1(vcpu); + pvm_init_traps_aa64dfr0(vcpu); + pvm_init_traps_aa64mmfr0(vcpu); + pvm_init_traps_aa64mmfr1(vcpu); +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 57c27846320f..27af337f9fea 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -8,15 +8,16 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> +#include <asm/kvm_pkvm.h> #include <nvhe/early_alloc.h> +#include <nvhe/fixed_config.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> #include <nvhe/mem_protect.h> #include <nvhe/mm.h> #include <nvhe/trap_handler.h> -struct hyp_pool hpool; unsigned long hyp_nr_cpus; #define hyp_percpu_size ((unsigned long)__per_cpu_end - \ @@ -26,6 +27,7 @@ static void *vmemmap_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; +static struct hyp_pool hpool; static int divide_memory_pool(void *virt, unsigned long size) { @@ -164,6 +166,7 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, enum kvm_pgtable_walk_flags flag, void * const arg) { + struct kvm_pgtable_mm_ops *mm_ops = arg; enum kvm_pgtable_prot prot; enum pkvm_page_state state; kvm_pte_t pte = *ptep; @@ -172,12 +175,21 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, if (!kvm_pte_valid(pte)) return 0; + /* + * Fix-up the refcount for the page-table pages as the early allocator + * was unable to access the hyp_vmemmap and so the buddy allocator has + * initialised the refcount to '1'. + */ + mm_ops->get_page(ptep); + if (flag != KVM_PGTABLE_WALK_LEAF) + return 0; + if (level != (KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; phys = kvm_pte_to_phys(pte); if (!addr_is_memory(phys)) - return 0; + return -EINVAL; /* * Adjust the host stage-2 mappings to match the ownership attributes @@ -204,10 +216,21 @@ static int finalize_host_mappings(void) { struct kvm_pgtable_walker walker = { .cb = finalize_host_mappings_walker, - .flags = KVM_PGTABLE_WALK_LEAF, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + .arg = pkvm_pgtable.mm_ops, }; + int i, ret; + + for (i = 0; i < hyp_memblock_nr; i++) { + struct memblock_region *reg = &hyp_memory[i]; + u64 start = (u64)hyp_phys_to_virt(reg->base); + + ret = kvm_pgtable_walk(&pkvm_pgtable, start, reg->size, &walker); + if (ret) + return ret; + } - return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), &walker); + return 0; } void __noreturn __pkvm_init_finalise(void) @@ -229,19 +252,20 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; - ret = finalize_host_mappings(); - if (ret) - goto out; - pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_page = hyp_zalloc_hyp_page, .phys_to_virt = hyp_phys_to_virt, .virt_to_phys = hyp_virt_to_phys, .get_page = hpool_get_page, .put_page = hpool_put_page, + .page_count = hyp_page_count, }; pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; + ret = finalize_host_mappings(); + if (ret) + goto out; + out: /* * We tail-called to here from handle___pkvm_init() and will not return, @@ -260,6 +284,8 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, void (*fn)(phys_addr_t params_pa, void *finalize_fn_va); int ret; + BUG_ON(kvm_check_pvm_sysreg_table()); + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) return -EINVAL; diff --git 
a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index a34b01cc8ab9..6410d21d8695 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -25,8 +25,8 @@
#include <asm/fpsimd.h>
#include <asm/debug-monitors.h>
#include <asm/processor.h>
-#include <asm/thread_info.h>
+#include <nvhe/fixed_config.h>
#include <nvhe/mem_protect.h>
/* Non-VHE specific context */
@@ -158,6 +158,98 @@ static void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt)
 write_sysreg(pmu->events_host, pmcntenset_el0);
}
+/**
+ * Handler for protected VM MSR, MRS or System instruction execution in AArch64.
+ *
+ * Returns true if the hypervisor has handled the exit, and control should go
+ * back to the guest, or false if it hasn't.
+ */
+static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ /*
+ * Make sure we handle the exit for workarounds and ptrauth
+ * before the pKVM handling, as the latter could decide to
+ * UNDEF.
+ */
+ return (kvm_hyp_handle_sysreg(vcpu, exit_code) ||
+ kvm_handle_pvm_sysreg(vcpu, exit_code));
+}
+
+/**
+ * Handler for protected floating-point and Advanced SIMD accesses.
+ *
+ * Returns true if the hypervisor has handled the exit, and control should go
+ * back to the guest, or false if it hasn't.
+ */
+static bool kvm_handle_pvm_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ /* Linux guests assume support for floating-point and Advanced SIMD. */
+ BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_FP),
+ PVM_ID_AA64PFR0_ALLOW));
+ BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD),
+ PVM_ID_AA64PFR0_ALLOW));
+
+ return kvm_hyp_handle_fpsimd(vcpu, exit_code);
+}
+
+static const exit_handler_fn hyp_exit_handlers[] = {
+ [0 ... ESR_ELx_EC_MAX] = NULL,
+ [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32,
+ [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg,
+ [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd,
+ [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
+ [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
+ [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low,
+ [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth,
+};
+
+static const exit_handler_fn pvm_exit_handlers[] = {
+ [0 ... ESR_ELx_EC_MAX] = NULL,
+ [ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64,
+ [ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted,
+ [ESR_ELx_EC_FP_ASIMD] = kvm_handle_pvm_fpsimd,
+ [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low,
+ [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low,
+ [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth,
+};
+
+static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(kvm_vm_is_protected(kern_hyp_va(vcpu->kvm))))
+ return pvm_exit_handlers;
+
+ return hyp_exit_handlers;
+}
+
+/*
+ * Some guests (e.g., protected VMs) are not allowed to run in AArch32.
+ * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a
+ * guest from dropping to AArch32 EL0 if implemented by the CPU. If the
+ * hypervisor spots a guest in such a state, ensure it is handled, and don't
+ * trust the host to spot or fix it. The check below is based on the one in
+ * kvm_arch_vcpu_ioctl_run().
+ *
+ * If the guest ran in AArch32 when it shouldn't have, the exit code is
+ * adjusted to force an exit to the host (ARM_EXCEPTION_IL); otherwise the
+ * guest run loop can continue.
+ */ +static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + struct kvm *kvm = kern_hyp_va(vcpu->kvm); + + if (kvm_vm_is_protected(kvm) && vcpu_mode_is_32bit(vcpu)) { + /* + * As we have caught the guest red-handed, decide that it isn't + * fit for purpose anymore by making the vcpu invalid. The VMM + * can try and fix it by re-initializing the vcpu with + * KVM_ARM_VCPU_INIT, however, this is likely not possible for + * protected VMs. + */ + vcpu->arch.target = -1; + *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); + *exit_code |= ARM_EXCEPTION_IL; + } +} + /* Switch to the guest for legacy non-VHE systems */ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) { diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c new file mode 100644 index 000000000000..792cf6e6ac92 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -0,0 +1,487 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#include <linux/irqchip/arm-gic-v3.h> + +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> + +#include <hyp/adjust_pc.h> + +#include <nvhe/fixed_config.h> + +#include "../../sys_regs.h" + +/* + * Copies of the host's CPU features registers holding sanitized values at hyp. + */ +u64 id_aa64pfr0_el1_sys_val; +u64 id_aa64pfr1_el1_sys_val; +u64 id_aa64isar0_el1_sys_val; +u64 id_aa64isar1_el1_sys_val; +u64 id_aa64mmfr0_el1_sys_val; +u64 id_aa64mmfr1_el1_sys_val; +u64 id_aa64mmfr2_el1_sys_val; + +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | + KVM_ARM64_EXCEPT_AA64_ELx_SYNC | + KVM_ARM64_PENDING_EXCEPTION); + + __kvm_adjust_pc(vcpu); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); +} + +/* + * Returns the restricted features values of the feature register based on the + * limitations in restrict_fields. + * A feature id field value of 0b0000 does not impose any restrictions. + * Note: Use only for unsigned feature field values. + */ +static u64 get_restricted_features_unsigned(u64 sys_reg_val, + u64 restrict_fields) +{ + u64 value = 0UL; + u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + + /* + * According to the Arm Architecture Reference Manual, feature fields + * use increasing values to indicate increases in functionality. + * Iterate over the restricted feature fields and calculate the minimum + * unsigned value between the one supported by the system, and what the + * value is being restricted to. + */ + while (sys_reg_val && restrict_fields) { + value |= min(sys_reg_val & mask, restrict_fields & mask); + sys_reg_val &= ~mask; + restrict_fields &= ~mask; + mask <<= ARM64_FEATURE_FIELD_BITS; + } + + return value; +} + +/* + * Functions that return the value of feature id registers for protected VMs + * based on allowed features, system features, and KVM support. 
+ */ + +static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) +{ + const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); + u64 set_mask = 0; + u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; + + if (!vcpu_has_sve(vcpu)) + allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE); + + set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); + + /* Spectre and Meltdown mitigation in KVM */ + set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), + (u64)kvm->arch.pfr0_csv2); + set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), + (u64)kvm->arch.pfr0_csv3); + + return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; +} + +static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu) +{ + const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); + u64 allow_mask = PVM_ID_AA64PFR1_ALLOW; + + if (!kvm_has_mte(kvm)) + allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); + + return id_aa64pfr1_el1_sys_val & allow_mask; +} + +static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for Scalable Vectors, therefore, hyp has no sanitized + * copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for debug, including breakpoints, and watchpoints, + * therefore, pKVM has no sanitized copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu) +{ + /* + * No support for debug, therefore, hyp has no sanitized copy of the + * feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for implementation defined features, therefore, hyp has no + * sanitized copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu) +{ + /* + * No support for implementation defined features, therefore, hyp has no + * sanitized copy of the feature id register. 
+ */ + BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu) +{ + return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW; +} + +static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu) +{ + u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW; + + if (!vcpu_has_ptrauth(vcpu)) + allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI)); + + return id_aa64isar1_el1_sys_val & allow_mask; +} + +static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) +{ + u64 set_mask; + + set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, + PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED); + + return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; +} + +static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu) +{ + return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW; +} + +static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) +{ + return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; +} + +/* Read a sanitized cpufeature ID register by its encoding */ +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +{ + switch (id) { + case SYS_ID_AA64PFR0_EL1: + return get_pvm_id_aa64pfr0(vcpu); + case SYS_ID_AA64PFR1_EL1: + return get_pvm_id_aa64pfr1(vcpu); + case SYS_ID_AA64ZFR0_EL1: + return get_pvm_id_aa64zfr0(vcpu); + case SYS_ID_AA64DFR0_EL1: + return get_pvm_id_aa64dfr0(vcpu); + case SYS_ID_AA64DFR1_EL1: + return get_pvm_id_aa64dfr1(vcpu); + case SYS_ID_AA64AFR0_EL1: + return get_pvm_id_aa64afr0(vcpu); + case SYS_ID_AA64AFR1_EL1: + return get_pvm_id_aa64afr1(vcpu); + case SYS_ID_AA64ISAR0_EL1: + return get_pvm_id_aa64isar0(vcpu); + case SYS_ID_AA64ISAR1_EL1: + return get_pvm_id_aa64isar1(vcpu); + case SYS_ID_AA64MMFR0_EL1: + return get_pvm_id_aa64mmfr0(vcpu); + case SYS_ID_AA64MMFR1_EL1: + return get_pvm_id_aa64mmfr1(vcpu); + case SYS_ID_AA64MMFR2_EL1: + return get_pvm_id_aa64mmfr2(vcpu); + default: + /* + * Should never happen because all cases are covered in + * pvm_sys_reg_descs[]. + */ + WARN_ON(1); + break; + } + + return 0; +} + +static u64 read_id_reg(const struct kvm_vcpu *vcpu, + struct sys_reg_desc const *r) +{ + return pvm_read_id_reg(vcpu, reg_to_encoding(r)); +} + +/* Handler to RAZ/WI sysregs */ +static bool pvm_access_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!p->is_write) + p->regval = 0; + + return true; +} + +/* + * Accessor for AArch32 feature id registers. + * + * The value of these registers is "unknown" according to the spec if AArch32 + * isn't supported. + */ +static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + inject_undef64(vcpu); + return false; + } + + /* + * No support for AArch32 guests, therefore, pKVM has no sanitized copy + * of AArch32 feature id registers. + */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_ELx_64BIT_ONLY); + + return pvm_access_raz_wi(vcpu, p, r); +} + +/* + * Accessor for AArch64 feature id registers. + * + * If access is allowed, set the regval to the protected VM's view of the + * register and return true. + * Otherwise, inject an undefined exception and return false. 
+ */ +static bool pvm_access_id_aarch64(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + inject_undef64(vcpu); + return false; + } + + p->regval = read_id_reg(vcpu, r); + return true; +} + +static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + /* pVMs only support GICv3. 'nuf said. */ + if (!p->is_write) + p->regval = ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB | ICC_SRE_EL1_SRE; + + return true; +} + +/* Mark the specified system register as an AArch32 feature id register. */ +#define AARCH32(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch32 } + +/* Mark the specified system register as an AArch64 feature id register. */ +#define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 } + +/* Mark the specified system register as Read-As-Zero/Write-Ignored */ +#define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi } + +/* Mark the specified system register as not being handled in hyp. */ +#define HOST_HANDLED(REG) { SYS_DESC(REG), .access = NULL } + +/* + * Architected system registers. + * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 + * + * NOTE: Anything not explicitly listed here is *restricted by default*, i.e., + * it will lead to injecting an exception into the guest. + */ +static const struct sys_reg_desc pvm_sys_reg_descs[] = { + /* Cache maintenance by set/way operations are restricted. */ + + /* Debug and Trace Registers are restricted. */ + + /* AArch64 mappings of the AArch32 ID registers */ + /* CRm=1 */ + AARCH32(SYS_ID_PFR0_EL1), + AARCH32(SYS_ID_PFR1_EL1), + AARCH32(SYS_ID_DFR0_EL1), + AARCH32(SYS_ID_AFR0_EL1), + AARCH32(SYS_ID_MMFR0_EL1), + AARCH32(SYS_ID_MMFR1_EL1), + AARCH32(SYS_ID_MMFR2_EL1), + AARCH32(SYS_ID_MMFR3_EL1), + + /* CRm=2 */ + AARCH32(SYS_ID_ISAR0_EL1), + AARCH32(SYS_ID_ISAR1_EL1), + AARCH32(SYS_ID_ISAR2_EL1), + AARCH32(SYS_ID_ISAR3_EL1), + AARCH32(SYS_ID_ISAR4_EL1), + AARCH32(SYS_ID_ISAR5_EL1), + AARCH32(SYS_ID_MMFR4_EL1), + AARCH32(SYS_ID_ISAR6_EL1), + + /* CRm=3 */ + AARCH32(SYS_MVFR0_EL1), + AARCH32(SYS_MVFR1_EL1), + AARCH32(SYS_MVFR2_EL1), + AARCH32(SYS_ID_PFR2_EL1), + AARCH32(SYS_ID_DFR1_EL1), + AARCH32(SYS_ID_MMFR5_EL1), + + /* AArch64 ID registers */ + /* CRm=4 */ + AARCH64(SYS_ID_AA64PFR0_EL1), + AARCH64(SYS_ID_AA64PFR1_EL1), + AARCH64(SYS_ID_AA64ZFR0_EL1), + AARCH64(SYS_ID_AA64DFR0_EL1), + AARCH64(SYS_ID_AA64DFR1_EL1), + AARCH64(SYS_ID_AA64AFR0_EL1), + AARCH64(SYS_ID_AA64AFR1_EL1), + AARCH64(SYS_ID_AA64ISAR0_EL1), + AARCH64(SYS_ID_AA64ISAR1_EL1), + AARCH64(SYS_ID_AA64MMFR0_EL1), + AARCH64(SYS_ID_AA64MMFR1_EL1), + AARCH64(SYS_ID_AA64MMFR2_EL1), + + /* Scalable Vector Registers are restricted. */ + + RAZ_WI(SYS_ERRIDR_EL1), + RAZ_WI(SYS_ERRSELR_EL1), + RAZ_WI(SYS_ERXFR_EL1), + RAZ_WI(SYS_ERXCTLR_EL1), + RAZ_WI(SYS_ERXSTATUS_EL1), + RAZ_WI(SYS_ERXADDR_EL1), + RAZ_WI(SYS_ERXMISC0_EL1), + RAZ_WI(SYS_ERXMISC1_EL1), + + /* Performance Monitoring Registers are restricted. */ + + /* Limited Ordering Regions Registers are restricted. */ + + HOST_HANDLED(SYS_ICC_SGI1R_EL1), + HOST_HANDLED(SYS_ICC_ASGI1R_EL1), + HOST_HANDLED(SYS_ICC_SGI0R_EL1), + { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, }, + + HOST_HANDLED(SYS_CCSIDR_EL1), + HOST_HANDLED(SYS_CLIDR_EL1), + HOST_HANDLED(SYS_CSSELR_EL1), + HOST_HANDLED(SYS_CTR_EL0), + + /* Performance Monitoring Registers are restricted. */ + + /* Activity Monitoring Registers are restricted. 
*/
+
+ HOST_HANDLED(SYS_CNTP_TVAL_EL0),
+ HOST_HANDLED(SYS_CNTP_CTL_EL0),
+ HOST_HANDLED(SYS_CNTP_CVAL_EL0),
+
+ /* Performance Monitoring Registers are restricted. */
+};
+
+/*
+ * Checks that the sysreg table is unique and in-order.
+ *
+ * Returns 0 if the table is consistent, or 1 otherwise.
+ */
+int kvm_check_pvm_sysreg_table(void)
+{
+ unsigned int i;
+
+ for (i = 1; i < ARRAY_SIZE(pvm_sys_reg_descs); i++) {
+ if (cmp_sys_reg(&pvm_sys_reg_descs[i-1], &pvm_sys_reg_descs[i]) >= 0)
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Handler for protected VM MSR, MRS or System instruction execution.
+ *
+ * Returns true if the hypervisor has handled the exit, and control should go
+ * back to the guest, or false if it hasn't, to be handled by the host.
+ */
+bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ const struct sys_reg_desc *r;
+ struct sys_reg_params params;
+ unsigned long esr = kvm_vcpu_get_esr(vcpu);
+ int Rt = kvm_vcpu_sys_get_rt(vcpu);
+
+ params = esr_sys64_to_params(esr);
+ params.regval = vcpu_get_reg(vcpu, Rt);
+
+ r = find_reg(&params, pvm_sys_reg_descs, ARRAY_SIZE(pvm_sys_reg_descs));
+
+ /* Undefined (RESTRICTED). */
+ if (r == NULL) {
+ inject_undef64(vcpu);
+ return true;
+ }
+
+ /* Handled by the host (HOST_HANDLED) */
+ if (r->access == NULL)
+ return false;
+
+ /* Handled by hyp: skip instruction if instructed to do so. */
+ if (r->access(vcpu, &params, r))
+ __kvm_skip_instr(vcpu);
+
+ if (!params.is_write)
+ vcpu_set_reg(vcpu, Rt, params.regval);
+
+ return true;
+}
+
+/*
+ * Handler for protected VM restricted exceptions.
+ *
+ * Inject an undefined exception into the guest and return true to indicate that
+ * the hypervisor has handled the exit, and control should go back to the guest.
+ */
+bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ inject_undef64(vcpu);
+ return true;
+}
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index f8ceebe4982e..844a6f003fd5 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -383,21 +383,6 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
 return prot;
}
-static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
-{
- /*
- * Tolerate KVM recreating the exact same mapping, or changing software
- * bits if the existing mapping was valid.
- */ - if (old == new) - return false; - - if (!kvm_pte_valid(old)) - return true; - - return !WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW); -} - static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct hyp_map_data *data) { @@ -407,11 +392,16 @@ static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (!kvm_block_mapping_supported(addr, end, phys, level)) return false; + data->phys += granule; new = kvm_init_valid_leaf_pte(phys, data->attr, level); - if (hyp_pte_needs_update(old, new)) - smp_store_release(ptep, new); + if (old == new) + return true; + if (!kvm_pte_valid(old)) + data->mm_ops->get_page(ptep); + else if (WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) + return false; - data->phys += granule; + smp_store_release(ptep, new); return true; } @@ -433,6 +423,7 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return -ENOMEM; kvm_set_table_pte(ptep, childp, mm_ops); + mm_ops->get_page(ptep); return 0; } @@ -460,6 +451,69 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, return ret; } +struct hyp_unmap_data { + u64 unmapped; + struct kvm_pgtable_mm_ops *mm_ops; +}; + +static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + kvm_pte_t pte = *ptep, *childp = NULL; + u64 granule = kvm_granule_size(level); + struct hyp_unmap_data *data = arg; + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + + if (!kvm_pte_valid(pte)) + return -EINVAL; + + if (kvm_pte_table(pte, level)) { + childp = kvm_pte_follow(pte, mm_ops); + + if (mm_ops->page_count(childp) != 1) + return 0; + + kvm_clear_pte(ptep); + dsb(ishst); + __tlbi_level(vae2is, __TLBI_VADDR(addr, 0), level); + } else { + if (end - addr < granule) + return -EINVAL; + + kvm_clear_pte(ptep); + dsb(ishst); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); + data->unmapped += granule; + } + + dsb(ish); + isb(); + mm_ops->put_page(ptep); + + if (childp) + mm_ops->put_page(childp); + + return 0; +} + +u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct hyp_unmap_data unmap_data = { + .mm_ops = pgt->mm_ops, + }; + struct kvm_pgtable_walker walker = { + .cb = hyp_unmap_walker, + .arg = &unmap_data, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + }; + + if (!pgt->mm_ops->page_count) + return 0; + + kvm_pgtable_walk(pgt, addr, size, &walker); + return unmap_data.unmapped; +} + int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, struct kvm_pgtable_mm_ops *mm_ops) { @@ -482,8 +536,16 @@ static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) { struct kvm_pgtable_mm_ops *mm_ops = arg; + kvm_pte_t pte = *ptep; + + if (!kvm_pte_valid(pte)) + return 0; + + mm_ops->put_page(ptep); + + if (kvm_pte_table(pte, level)) + mm_ops->put_page(kvm_pte_follow(pte, mm_ops)); - mm_ops->put_page((void *)kvm_pte_follow(*ptep, mm_ops)); return 0; } @@ -491,7 +553,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) { struct kvm_pgtable_walker walker = { .cb = hyp_free_walker, - .flags = KVM_PGTABLE_WALK_TABLE_POST, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, .arg = pgt->mm_ops, }; @@ -1116,13 +1178,13 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) } -int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu 
*mmu, struct kvm_pgtable_mm_ops *mm_ops, enum kvm_pgtable_stage2_flags flags, kvm_pgtable_force_pte_cb_t force_pte_cb) { size_t pgd_sz; - u64 vtcr = arch->vtcr; + u64 vtcr = mmu->arch->vtcr; u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; @@ -1135,7 +1197,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, pgt->ia_bits = ia_bits; pgt->start_level = start_level; pgt->mm_ops = mm_ops; - pgt->mmu = &arch->mmu; + pgt->mmu = mmu; pgt->flags = flags; pgt->force_pte_cb = force_pte_cb; diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c deleted file mode 100644 index 578670e3f608..000000000000 --- a/arch/arm64/kvm/hyp/reserved_mem.c +++ /dev/null @@ -1,109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2020 - Google LLC - * Author: Quentin Perret <qperret@google.com> - */ - -#include <linux/kvm_host.h> -#include <linux/memblock.h> -#include <linux/sort.h> - -#include <asm/kvm_host.h> - -#include <nvhe/memory.h> -#include <nvhe/mm.h> - -static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); -static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); - -phys_addr_t hyp_mem_base; -phys_addr_t hyp_mem_size; - -static int cmp_hyp_memblock(const void *p1, const void *p2) -{ - const struct memblock_region *r1 = p1; - const struct memblock_region *r2 = p2; - - return r1->base < r2->base ? -1 : (r1->base > r2->base); -} - -static void __init sort_memblock_regions(void) -{ - sort(hyp_memory, - *hyp_memblock_nr_ptr, - sizeof(struct memblock_region), - cmp_hyp_memblock, - NULL); -} - -static int __init register_memblock_regions(void) -{ - struct memblock_region *reg; - - for_each_mem_region(reg) { - if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) - return -ENOMEM; - - hyp_memory[*hyp_memblock_nr_ptr] = *reg; - (*hyp_memblock_nr_ptr)++; - } - sort_memblock_regions(); - - return 0; -} - -void __init kvm_hyp_reserve(void) -{ - u64 nr_pages, prev, hyp_mem_pages = 0; - int ret; - - if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) - return; - - if (kvm_get_mode() != KVM_MODE_PROTECTED) - return; - - ret = register_memblock_regions(); - if (ret) { - *hyp_memblock_nr_ptr = 0; - kvm_err("Failed to register hyp memblocks: %d\n", ret); - return; - } - - hyp_mem_pages += hyp_s1_pgtable_pages(); - hyp_mem_pages += host_s2_pgtable_pages(); - - /* - * The hyp_vmemmap needs to be backed by pages, but these pages - * themselves need to be present in the vmemmap, so compute the number - * of pages needed by looking for a fixed point. - */ - nr_pages = 0; - do { - prev = nr_pages; - nr_pages = hyp_mem_pages + prev; - nr_pages = DIV_ROUND_UP(nr_pages * sizeof(struct hyp_page), PAGE_SIZE); - nr_pages += __hyp_pgtable_max_pages(nr_pages); - } while (nr_pages != prev); - hyp_mem_pages += nr_pages; - - /* - * Try to allocate a PMD-aligned region to reduce TLB pressure once - * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. 
- */ - hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; - hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), - PMD_SIZE); - if (!hyp_mem_base) - hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); - else - hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); - - if (!hyp_mem_base) { - kvm_err("Failed to reserve hyp memory\n"); - return; - } - - kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, - hyp_mem_base); -} diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 39f8f7f9227c..20db2f281cf2 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -695,9 +695,7 @@ static void __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) goto spurious; lr_val &= ~ICH_LR_STATE; - /* No active state for LPIs */ - if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI) - lr_val |= ICH_LR_ACTIVE_BIT; + lr_val |= ICH_LR_ACTIVE_BIT; __gic_v3_set_lr(lr_val, lr); __vgic_v3_set_active_priority(lr_prio, vmcr, grp); vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); @@ -764,20 +762,18 @@ static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* Drop priority in any case */ act_prio = __vgic_v3_clear_highest_active_priority(); - /* If EOIing an LPI, no deactivate to be performed */ - if (vid >= VGIC_MIN_LPI) - return; - - /* EOImode == 1, nothing to be done here */ - if (vmcr & ICH_VMCR_EOIM_MASK) - return; - lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); if (lr == -1) { - __vgic_v3_bump_eoicount(); + /* Do not bump EOIcount for LPIs that aren't in the LRs */ + if (!(vid >= VGIC_MIN_LPI)) + __vgic_v3_bump_eoicount(); return; } + /* EOImode == 1 and not an LPI, nothing to be done here */ + if ((vmcr & ICH_VMCR_EOIM_MASK) && !(vid >= VGIC_MIN_LPI)) + return; + lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; /* If priorities or group do not match, the guest has fscked-up. */ @@ -987,8 +983,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT; /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index ded2c66675f0..11d053fdd604 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -24,7 +24,6 @@ #include <asm/fpsimd.h> #include <asm/debug-monitors.h> #include <asm/processor.h> -#include <asm/thread_info.h> /* VHE specific context */ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); @@ -96,6 +95,26 @@ void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) __deactivate_traps_common(vcpu); } +static const exit_handler_fn hyp_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, +}; + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +{ + return hyp_exit_handlers; +} + +static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +{ +} + /* Switch to the guest for VHE systems running in EL2 */ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { |