diff options
Diffstat (limited to 'arch/x86/kvm')
46 files changed, 2577 insertions, 1223 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2c86673155c9..4e43923656d0 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -46,8 +46,8 @@ config KVM_X86 select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_PRE_FAULT_MEMORY - select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM select KVM_WERROR if WERROR + select KVM_GUEST_MEMFD if X86_64 config KVM tristate "Kernel-based Virtual Machine (KVM) support" @@ -74,7 +74,7 @@ config KVM_WERROR # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning. # Building KVM with -Werror and KASAN is still doable via enabling # the kernel-wide WERROR=y. - depends on KVM && ((EXPERT && !KASAN) || WERROR) + depends on KVM_X86 && ((EXPERT && !KASAN) || WERROR) help Add -Werror to the build flags for KVM. @@ -83,7 +83,8 @@ config KVM_WERROR config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT - depends on KVM && X86_64 + depends on KVM_X86 && X86_64 + select KVM_GENERIC_MEMORY_ATTRIBUTES help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for @@ -95,8 +96,6 @@ config KVM_SW_PROTECTED_VM config KVM_INTEL tristate "KVM for Intel (and compatible) processors support" depends on KVM && IA32_FEAT_CTL - select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST - select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST help Provides support for KVM on processors equipped with Intel's VT extensions, a.k.a. Virtual Machine Extensions (VMX). @@ -135,6 +134,8 @@ config KVM_INTEL_TDX bool "Intel Trust Domain Extensions (TDX) support" default y depends on INTEL_TDX_HOST + select KVM_GENERIC_MEMORY_ATTRIBUTES + select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching Intel Trust Domain Extensions (TDX) confidential VMs on Intel processors. @@ -157,9 +158,10 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM - select KVM_GENERIC_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_INVALIDATE + select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching encrypted VMs which use Secure Encrypted Virtualization (SEV), Secure Encrypted Virtualization with @@ -169,7 +171,7 @@ config KVM_AMD_SEV config KVM_IOAPIC bool "I/O APIC, PIC, and PIT emulation" default y - depends on KVM + depends on KVM_X86 help Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e. for full in-kernel APIC emulation. @@ -179,7 +181,7 @@ config KVM_IOAPIC config KVM_SMM bool "System Management Mode emulation" default y - depends on KVM + depends on KVM_X86 help Provides support for KVM to emulate System Management Mode (SMM) in virtual machines. This can be used by the virtual machine @@ -189,7 +191,7 @@ config KVM_SMM config KVM_HYPERV bool "Support for Microsoft Hyper-V emulation" - depends on KVM + depends on KVM_X86 default y help Provides KVM support for emulating Microsoft Hyper-V. This allows KVM @@ -203,7 +205,7 @@ config KVM_HYPERV config KVM_XEN bool "Support for Xen hypercall interface" - depends on KVM + depends on KVM_X86 help Provides KVM support for the hosting Xen HVM guests and passing Xen hypercalls to userspace. @@ -213,7 +215,7 @@ config KVM_XEN config KVM_PROVE_MMU bool "Prove KVM MMU correctness" depends on DEBUG_KERNEL - depends on KVM + depends on KVM_X86 depends on EXPERT help Enables runtime assertions in KVM's MMU that are too costly to enable @@ -228,7 +230,7 @@ config KVM_EXTERNAL_WRITE_TRACKING config KVM_MAX_NR_VCPUS int "Maximum number of vCPUs per KVM guest" - depends on KVM + depends on KVM_X86 range 1024 4096 default 4096 if MAXSMP default 1024 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e2836a255b16..52524e0ca97f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -34,7 +34,7 @@ * aligned to sizeof(unsigned long) because it's not accessed via bitops. */ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; -EXPORT_SYMBOL_GPL(kvm_cpu_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps); struct cpuid_xstate_sizes { u32 eax; @@ -131,7 +131,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2( return NULL; } -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_cpuid_entry2); static int kvm_check_cpuid(struct kvm_vcpu *vcpu) { @@ -263,6 +263,17 @@ static u64 cpuid_get_supported_xcr0(struct kvm_vcpu *vcpu) return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } +static u64 cpuid_get_supported_xss(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry_index(vcpu, 0xd, 1); + if (!best) + return 0; + + return (best->ecx | ((u64)best->edx << 32)) & kvm_caps.supported_xss; +} + static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entry, unsigned int x86_feature, @@ -305,7 +316,8 @@ static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry_index(vcpu, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) - best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + best->ebx = xstate_required_size(vcpu->arch.xcr0 | + vcpu->arch.ia32_xss, true); } static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) @@ -424,6 +436,7 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) } vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu); + vcpu->arch.guest_supported_xss = cpuid_get_supported_xss(vcpu); vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu); @@ -448,6 +461,8 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * adjustments to the reserved GPA bits. */ kvm_mmu_after_set_cpuid(vcpu); + + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); } int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) @@ -931,6 +946,7 @@ void kvm_set_cpu_caps(void) VENDOR_F(WAITPKG), F(SGX_LC), F(BUS_LOCK_DETECT), + X86_64_F(SHSTK), ); /* @@ -940,6 +956,14 @@ void kvm_set_cpu_caps(void) if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) kvm_cpu_cap_clear(X86_FEATURE_PKU); + /* + * Shadow Stacks aren't implemented in the Shadow MMU. Shadow Stack + * accesses require "magic" Writable=0,Dirty=1 protection, which KVM + * doesn't know how to emulate or map. + */ + if (!tdp_enabled) + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_init(CPUID_7_EDX, F(AVX512_4VNNIW), F(AVX512_4FMAPS), @@ -957,8 +981,19 @@ void kvm_set_cpu_caps(void) F(AMX_INT8), F(AMX_BF16), F(FLUSH_L1D), + F(IBT), ); + /* + * Disable support for IBT and SHSTK if KVM is configured to emulate + * accesses to reserved GPAs, as KVM's emulator doesn't support IBT or + * SHSTK, nor does KVM handle Shadow Stack #PFs (see above). + */ + if (allow_smaller_maxphyaddr) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + } + if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) && boot_cpu_has(X86_FEATURE_AMD_IBPB) && boot_cpu_has(X86_FEATURE_AMD_IBRS)) @@ -985,6 +1020,10 @@ void kvm_set_cpu_caps(void) F(LAM), ); + kvm_cpu_cap_init(CPUID_7_1_ECX, + SCATTERED_F(MSR_IMM), + ); + kvm_cpu_cap_init(CPUID_7_1_EDX, F(AVX_VNNI_INT8), F(AVX_NE_CONVERT), @@ -1222,7 +1261,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_RDPID); } } -EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps); #undef F #undef SCATTERED_F @@ -1411,9 +1450,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; cpuid_entry_override(entry, CPUID_7_1_EAX); + cpuid_entry_override(entry, CPUID_7_1_ECX); cpuid_entry_override(entry, CPUID_7_1_EDX); entry->ebx = 0; - entry->ecx = 0; } if (max_idx >= 2) { entry = do_host_cpuid(array, function, 2); @@ -1820,7 +1859,8 @@ static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, int r; if (func == CENTAUR_CPUID_SIGNATURE && - boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 0; r = do_cpuid_func(array, func, type); @@ -2001,7 +2041,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, if (function == 7 && index == 0) { u64 data; if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) && - !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && + !kvm_msr_read(vcpu, MSR_IA32_TSX_CTRL, &data) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(feature_bit(RTM) | feature_bit(HLE)); } else if (function == 0x80000007) { @@ -2045,7 +2085,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, used_max_basic); return exact; } -EXPORT_SYMBOL_GPL(kvm_cpuid); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpuid); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { @@ -2063,4 +2103,4 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_rdx_write(vcpu, edx); return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1349e278cd2a..59f93f68718a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -178,6 +178,7 @@ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */ +#define ShadowStack ((u64)1 << 57) /* Instruction affects Shadow Stacks. */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -1553,6 +1554,37 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, return linear_write_system(ctxt, addr, desc, sizeof(*desc)); } +static bool emulator_is_ssp_invalid(struct x86_emulate_ctxt *ctxt, u8 cpl) +{ + const u32 MSR_IA32_X_CET = cpl == 3 ? MSR_IA32_U_CET : MSR_IA32_S_CET; + u64 efer = 0, cet = 0, ssp = 0; + + if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET)) + return false; + + if (ctxt->ops->get_msr(ctxt, MSR_EFER, &efer)) + return true; + + /* SSP is guaranteed to be valid if the vCPU was already in 32-bit mode. */ + if (!(efer & EFER_LMA)) + return false; + + if (ctxt->ops->get_msr(ctxt, MSR_IA32_X_CET, &cet)) + return true; + + if (!(cet & CET_SHSTK_EN)) + return false; + + if (ctxt->ops->get_msr(ctxt, MSR_KVM_INTERNAL_GUEST_SSP, &ssp)) + return true; + + /* + * On transfer from 64-bit mode to compatibility mode, SSP[63:32] must + * be 0, i.e. SSP must be a 32-bit value outside of 64-bit mode. + */ + return ssp >> 32; +} + static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, enum x86_transfer_type transfer, @@ -1693,6 +1725,10 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (efer & EFER_LMA) goto exception; } + if (!seg_desc.l && emulator_is_ssp_invalid(ctxt, cpl)) { + err_code = 0; + goto exception; + } /* CS(RPL) <- CPL */ selector = (selector & 0xfffc) | cpl; @@ -4068,8 +4104,8 @@ static const struct opcode group4[] = { static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), - I(SrcMem | NearBranch | IsBranch, em_call_near_abs), - I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far), + I(SrcMem | NearBranch | IsBranch | ShadowStack, em_call_near_abs), + I(SrcMemFAddr | ImplicitOps | IsBranch | ShadowStack, em_call_far), I(SrcMem | NearBranch | IsBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far), I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), @@ -4304,7 +4340,7 @@ static const struct opcode opcode_table[256] = { DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), - I(SrcImmFAddr | No64 | IsBranch, em_call_far), N, + I(SrcImmFAddr | No64 | IsBranch | ShadowStack, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), II(ImplicitOps | Stack, em_popf, popf), I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), @@ -4324,19 +4360,19 @@ static const struct opcode opcode_table[256] = { X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), - I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm), - I(ImplicitOps | NearBranch | IsBranch, em_ret), + I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch | ShadowStack, em_ret_near_imm), + I(ImplicitOps | NearBranch | IsBranch | ShadowStack, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter), - I(Stack | IsBranch, em_leave), - I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm), - I(ImplicitOps | IsBranch, em_ret_far), - D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn), + I(Stack | SrcImmU16 | Src2ImmByte, em_enter), + I(Stack, em_leave), + I(ImplicitOps | SrcImmU16 | IsBranch | ShadowStack, em_ret_far_imm), + I(ImplicitOps | IsBranch | ShadowStack, em_ret_far), + D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch | ShadowStack, intn), D(ImplicitOps | No64 | IsBranch), - II(ImplicitOps | IsBranch, em_iret, iret), + II(ImplicitOps | IsBranch | ShadowStack, em_iret, iret), /* 0xD0 - 0xD7 */ G(Src2One | ByteOp, group2), G(Src2One, group2), G(Src2CL | ByteOp, group2), G(Src2CL, group2), @@ -4352,7 +4388,7 @@ static const struct opcode opcode_table[256] = { I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ - I(SrcImm | NearBranch | IsBranch, em_call), + I(SrcImm | NearBranch | IsBranch | ShadowStack, em_call), D(SrcImm | ImplicitOps | NearBranch | IsBranch), I(SrcImmFAddr | No64 | IsBranch, em_jmp_far), D(SrcImmByte | ImplicitOps | NearBranch | IsBranch), @@ -4371,7 +4407,7 @@ static const struct opcode opcode_table[256] = { static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, - N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall), + N, I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, @@ -4402,8 +4438,8 @@ static const struct opcode twobyte_table[256] = { IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), - I(ImplicitOps | EmulateOnUD | IsBranch, em_sysenter), - I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit), + I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_sysenter), + I(ImplicitOps | Priv | EmulateOnUD | IsBranch | ShadowStack, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ @@ -4514,6 +4550,60 @@ static const struct opcode opcode_map_0f_38[256] = { #undef I2bvIP #undef I6ALU +static bool is_shstk_instruction(struct x86_emulate_ctxt *ctxt) +{ + return ctxt->d & ShadowStack; +} + +static bool is_ibt_instruction(struct x86_emulate_ctxt *ctxt) +{ + u64 flags = ctxt->d; + + if (!(flags & IsBranch)) + return false; + + /* + * All far JMPs and CALLs (including SYSCALL, SYSENTER, and INTn) are + * indirect and thus affect IBT state. All far RETs (including SYSEXIT + * and IRET) are protected via Shadow Stacks and thus don't affect IBT + * state. IRET #GPs when returning to virtual-8086 and IBT or SHSTK is + * enabled, but that should be handled by IRET emulation (in the very + * unlikely scenario that KVM adds support for fully emulating IRET). + */ + if (!(flags & NearBranch)) + return ctxt->execute != em_iret && + ctxt->execute != em_ret_far && + ctxt->execute != em_ret_far_imm && + ctxt->execute != em_sysexit; + + switch (flags & SrcMask) { + case SrcReg: + case SrcMem: + case SrcMem16: + case SrcMem32: + return true; + case SrcMemFAddr: + case SrcImmFAddr: + /* Far branches should be handled above. */ + WARN_ON_ONCE(1); + return true; + case SrcNone: + case SrcImm: + case SrcImmByte: + /* + * Note, ImmU16 is used only for the stack adjustment operand on ENTER + * and RET instructions. ENTER isn't a branch and RET FAR is handled + * by the NearBranch check above. RET itself isn't an indirect branch. + */ + case SrcImmU16: + return false; + default: + WARN_ONCE(1, "Unexpected Src operand '%llx' on branch", + flags & SrcMask); + return false; + } +} + static unsigned imm_size(struct x86_emulate_ctxt *ctxt) { unsigned size; @@ -4943,6 +5033,40 @@ done_prefixes: ctxt->execute = opcode.u.execute; + /* + * Reject emulation if KVM might need to emulate shadow stack updates + * and/or indirect branch tracking enforcement, which the emulator + * doesn't support. + */ + if ((is_ibt_instruction(ctxt) || is_shstk_instruction(ctxt)) && + ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET) { + u64 u_cet = 0, s_cet = 0; + + /* + * Check both User and Supervisor on far transfers as inter- + * privilege level transfers are impacted by CET at the target + * privilege level, and that is not known at this time. The + * expectation is that the guest will not require emulation of + * any CET-affected instructions at any privilege level. + */ + if (!(ctxt->d & NearBranch)) + u_cet = s_cet = CET_SHSTK_EN | CET_ENDBR_EN; + else if (ctxt->ops->cpl(ctxt) == 3) + u_cet = CET_SHSTK_EN | CET_ENDBR_EN; + else + s_cet = CET_SHSTK_EN | CET_ENDBR_EN; + + if ((u_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_U_CET, &u_cet)) || + (s_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_S_CET, &s_cet))) + return EMULATION_FAILED; + + if ((u_cet | s_cet) & CET_SHSTK_EN && is_shstk_instruction(ctxt)) + return EMULATION_FAILED; + + if ((u_cet | s_cet) & CET_ENDBR_EN && is_ibt_instruction(ctxt)) + return EMULATION_FAILED; + } + if (unlikely(emulation_type & EMULTYPE_TRAP_UD) && likely(!(ctxt->d & EmulateOnUD))) return EMULATION_FAILED; @@ -5107,12 +5231,11 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt) ctxt->mem_read.end = 0; } -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts) { const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; - bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt); ctxt->mem_read.pos = 0; @@ -5160,7 +5283,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(&ctxt->dst); } - if (unlikely(is_guest_mode) && ctxt->intercept) { + if (unlikely(check_intercepts) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5189,7 +5312,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(check_intercepts) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5243,7 +5366,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(check_intercepts) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 72b19a88a776..38595ecb990d 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -923,7 +923,7 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } -EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_assist_page_enabled); int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { @@ -935,7 +935,7 @@ int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } -EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_get_assist_page); static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { @@ -1168,15 +1168,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); - mutex_lock(&hv->hv_lock); + guard(mutex)(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_SET || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) - goto out_unlock; + return; if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - goto out_unlock; + return; gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* @@ -1192,7 +1192,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; - goto out_unlock; + return; } /* @@ -1228,12 +1228,10 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; - goto out_unlock; + return; out_err: hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; -out_unlock: - mutex_unlock(&hv->hv_lock); } void kvm_hv_request_tsc_page_update(struct kvm *kvm) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2b5d389bca5f..2c2783296aed 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later /* * Copyright (C) 2001 MandrakeSoft S.A. * Copyright 2010 Red Hat, Inc. and/or its affiliates. @@ -8,20 +9,6 @@ * http://www.linux-mandrake.com/ * http://www.mandrakesoft.com/ * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * Yunhong Jiang <yunhong.jiang@intel.com> * Yaozu (Eddie) Dong <eddie.dong@intel.com> * Based on Xen 3.1 code. diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 16da89259011..7cc8950005b6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -103,7 +103,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_has_injectable_intr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_has_injectable_intr); /* * check if there is pending interrupt without @@ -119,7 +119,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_has_interrupt); /* * Read pending interrupt(from non-APIC source) @@ -148,7 +148,7 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v) WARN_ON_ONCE(!irqchip_split(v->kvm)); return get_userspace_extint(v); } -EXPORT_SYMBOL_GPL(kvm_cpu_get_extint); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_get_extint); /* * Read pending interrupt vector and intack. @@ -195,63 +195,6 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) return irqchip_in_kernel(kvm); } -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, struct dest_map *dest_map) -{ - int r = -1; - struct kvm_vcpu *vcpu, *lowest = NULL; - unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; - unsigned int dest_vcpus = 0; - - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) - return r; - - if (irq->dest_mode == APIC_DEST_PHYSICAL && - irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { - pr_info("apic: phys broadcast and lowest prio\n"); - irq->delivery_mode = APIC_DM_FIXED; - } - - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_apic_present(vcpu)) - continue; - - if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, - irq->dest_id, irq->dest_mode)) - continue; - - if (!kvm_lowest_prio_delivery(irq)) { - if (r < 0) - r = 0; - r += kvm_apic_set_irq(vcpu, irq, dest_map); - } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { - if (!kvm_vector_hashing_enabled()) { - if (!lowest) - lowest = vcpu; - else if (kvm_apic_compare_prio(vcpu, lowest) < 0) - lowest = vcpu; - } else { - __set_bit(i, dest_vcpu_bitmap); - dest_vcpus++; - } - } - } - - if (dest_vcpus != 0) { - int idx = kvm_vector_to_index(irq->vector, dest_vcpus, - dest_vcpu_bitmap, KVM_MAX_VCPUS); - - lowest = kvm_get_vcpu(kvm, idx); - } - - if (lowest) - r = kvm_apic_set_irq(lowest, irq, dest_map); - - return r; -} - static void kvm_msi_to_lapic_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) @@ -411,34 +354,6 @@ int kvm_set_routing_entry(struct kvm *kvm, return 0; } -bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, - struct kvm_vcpu **dest_vcpu) -{ - int r = 0; - unsigned long i; - struct kvm_vcpu *vcpu; - - if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) - return true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_apic_present(vcpu)) - continue; - - if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, - irq->dest_id, irq->dest_mode)) - continue; - - if (++r == 2) - return false; - - *dest_vcpu = vcpu; - } - - return r == 1; -} -EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); - void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, u8 vector, unsigned long *ioapic_handled_vectors) { diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 5e62c1f79ce6..34f4a78a7a01 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -121,8 +121,4 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, - struct dest_map *dest_map); - #endif diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 36a8786db291..8ddb01191d6f 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -7,7 +7,8 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP) #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE \ + | X86_CR4_CET) #define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG) #define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index c1df5acfacaf..7b5ddb787a25 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -235,7 +235,6 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); bool (*is_smm)(struct x86_emulate_ctxt *ctxt); - bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); @@ -521,7 +520,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_RESTART 1 #define EMULATION_INTERCEPTED 2 void init_decode_cache(struct x86_emulate_ctxt *ctxt); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index ded0bd688c65..ee53e75a60cb 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -101,13 +101,13 @@ int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages) return __hv_flush_remote_tlbs_range(kvm, &range); } -EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs_range); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_flush_remote_tlbs_range); int hv_flush_remote_tlbs(struct kvm *kvm) { return __hv_flush_remote_tlbs_range(kvm, NULL); } -EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_flush_remote_tlbs); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) { @@ -121,4 +121,4 @@ void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) spin_unlock(&kvm_arch->hv_root_tdp_lock); } } -EXPORT_SYMBOL_GPL(hv_track_root_tdp); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_track_root_tdp); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5fc437341e03..0ae7f913d782 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -74,6 +74,10 @@ module_param(lapic_timer_advance, bool, 0444); #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 + +static bool __read_mostly vector_hashing_enabled = true; +module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444); + static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); @@ -102,7 +106,7 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) } __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); -EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu); __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ); __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ); @@ -130,7 +134,7 @@ static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm)); } -bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) +static bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) { return kvm_x86_ops.set_hv_timer && !(kvm_mwait_in_guest(vcpu->kvm) || @@ -642,7 +646,7 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) return ((max_updated_irr != -1) && (max_updated_irr == *max_irr)); } -EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_apic_update_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { @@ -653,7 +657,7 @@ bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr apic->irr_pending = true; return irr_updated; } -EXPORT_SYMBOL_GPL(kvm_apic_update_irr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_irr); static inline int apic_search_irr(struct kvm_lapic *apic) { @@ -693,7 +697,7 @@ void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec) { apic_clear_irr(vec, vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_clear_irr); static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic) { @@ -775,7 +779,7 @@ void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu) kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } -EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_hwapic_isr); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { @@ -786,7 +790,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) */ return apic_find_highest_irr(vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, @@ -950,7 +954,7 @@ void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) { apic_update_ppr(vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_apic_update_ppr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_ppr); static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { @@ -1061,21 +1065,14 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, return false; } } -EXPORT_SYMBOL_GPL(kvm_apic_match_dest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_match_dest); -int kvm_vector_to_index(u32 vector, u32 dest_vcpus, - const unsigned long *bitmap, u32 bitmap_size) +static int kvm_vector_to_index(u32 vector, u32 dest_vcpus, + const unsigned long *bitmap, u32 bitmap_size) { - u32 mod; - int i, idx = -1; - - mod = vector % dest_vcpus; - - for (i = 0; i <= mod; i++) { - idx = find_next_bit(bitmap, bitmap_size, idx + 1); - BUG_ON(idx == bitmap_size); - } + int idx = find_nth_bit(bitmap, bitmap_size, vector % dest_vcpus); + BUG_ON(idx >= bitmap_size); return idx; } @@ -1106,6 +1103,16 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, return false; } +static bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) +{ + return (irq->delivery_mode == APIC_DM_LOWEST || irq->msi_redir_hint); +} + +static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) +{ + return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; +} + /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. @@ -1149,7 +1156,7 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, if (!kvm_lowest_prio_delivery(irq)) return true; - if (!kvm_vector_hashing_enabled()) { + if (!vector_hashing_enabled) { lowest = -1; for_each_set_bit(i, bitmap, 16) { if (!(*dst)[i]) @@ -1230,8 +1237,9 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, * interrupt. * - Otherwise, use remapped mode to inject the interrupt. */ -bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, - struct kvm_vcpu **dest_vcpu) +static bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, + struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; unsigned long bitmap; @@ -1258,6 +1266,91 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, return ret; } +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int r = 0; + unsigned long i; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_intr_is_single_vcpu); + +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, struct dest_map *dest_map) +{ + int r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned int dest_vcpus = 0; + + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + + if (irq->dest_mode == APIC_DEST_PHYSICAL && + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { + pr_info("apic: phys broadcast and lowest prio\n"); + irq->delivery_mode = APIC_DM_FIXED; + } + + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_lowest_prio_delivery(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq, dest_map); + } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { + if (!vector_hashing_enabled) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } else { + __set_bit(i, dest_vcpu_bitmap); + dest_vcpus++; + } + } + } + + if (dest_vcpus != 0) { + int idx = kvm_vector_to_index(irq->vector, dest_vcpus, + dest_vcpu_bitmap, KVM_MAX_VCPUS); + + lowest = kvm_get_vcpu(kvm, idx); + } + + if (lowest) + r = kvm_apic_set_irq(lowest, irq, dest_map); + + return r; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. @@ -1401,11 +1494,6 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, rcu_read_unlock(); } -int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) -{ - return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; -} - static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) { return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors); @@ -1481,32 +1569,38 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) kvm_ioapic_send_eoi(apic, vector); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } -EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_eoi_accelerated); -void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) +static void kvm_icr_to_lapic_irq(struct kvm_lapic *apic, u32 icr_low, + u32 icr_high, struct kvm_lapic_irq *irq) { - struct kvm_lapic_irq irq; - /* KVM has no delay and should always clear the BUSY/PENDING flag. */ WARN_ON_ONCE(icr_low & APIC_ICR_BUSY); - irq.vector = icr_low & APIC_VECTOR_MASK; - irq.delivery_mode = icr_low & APIC_MODE_MASK; - irq.dest_mode = icr_low & APIC_DEST_MASK; - irq.level = (icr_low & APIC_INT_ASSERT) != 0; - irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; - irq.shorthand = icr_low & APIC_SHORT_MASK; - irq.msi_redir_hint = false; + irq->vector = icr_low & APIC_VECTOR_MASK; + irq->delivery_mode = icr_low & APIC_MODE_MASK; + irq->dest_mode = icr_low & APIC_DEST_MASK; + irq->level = (icr_low & APIC_INT_ASSERT) != 0; + irq->trig_mode = icr_low & APIC_INT_LEVELTRIG; + irq->shorthand = icr_low & APIC_SHORT_MASK; + irq->msi_redir_hint = false; if (apic_x2apic_mode(apic)) - irq.dest_id = icr_high; + irq->dest_id = icr_high; else - irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high); + irq->dest_id = GET_XAPIC_DEST_FIELD(icr_high); +} + +void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) +{ + struct kvm_lapic_irq irq; + + kvm_icr_to_lapic_irq(apic, icr_low, icr_high, &irq); trace_kvm_apic_ipi(icr_low, irq.dest_id); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } -EXPORT_SYMBOL_GPL(kvm_apic_send_ipi); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_send_ipi); static u32 apic_get_tmcct(struct kvm_lapic *apic) { @@ -1623,7 +1717,7 @@ u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) return valid_reg_mask; } -EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_readable_reg_mask); static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) @@ -1864,7 +1958,7 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } -EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_wait_lapic_expire); static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) { @@ -2178,7 +2272,7 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) out: preempt_enable(); } -EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_expired_hv_timer); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { @@ -2431,11 +2525,11 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } -EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_set_eoi); #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) -int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) +static int __kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data, bool fast) { if (data & X2APIC_ICR_RESERVED_BITS) return 1; @@ -2450,7 +2544,20 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) */ data &= ~APIC_ICR_BUSY; - kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); + if (fast) { + struct kvm_lapic_irq irq; + int ignored; + + kvm_icr_to_lapic_irq(apic, (u32)data, (u32)(data >> 32), &irq); + + if (!kvm_irq_delivery_to_apic_fast(apic->vcpu->kvm, apic, &irq, + &ignored, NULL)) + return -EWOULDBLOCK; + + trace_kvm_apic_ipi((u32)data, irq.dest_id); + } else { + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); + } if (kvm_x86_ops.x2apic_icr_is_split) { kvm_lapic_set_reg(apic, APIC_ICR, data); kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); @@ -2461,6 +2568,16 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) return 0; } +static int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) +{ + return __kvm_x2apic_icr_write(apic, data, false); +} + +int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data) +{ + return __kvm_x2apic_icr_write(apic, data, true); +} + static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) { if (kvm_x86_ops.x2apic_icr_is_split) @@ -2491,7 +2608,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) else kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } -EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_write_nodecode); void kvm_free_lapic(struct kvm_vcpu *vcpu) { @@ -2629,7 +2746,7 @@ int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated) kvm_recalculate_apic_map(vcpu->kvm); return 0; } -EXPORT_SYMBOL_GPL(kvm_apic_set_base); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_base); void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { @@ -2661,26 +2778,23 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) int kvm_alloc_apic_access_page(struct kvm *kvm) { void __user *hva; - int ret = 0; - mutex_lock(&kvm->slots_lock); + guard(mutex)(&kvm->slots_lock); + if (kvm->arch.apic_access_memslot_enabled || kvm->arch.apic_access_memslot_inhibited) - goto out; + return 0; hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); - if (IS_ERR(hva)) { - ret = PTR_ERR(hva); - goto out; - } + if (IS_ERR(hva)) + return PTR_ERR(hva); kvm->arch.apic_access_memslot_enabled = true; -out: - mutex_unlock(&kvm->slots_lock); - return ret; + + return 0; } -EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_alloc_apic_access_page); void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu) { @@ -2944,7 +3058,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } -EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_has_interrupt); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { @@ -3003,7 +3117,7 @@ void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector) } } -EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_ack_interrupt); static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s, bool set) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 72de14527698..282b9b7da98c 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -105,7 +105,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); -int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); @@ -119,6 +118,9 @@ void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, + struct dest_map *dest_map); void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated); @@ -137,7 +139,7 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); -int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data); +int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); @@ -222,12 +224,6 @@ static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu) !kvm_x86_call(apic_init_signal_blocked)(vcpu); } -static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) -{ - return (irq->delivery_mode == APIC_DM_LOWEST || - irq->msi_redir_hint); -} - static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) { return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); @@ -240,16 +236,13 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, unsigned long *vcpu_bitmap); -bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, - struct kvm_vcpu **dest_vcpu); -int kvm_vector_to_index(u32 vector, u32 dest_vcpus, - const unsigned long *bitmap, u32 bitmap_size); +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); -bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu); static inline enum lapic_mode kvm_apic_mode(u64 apic_base) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index b4b6860ab971..f63074048ec6 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -212,7 +212,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, fault = (mmu->permissions[index] >> pte_access) & 1; - WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); + WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK | PFERR_RSVD_MASK)); if (unlikely(mmu->pkru_mask)) { u32 pkru_bits, offset; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6e838cb6c9e1..667d66cf76d5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -110,7 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); -EXPORT_SYMBOL_GPL(tdp_mmu_enabled); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(tdp_mmu_enabled); #endif static int max_huge_page_level __read_mostly; @@ -776,7 +776,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K); } -void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + enum kvm_mmu_type mmu_type) { /* * If it's possible to replace the shadow page with an NX huge page, @@ -790,8 +791,9 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) return; ++kvm->stat.nx_lpage_splits; + ++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages; list_add_tail(&sp->possible_nx_huge_page_link, - &kvm->arch.possible_nx_huge_pages); + &kvm->arch.possible_nx_huge_pages[mmu_type].pages); } static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, @@ -800,7 +802,7 @@ static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, sp->nx_huge_page_disallowed = true; if (nx_huge_page_possible) - track_possible_nx_huge_page(kvm, sp); + track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -819,12 +821,14 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + enum kvm_mmu_type mmu_type) { if (list_empty(&sp->possible_nx_huge_page_link)) return; --kvm->stat.nx_lpage_splits; + --kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages; list_del_init(&sp->possible_nx_huge_page_link); } @@ -832,7 +836,7 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { sp->nx_huge_page_disallowed = false; - untrack_possible_nx_huge_page(kvm, sp); + untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU); } static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, @@ -3285,12 +3289,72 @@ out: return level; } -static int __kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, int max_level, bool is_private) +static u8 kvm_max_level_for_order(int order) +{ + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); + + KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) + return PG_LEVEL_1G; + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static u8 kvm_gmem_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, + const struct kvm_memory_slot *slot, gfn_t gfn, + bool is_private) +{ + u8 max_level, coco_level; + kvm_pfn_t pfn; + + /* For faults, use the gmem information that was resolved earlier. */ + if (fault) { + pfn = fault->pfn; + max_level = fault->max_level; + } else { + /* TODO: Call into guest_memfd once hugepages are supported. */ + WARN_ONCE(1, "Get pfn+order from guest_memfd"); + pfn = KVM_PFN_ERR_FAULT; + max_level = PG_LEVEL_4K; + } + + if (max_level == PG_LEVEL_4K) + return max_level; + + /* + * CoCo may influence the max mapping level, e.g. due to RMP or S-EPT + * restrictions. A return of '0' means "no additional restrictions", to + * allow for using an optional "ret0" static call. + */ + coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn, is_private); + if (coco_level) + max_level = min(max_level, coco_level); + + return max_level; +} + +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, + const struct kvm_memory_slot *slot, gfn_t gfn) { struct kvm_lpage_info *linfo; - int host_level; + int host_level, max_level; + bool is_private; + + lockdep_assert_held(&kvm->mmu_lock); + + if (fault) { + max_level = fault->max_level; + is_private = fault->is_private; + } else { + max_level = PG_LEVEL_NUM; + is_private = kvm_mem_is_private(kvm, gfn); + } max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { @@ -3299,25 +3363,17 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, break; } - if (is_private) - return max_level; - if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - host_level = host_pfn_mapping_level(kvm, gfn, slot); + if (is_private || kvm_memslot_is_gmem_only(slot)) + host_level = kvm_gmem_max_mapping_level(kvm, fault, slot, gfn, + is_private); + else + host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } -int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn) -{ - bool is_private = kvm_slot_can_be_private(slot) && - kvm_mem_is_private(kvm, gfn); - - return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); -} - void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; @@ -3338,9 +3394,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->max_level, - fault->is_private); + fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault, + fault->slot, fault->gfn); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -3810,7 +3865,7 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, write_unlock(&kvm->mmu_lock); } } -EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_roots); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) { @@ -3837,7 +3892,7 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) kvm_mmu_free_roots(kvm, mmu, roots_to_free); } -EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_guest_mode_roots); static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, u8 level) @@ -4503,42 +4558,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) vcpu->stat.pf_fixed++; } -static inline u8 kvm_max_level_for_order(int order) -{ - BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); - - KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && - order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && - order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); - - if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) - return PG_LEVEL_1G; - - if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) - return PG_LEVEL_2M; - - return PG_LEVEL_4K; -} - -static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, - u8 max_level, int gmem_order) -{ - u8 req_max_level; - - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - max_level = min(kvm_max_level_for_order(gmem_order), max_level); - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn); - if (req_max_level) - max_level = min(max_level, req_max_level); - - return max_level; -} - static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int r) { @@ -4546,12 +4565,12 @@ static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, r == RET_PF_RETRY, fault->map_writable); } -static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) +static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { int max_order, r; - if (!kvm_slot_can_be_private(fault->slot)) { + if (!kvm_slot_has_gmem(fault->slot)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } @@ -4564,8 +4583,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, } fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); - fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, - fault->max_level, max_order); + fault->max_level = kvm_max_level_for_order(max_order); return RET_PF_CONTINUE; } @@ -4575,8 +4593,8 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, { unsigned int foll = fault->write ? FOLL_WRITE : 0; - if (fault->is_private) - return kvm_mmu_faultin_pfn_private(vcpu, fault); + if (fault->is_private || kvm_memslot_is_gmem_only(fault->slot)) + return kvm_mmu_faultin_pfn_gmem(vcpu, fault); foll |= FOLL_NOWAIT; fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, @@ -4649,10 +4667,16 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, /* * Retry the page fault if the gfn hit a memslot that is being deleted * or moved. This ensures any existing SPTEs for the old memslot will - * be zapped before KVM inserts a new MMIO SPTE for the gfn. + * be zapped before KVM inserts a new MMIO SPTE for the gfn. Punt the + * error to userspace if this is a prefault, as KVM's prefaulting ABI + * doesn't provide the same forward progress guarantees as KVM_RUN. */ - if (slot->flags & KVM_MEMSLOT_INVALID) + if (slot->flags & KVM_MEMSLOT_INVALID) { + if (fault->prefetch) + return -EAGAIN; + return RET_PF_RETRY; + } if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { /* @@ -4852,7 +4876,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, return r; } -EXPORT_SYMBOL_GPL(kvm_handle_page_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_page_fault); #ifdef CONFIG_X86_64 static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, @@ -4942,7 +4966,7 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level return -EIO; } } -EXPORT_SYMBOL_GPL(kvm_tdp_map_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_map_page); long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) @@ -5138,7 +5162,7 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) __clear_sp_write_flooding_count(sp); } } -EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_new_pgd); static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) @@ -5784,7 +5808,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, shadow_mmu_init_context(vcpu, context, cpu_role, root_role); kvm_mmu_new_pgd(vcpu, nested_cr3); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu); static union kvm_cpu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, @@ -5838,7 +5862,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, kvm_mmu_new_pgd(vcpu, new_eptp); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) @@ -5903,7 +5927,7 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu) else init_kvm_softmmu(vcpu, cpu_role); } -EXPORT_SYMBOL_GPL(kvm_init_mmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_mmu); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) { @@ -5939,7 +5963,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); kvm_init_mmu(vcpu); } -EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_reset_context); int kvm_mmu_load(struct kvm_vcpu *vcpu) { @@ -5973,7 +5997,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_load); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { @@ -6035,7 +6059,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } -EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_obsolete_roots); static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) @@ -6361,7 +6385,7 @@ emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } -EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_page_fault); void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) { @@ -6377,7 +6401,7 @@ void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) pr_cont(", spte[%d] = 0x%llx", level, sptes[level]); pr_cont("\n"); } -EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_print_sptes); static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, hpa_t root_hpa) @@ -6443,7 +6467,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa); } } -EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invalidate_addr); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { @@ -6460,7 +6484,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL); ++vcpu->stat.invlpg; } -EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) @@ -6513,7 +6537,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, else max_huge_page_level = PG_LEVEL_2M; } -EXPORT_SYMBOL_GPL(kvm_configure_mmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_configure_mmu); static void free_mmu_pages(struct kvm_mmu *mmu) { @@ -6737,11 +6761,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) int kvm_mmu_init_vm(struct kvm *kvm) { - int r; + int r, i; kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); + for (i = 0; i < KVM_NR_MMU_TYPES; ++i) + INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); if (tdp_mmu_enabled) { @@ -7165,7 +7190,7 @@ restart: * mapping if the indirect sp has level = 1. */ if (sp->role.direct && - sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) @@ -7179,7 +7204,7 @@ restart: return need_tlb_flush; } -EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) @@ -7582,19 +7607,64 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel return err; } -static void kvm_recover_nx_huge_pages(struct kvm *kvm) +static unsigned long nx_huge_pages_to_zap(struct kvm *kvm, + enum kvm_mmu_type mmu_type) +{ + unsigned long pages = READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages); + unsigned int ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + + return ratio ? DIV_ROUND_UP(pages, ratio) : 0; +} + +static bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm, + struct kvm_mmu_page *sp) { - unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; struct kvm_memory_slot *slot; - int rcu_idx; + + /* + * Skip the memslot lookup if dirty tracking can't possibly be enabled, + * as memslot lookups are relatively expensive. + * + * If a memslot update is in progress, reading an incorrect value of + * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming + * zero, KVM will do an unnecessary memslot lookup; if it is becoming + * nonzero, the page will be zapped unnecessarily. Either way, this + * only affects efficiency in racy situations, and not correctness. + */ + if (!atomic_read(&kvm->nr_memslots_dirty_logging)) + return false; + + slot = __gfn_to_memslot(kvm_memslots_for_spte_role(kvm, sp->role), sp->gfn); + if (WARN_ON_ONCE(!slot)) + return false; + + return kvm_slot_dirty_track_enabled(slot); +} + +static void kvm_recover_nx_huge_pages(struct kvm *kvm, + const enum kvm_mmu_type mmu_type) +{ +#ifdef CONFIG_X86_64 + const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU; + spinlock_t *tdp_mmu_pages_lock = &kvm->arch.tdp_mmu_pages_lock; +#else + const bool is_tdp_mmu = false; + spinlock_t *tdp_mmu_pages_lock = NULL; +#endif + unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type); + struct list_head *nx_huge_pages; struct kvm_mmu_page *sp; - unsigned int ratio; LIST_HEAD(invalid_list); bool flush = false; - ulong to_zap; + int rcu_idx; + + nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages; rcu_idx = srcu_read_lock(&kvm->srcu); - write_lock(&kvm->mmu_lock); + if (is_tdp_mmu) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); /* * Zapping TDP MMU shadow pages, including the remote TLB flush, must @@ -7603,11 +7673,15 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) */ rcu_read_lock(); - ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { - if (list_empty(&kvm->arch.possible_nx_huge_pages)) + if (is_tdp_mmu) + spin_lock(tdp_mmu_pages_lock); + + if (list_empty(nx_huge_pages)) { + if (is_tdp_mmu) + spin_unlock(tdp_mmu_pages_lock); break; + } /* * We use a separate list instead of just using active_mmu_pages @@ -7616,56 +7690,44 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) * the total number of shadow pages. And because the TDP MMU * doesn't use active_mmu_pages. */ - sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, + sp = list_first_entry(nx_huge_pages, struct kvm_mmu_page, possible_nx_huge_page_link); WARN_ON_ONCE(!sp->nx_huge_page_disallowed); WARN_ON_ONCE(!sp->role.direct); + unaccount_nx_huge_page(kvm, sp); + + if (is_tdp_mmu) + spin_unlock(tdp_mmu_pages_lock); + /* - * Unaccount and do not attempt to recover any NX Huge Pages - * that are being dirty tracked, as they would just be faulted - * back in as 4KiB pages. The NX Huge Pages in this slot will be - * recovered, along with all the other huge pages in the slot, - * when dirty logging is disabled. - * - * Since gfn_to_memslot() is relatively expensive, it helps to - * skip it if it the test cannot possibly return true. On the - * other hand, if any memslot has logging enabled, chances are - * good that all of them do, in which case unaccount_nx_huge_page() - * is much cheaper than zapping the page. - * - * If a memslot update is in progress, reading an incorrect value - * of kvm->nr_memslots_dirty_logging is not a problem: if it is - * becoming zero, gfn_to_memslot() will be done unnecessarily; if - * it is becoming nonzero, the page will be zapped unnecessarily. - * Either way, this only affects efficiency in racy situations, - * and not correctness. + * Do not attempt to recover any NX Huge Pages that are being + * dirty tracked, as they would just be faulted back in as 4KiB + * pages. The NX Huge Pages in this slot will be recovered, + * along with all the other huge pages in the slot, when dirty + * logging is disabled. */ - slot = NULL; - if (atomic_read(&kvm->nr_memslots_dirty_logging)) { - struct kvm_memslots *slots; + if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) { + if (is_tdp_mmu) + flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp); + else + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, sp->gfn); - WARN_ON_ONCE(!slot); } - if (slot && kvm_slot_dirty_track_enabled(slot)) - unaccount_nx_huge_page(kvm, sp); - else if (is_tdp_mmu_page(sp)) - flush |= kvm_tdp_mmu_zap_sp(kvm, sp); - else - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); rcu_read_unlock(); - cond_resched_rwlock_write(&kvm->mmu_lock); - flush = false; + if (is_tdp_mmu) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); + flush = false; rcu_read_lock(); } } @@ -7673,7 +7735,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) rcu_read_unlock(); - write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } @@ -7684,9 +7749,10 @@ static void kvm_nx_huge_page_recovery_worker_kill(void *data) static bool kvm_nx_huge_page_recovery_worker(void *data) { struct kvm *kvm = data; + long remaining_time; bool enabled; uint period; - long remaining_time; + int i; enabled = calc_nx_huge_pages_recovery_period(&period); if (!enabled) @@ -7701,7 +7767,8 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) } __set_current_state(TASK_RUNNING); - kvm_recover_nx_huge_pages(kvm); + for (i = 0; i < KVM_NR_MMU_TYPES; ++i) + kvm_recover_nx_huge_pages(kvm, i); kvm->arch.nx_huge_page_last = get_jiffies_64(); return true; } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 65f3c89d7c5d..ed5c01df21ba 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -411,12 +411,14 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return r; } -int kvm_mmu_max_mapping_level(struct kvm *kvm, +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); -void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); -void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + enum kvm_mmu_type mmu_type); +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + enum kvm_mmu_type mmu_type); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index f35a830ce469..764e3015d021 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -51,6 +51,9 @@ { PFERR_PRESENT_MASK, "P" }, \ { PFERR_WRITE_MASK, "W" }, \ { PFERR_USER_MASK, "U" }, \ + { PFERR_PK_MASK, "PK" }, \ + { PFERR_SS_MASK, "SS" }, \ + { PFERR_SGX_MASK, "SGX" }, \ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index df31039b5d63..37647afde7d3 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -22,7 +22,7 @@ bool __read_mostly enable_mmio_caching = true; static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); -EXPORT_SYMBOL_GPL(enable_mmio_caching); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mmio_caching); bool __read_mostly kvm_ad_enabled; @@ -470,13 +470,13 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) shadow_mmio_mask = mmio_mask; shadow_mmio_access_mask = access_mask; } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_mask); void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value) { kvm->arch.shadow_mmio_value = mmio_value; } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_value); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) { @@ -487,7 +487,7 @@ void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) shadow_me_value = me_value; shadow_me_mask = me_mask; } -EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { @@ -513,7 +513,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0); } -EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_ept_masks); void kvm_mmu_reset_all_pte_masks(void) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7f3d7229b2c1..c5734ca5c17d 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -355,7 +355,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) spin_lock(&kvm->arch.tdp_mmu_pages_lock); sp->nx_huge_page_disallowed = false; - untrack_possible_nx_huge_page(kvm, sp); + untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } @@ -925,23 +925,52 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_unlock(); } -bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm, + struct kvm_mmu_page *sp) { - u64 old_spte; + struct tdp_iter iter = { + .old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0, + .sptep = sp->ptep, + .level = sp->role.level + 1, + .gfn = sp->gfn, + .as_id = kvm_mmu_page_as_id(sp), + }; + + lockdep_assert_held_read(&kvm->mmu_lock); + + if (WARN_ON_ONCE(!is_tdp_mmu_page(sp))) + return false; /* - * This helper intentionally doesn't allow zapping a root shadow page, - * which doesn't have a parent page table and thus no associated entry. + * Root shadow pages don't have a parent page table and thus no + * associated entry, but they can never be possible NX huge pages. */ if (WARN_ON_ONCE(!sp->ptep)) return false; - old_spte = kvm_tdp_mmu_read_spte(sp->ptep); - if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) + /* + * Since mmu_lock is held in read mode, it's possible another task has + * already modified the SPTE. Zap the SPTE if and only if the SPTE + * points at the SP's page table, as checking shadow-present isn't + * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even + * another SP. Note, spte_to_child_pt() also checks that the SPTE is + * shadow-present, i.e. guards against zapping a frozen SPTE. + */ + if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level)) return false; - tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, - SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1); + /* + * If a different task modified the SPTE, then it should be impossible + * for the SPTE to still be used for the to-be-zapped SP. Non-leaf + * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when + * creating non-leaf SPTEs, and all other bits are immutable for non- + * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are + * zapping and replacement. + */ + if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) { + WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level)); + return false; + } return true; } @@ -1303,7 +1332,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) fault->req_level >= iter.level) { spin_lock(&kvm->arch.tdp_mmu_pages_lock); if (sp->nx_huge_page_disallowed) - track_possible_nx_huge_page(kvm, sp); + track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } } @@ -1813,7 +1842,7 @@ retry: if (iter.gfn < start || iter.gfn >= end) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn); if (max_mapping_level < iter.level) continue; @@ -1953,7 +1982,7 @@ bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) spte = sptes[leaf]; return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); } -EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped); /* * Returns the last level spte pointer of the shadow page walk for the given diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 52acf99d40a0..bd62977c9199 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -64,7 +64,8 @@ static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu, } bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); -bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); +bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm, + struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm, enum kvm_tdp_mmu_root_types root_types); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 75e9cfc689f8..40ac4cb44ed2 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -26,11 +26,18 @@ /* This is enough to filter the vast majority of currently defined events. */ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +/* Unadultered PMU capabilities of the host, i.e. of hardware. */ +static struct x86_pmu_capability __read_mostly kvm_host_pmu; + +/* KVM's PMU capabilities, i.e. the intersection of KVM and hardware support. */ struct x86_pmu_capability __read_mostly kvm_pmu_cap; -EXPORT_SYMBOL_GPL(kvm_pmu_cap); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_cap); -struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; -EXPORT_SYMBOL_GPL(kvm_pmu_eventsel); +struct kvm_pmu_emulated_event_selectors { + u64 INSTRUCTIONS_RETIRED; + u64 BRANCH_INSTRUCTIONS_RETIRED; +}; +static struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; /* Precise Distribution of Instructions Retired (PDIR) */ static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { @@ -96,6 +103,54 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } +void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) +{ + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; + + perf_get_x86_pmu_capability(&kvm_host_pmu); + + /* + * Hybrid PMUs don't play nice with virtualization without careful + * configuration by userspace, and KVM's APIs for reporting supported + * vPMU features do not account for hybrid PMUs. Disable vPMU support + * for hybrid PMUs until KVM gains a way to let userspace opt-in. + */ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + enable_pmu = false; + + if (enable_pmu) { + /* + * WARN if perf did NOT disable hardware PMU if the number of + * architecturally required GP counters aren't present, i.e. if + * there are a non-zero number of counters, but fewer than what + * is architecturally required. + */ + if (!kvm_host_pmu.num_counters_gp || + WARN_ON_ONCE(kvm_host_pmu.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_host_pmu.version) + enable_pmu = false; + } + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } + + memcpy(&kvm_pmu_cap, &kvm_host_pmu, sizeof(kvm_host_pmu)); + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_MAX_NR_FIXED_COUNTERS); + + kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); + kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); +} + static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -318,7 +373,7 @@ void pmc_write_counter(struct kvm_pmc *pmc, u64 val) pmc->counter &= pmc_bitmask(pmc); pmc_update_sample_period(pmc); } -EXPORT_SYMBOL_GPL(pmc_write_counter); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(pmc_write_counter); static int filter_cmp(const void *pa, const void *pb, u64 mask) { @@ -426,7 +481,7 @@ static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, return true; } -static bool check_pmu_event_filter(struct kvm_pmc *pmc) +static bool pmc_is_event_allowed(struct kvm_pmc *pmc) { struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; @@ -441,12 +496,6 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return is_fixed_event_allowed(filter, pmc->idx); } -static bool pmc_event_is_allowed(struct kvm_pmc *pmc) -{ - return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && - check_pmu_event_filter(pmc); -} - static int reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -457,7 +506,8 @@ static int reprogram_counter(struct kvm_pmc *pmc) emulate_overflow = pmc_pause_counter(pmc); - if (!pmc_event_is_allowed(pmc)) + if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || + !pmc_is_event_allowed(pmc)) return 0; if (emulate_overflow) @@ -492,6 +542,47 @@ static int reprogram_counter(struct kvm_pmc *pmc) eventsel & ARCH_PERFMON_EVENTSEL_INT); } +static bool pmc_is_event_match(struct kvm_pmc *pmc, u64 eventsel) +{ + /* + * Ignore checks for edge detect (all events currently emulated by KVM + * are always rising edges), pin control (unsupported by modern CPUs), + * and counter mask and its invert flag (KVM doesn't emulate multiple + * events in a single clock cycle). + * + * Note, the uppermost nibble of AMD's mask overlaps Intel's IN_TX (bit + * 32) and IN_TXCP (bit 33), as well as two reserved bits (bits 35:34). + * Checking the "in HLE/RTM transaction" flags is correct as the vCPU + * can't be in a transaction if KVM is emulating an instruction. + * + * Checking the reserved bits might be wrong if they are defined in the + * future, but so could ignoring them, so do the simple thing for now. + */ + return !((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB); +} + +void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc) +{ + bitmap_clear(pmu->pmc_counting_instructions, pmc->idx, 1); + bitmap_clear(pmu->pmc_counting_branches, pmc->idx, 1); + + /* + * Do NOT consult the PMU event filters, as the filters must be checked + * at the time of emulation to ensure KVM uses fresh information, e.g. + * omitting a PMC from a bitmap could result in a missed event if the + * filter is changed to allow counting the event. + */ + if (!pmc_is_locally_enabled(pmc)) + return; + + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED)) + bitmap_set(pmu->pmc_counting_instructions, pmc->idx, 1); + + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED)) + bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_recalc_pmc_emulation); + void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); @@ -527,6 +618,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) */ if (unlikely(pmu->need_cleanup)) kvm_pmu_cleanup(vcpu); + + kvm_for_each_pmc(pmu, pmc, bit, bitmap) + kvm_pmu_recalc_pmc_emulation(pmu, pmc); } int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) @@ -650,6 +744,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = pmu->global_ctrl; break; case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; break; @@ -711,6 +806,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) pmu->global_status &= ~data; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: + if (!msr_info->host_initiated) + pmu->global_status |= data & ~pmu->global_status_rsvd; + break; default: kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); return kvm_pmu_call(set_msr)(vcpu, msr_info); @@ -789,6 +888,10 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) */ if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); + + bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); + bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, + pmu->nr_arch_fixed_counters); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -813,7 +916,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmu->pmc_in_use, X86_PMC_IDX_MAX); kvm_for_each_pmc(pmu, pmc, i, bitmask) { - if (pmc->perf_event && !pmc_speculative_in_use(pmc)) + if (pmc->perf_event && !pmc_is_locally_enabled(pmc)) pmc_stop_counter(pmc); } @@ -860,44 +963,46 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) select_user; } -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) +static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, + const unsigned long *event_pmcs) { DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; - int i; + int i, idx; BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX); + if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX)) + return; + if (!kvm_pmu_has_perf_global_ctrl(pmu)) - bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); - else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx, + bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX); + else if (!bitmap_and(bitmap, event_pmcs, (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX)) return; + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_for_each_pmc(pmu, pmc, i, bitmap) { - /* - * Ignore checks for edge detect (all events currently emulated - * but KVM are always rising edges), pin control (unsupported - * by modern CPUs), and counter mask and its invert flag (KVM - * doesn't emulate multiple events in a single clock cycle). - * - * Note, the uppermost nibble of AMD's mask overlaps Intel's - * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved - * bits (bits 35:34). Checking the "in HLE/RTM transaction" - * flags is correct as the vCPU can't be in a transaction if - * KVM is emulating an instruction. Checking the reserved bits - * might be wrong if they are defined in the future, but so - * could ignoring them, so do the simple thing for now. - */ - if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) || - !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc)) + if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); +} + +void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) +{ + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_instruction_retired); + +void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu) +{ + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches); } -EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_branch_retired); static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) { diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ad89d0bd6005..5c3939e91f1d 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -13,7 +13,7 @@ #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) -/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ +/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) \ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) @@ -23,11 +23,6 @@ #define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED -struct kvm_pmu_emulated_event_selectors { - u64 INSTRUCTIONS_RETIRED; - u64 BRANCH_INSTRUCTIONS_RETIRED; -}; - struct kvm_pmu_ops { struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); @@ -165,7 +160,7 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) return NULL; } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -178,57 +173,15 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) } extern struct x86_pmu_capability kvm_pmu_cap; -extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel; -static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) -{ - bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; - int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; +void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); - /* - * Hybrid PMUs don't play nice with virtualization without careful - * configuration by userspace, and KVM's APIs for reporting supported - * vPMU features do not account for hybrid PMUs. Disable vPMU support - * for hybrid PMUs until KVM gains a way to let userspace opt-in. - */ - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - enable_pmu = false; - - if (enable_pmu) { - perf_get_x86_pmu_capability(&kvm_pmu_cap); - - /* - * WARN if perf did NOT disable hardware PMU if the number of - * architecturally required GP counters aren't present, i.e. if - * there are a non-zero number of counters, but fewer than what - * is architecturally required. - */ - if (!kvm_pmu_cap.num_counters_gp || - WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) - enable_pmu = false; - else if (is_intel && !kvm_pmu_cap.version) - enable_pmu = false; - } - - if (!enable_pmu) { - memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); - return; - } - - kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); - kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, - pmu_ops->MAX_NR_GP_COUNTERS); - kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, - KVM_MAX_NR_FIXED_COUNTERS); - - kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = - perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); - kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = - perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); -} +void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc); static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { + kvm_pmu_recalc_pmc_emulation(pmc_to_pmu(pmc), pmc); + set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } @@ -272,7 +225,8 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel); +void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); +void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index c53b92379e6e..743ab25ba787 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -25,6 +25,9 @@ #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) #define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11) +/* Intel-defined sub-features, CPUID level 0x00000007:1 (ECX) */ +#define KVM_X86_FEATURE_MSR_IMM KVM_X86_FEATURE(CPUID_7_1_ECX, 5) + /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) @@ -87,6 +90,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, + [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX}, }; /* @@ -128,6 +132,7 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO); KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO); + KVM_X86_TRANSLATE_FEATURE(MSR_IMM); default: return x86_feature; } diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 9864c057187d..f623c5986119 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -131,7 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) kvm_mmu_reset_context(vcpu); } -EXPORT_SYMBOL_GPL(kvm_smm_changed); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_smm_changed); void process_smi(struct kvm_vcpu *vcpu) { @@ -269,6 +269,10 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS); smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + kvm_msr_read(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, &smram->ssp)) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); } #endif @@ -529,7 +533,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, vcpu->arch.smbase = smstate->smbase; - if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) + if (__kvm_emulate_msr_write(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) return X86EMUL_UNHANDLEABLE; rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR); @@ -558,6 +562,10 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, kvm_x86_call(set_interrupt_shadow)(vcpu, 0); ctxt->interruptibility = (u8)smstate->int_shadow; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + kvm_msr_write(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, smstate->ssp)) + return X86EMUL_UNHANDLEABLE; + return X86EMUL_CONTINUE; } #endif @@ -620,7 +628,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) /* And finally go back to 32-bit mode. */ efer = 0; - kvm_set_msr(vcpu, MSR_EFER, efer); + __kvm_emulate_msr_write(vcpu, MSR_EFER, efer); } #endif diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index 551703fbe200..db3c88f16138 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -116,8 +116,8 @@ struct kvm_smram_state_64 { u32 smbase; u32 reserved4[5]; - /* ssp and svm_* fields below are not implemented by KVM */ u64 ssp; + /* svm_* fields below are not implemented by KVM */ u64 svm_guest_pat; u64 svm_host_efer; u64 svm_host_cr4; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a34c5c3b164e..f286b5706d7c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -64,6 +64,34 @@ static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u); +#define AVIC_AUTO_MODE -1 + +static int avic_param_set(const char *val, const struct kernel_param *kp) +{ + if (val && sysfs_streq(val, "auto")) { + *(int *)kp->arg = AVIC_AUTO_MODE; + return 0; + } + + return param_set_bint(val, kp); +} + +static const struct kernel_param_ops avic_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = avic_param_set, + .get = param_get_bool, +}; + +/* + * Enable / disable AVIC. In "auto" mode (default behavior), AVIC is enabled + * for Zen4+ CPUs with x2AVIC (and all other criteria for enablement are met). + */ +static int avic = AVIC_AUTO_MODE; +module_param_cb(avic, &avic_ops, &avic, 0444); +__MODULE_PARM_TYPE(avic, "bool"); + +module_param(enable_ipiv, bool, 0444); + static bool force_avic; module_param_unsafe(force_avic, bool, 0444); @@ -77,7 +105,58 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); -bool x2avic_enabled; +static bool x2avic_enabled; + + +static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, + bool intercept) +{ + static const u32 x2avic_passthrough_msrs[] = { + X2APIC_MSR(APIC_ID), + X2APIC_MSR(APIC_LVR), + X2APIC_MSR(APIC_TASKPRI), + X2APIC_MSR(APIC_ARBPRI), + X2APIC_MSR(APIC_PROCPRI), + X2APIC_MSR(APIC_EOI), + X2APIC_MSR(APIC_RRR), + X2APIC_MSR(APIC_LDR), + X2APIC_MSR(APIC_DFR), + X2APIC_MSR(APIC_SPIV), + X2APIC_MSR(APIC_ISR), + X2APIC_MSR(APIC_TMR), + X2APIC_MSR(APIC_IRR), + X2APIC_MSR(APIC_ESR), + X2APIC_MSR(APIC_ICR), + X2APIC_MSR(APIC_ICR2), + + /* + * Note! Always intercept LVTT, as TSC-deadline timer mode + * isn't virtualized by hardware, and the CPU will generate a + * #GP instead of a #VMEXIT. + */ + X2APIC_MSR(APIC_LVTTHMR), + X2APIC_MSR(APIC_LVTPC), + X2APIC_MSR(APIC_LVT0), + X2APIC_MSR(APIC_LVT1), + X2APIC_MSR(APIC_LVTERR), + X2APIC_MSR(APIC_TMICT), + X2APIC_MSR(APIC_TMCCT), + X2APIC_MSR(APIC_TDCR), + }; + int i; + + if (intercept == svm->x2avic_msrs_intercepted) + return; + + if (!x2avic_enabled) + return; + + for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) + svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], + MSR_TYPE_RW, intercept); + + svm->x2avic_msrs_intercepted = intercept; +} static void avic_activate_vmcb(struct vcpu_svm *svm) { @@ -99,7 +178,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) vmcb->control.int_ctl |= X2APIC_MODE_MASK; vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; /* Disabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, false); + avic_set_x2apic_msr_interception(svm, false); } else { /* * Flush the TLB, the guest may have inserted a non-APIC @@ -110,7 +189,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) /* For xAVIC and hybrid-xAVIC modes */ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; /* Enabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, true); + avic_set_x2apic_msr_interception(svm, true); } } @@ -130,7 +209,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm) return; /* Enabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, true); + avic_set_x2apic_msr_interception(svm, true); } /* Note: @@ -1090,23 +1169,27 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) avic_vcpu_load(vcpu, vcpu->cpu); } -/* - * Note: - * - The module param avic enable both xAPIC and x2APIC mode. - * - Hypervisor can support both xAVIC and x2AVIC in the same guest. - * - The mode can be switched at run-time. - */ -bool avic_hardware_setup(void) +static bool __init avic_want_avic_enabled(void) { - if (!npt_enabled) + /* + * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is + * supported (to avoid enabling partial support by default, and because + * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for + * family 0x19 and later (Zen5+), as the kernel's synthetic ZenX flags + * aren't inclusive of previous generations, i.e. the kernel will set + * at most one ZenX feature flag. + */ + if (avic == AVIC_AUTO_MODE) + avic = boot_cpu_has(X86_FEATURE_X2AVIC) && + (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4)); + + if (!avic || !npt_enabled) return false; /* AVIC is a prerequisite for x2AVIC. */ if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) { - if (boot_cpu_has(X86_FEATURE_X2AVIC)) { - pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); - pr_warn(FW_BUG "Try enable AVIC using force_avic option"); - } + if (boot_cpu_has(X86_FEATURE_X2AVIC)) + pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n"); return false; } @@ -1116,21 +1199,37 @@ bool avic_hardware_setup(void) return false; } - if (boot_cpu_has(X86_FEATURE_AVIC)) { - pr_info("AVIC enabled\n"); - } else if (force_avic) { - /* - * Some older systems does not advertise AVIC support. - * See Revision Guide for specific AMD processor for more detail. - */ - pr_warn("AVIC is not supported in CPUID but force enabled"); - pr_warn("Your system might crash and burn"); - } + /* + * Print a scary message if AVIC is force enabled to make it abundantly + * clear that ignoring CPUID could have repercussions. See Revision + * Guide for specific AMD processor for more details. + */ + if (!boot_cpu_has(X86_FEATURE_AVIC)) + pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n"); + + return true; +} + +/* + * Note: + * - The module param avic enable both xAPIC and x2APIC mode. + * - Hypervisor can support both xAVIC and x2AVIC in the same guest. + * - The mode can be switched at run-time. + */ +bool __init avic_hardware_setup(void) +{ + avic = avic_want_avic_enabled(); + if (!avic) + return false; + + pr_info("AVIC enabled\n"); /* AVIC is a prerequisite for x2AVIC. */ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); if (x2avic_enabled) pr_info("x2AVIC enabled\n"); + else + svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; /* * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b7fd2e869998..a6443feab252 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -636,6 +636,14 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DT); } + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CET)))) { + vmcb02->save.s_cet = vmcb12->save.s_cet; + vmcb02->save.isst_addr = vmcb12->save.isst_addr; + vmcb02->save.ssp = vmcb12->save.ssp; + vmcb_mark_dirty(vmcb02, VMCB_CET); + } + kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); svm_set_efer(vcpu, svm->nested.save.efer); @@ -1044,6 +1052,12 @@ void svm_copy_vmrun_state(struct vmcb_save_area *to_save, to_save->rsp = from_save->rsp; to_save->rip = from_save->rip; to_save->cpl = 0; + + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + to_save->s_cet = from_save->s_cet; + to_save->isst_addr = from_save->isst_addr; + to_save->ssp = from_save->ssp; + } } void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) @@ -1111,6 +1125,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->save.dr6 = svm->vcpu.arch.dr6; vmcb12->save.cpl = vmcb02->save.cpl; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + vmcb12->save.s_cet = vmcb02->save.s_cet; + vmcb12->save.isst_addr = vmcb02->save.isst_addr; + vmcb12->save.ssp = vmcb02->save.ssp; + } + vmcb12->control.int_state = vmcb02->control.int_state; vmcb12->control.exit_code = vmcb02->control.exit_code; vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; @@ -1798,17 +1818,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE) return -EINVAL; - ret = -ENOMEM; - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); - save = kzalloc(sizeof(*save), GFP_KERNEL); - if (!ctl || !save) - goto out_free; + ctl = memdup_user(&user_vmcb->control, sizeof(*ctl)); + if (IS_ERR(ctl)) + return PTR_ERR(ctl); - ret = -EFAULT; - if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl))) - goto out_free; - if (copy_from_user(save, &user_vmcb->save, sizeof(*save))) - goto out_free; + save = memdup_user(&user_vmcb->save, sizeof(*save)); + if (IS_ERR(save)) { + kfree(ctl); + return PTR_ERR(save); + } ret = -EINVAL; __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 288f7f2a46f2..bc062285fbf5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -41,7 +41,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); unsigned int idx; - if (!vcpu->kvm->arch.enable_pmu) + if (!pmu->version) return NULL; switch (msr) { @@ -113,6 +113,7 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: return pmu->version > 1; default: if (msr > MSR_F15H_PERF_CTR5 && @@ -199,17 +200,16 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); if (pmu->version > 1) { - pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_ctrl_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1); pmu->global_status_rsvd = pmu->global_ctrl_rsvd; } - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; + pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; - bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } static void amd_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0635bd71c10e..0835c664fbfd 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -37,7 +37,6 @@ #include "trace.h" #define GHCB_VERSION_MAX 2ULL -#define GHCB_VERSION_DEFAULT 2ULL #define GHCB_VERSION_MIN 1ULL #define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) @@ -59,6 +58,9 @@ static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); static u64 sev_supported_vmsa_features; +static unsigned int nr_ciphertext_hiding_asids; +module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444); + #define AP_RESET_HOLD_NONE 0 #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 @@ -85,6 +87,10 @@ static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); unsigned int max_sev_asid; static unsigned int min_sev_asid; +static unsigned int max_sev_es_asid; +static unsigned int min_sev_es_asid; +static unsigned int max_snp_asid; +static unsigned int min_snp_asid; static unsigned long sev_me_mask; static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; @@ -147,6 +153,14 @@ static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; } +static bool snp_is_secure_tsc_enabled(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + + return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) && + !WARN_ON_ONCE(!sev_snp_guest(kvm)); +} + /* Must be called with the sev_bitmap_lock held */ static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) { @@ -173,20 +187,34 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) misc_cg_uncharge(type, sev->misc_cg, 1); } -static int sev_asid_new(struct kvm_sev_info *sev) +static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type) { /* * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. - * Note: min ASID can end up larger than the max if basic SEV support is - * effectively disabled by disallowing use of ASIDs for SEV guests. */ - unsigned int min_asid = sev->es_active ? 1 : min_sev_asid; - unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; - unsigned int asid; + unsigned int min_asid, max_asid, asid; bool retry = true; int ret; + if (vm_type == KVM_X86_SNP_VM) { + min_asid = min_snp_asid; + max_asid = max_snp_asid; + } else if (sev->es_active) { + min_asid = min_sev_es_asid; + max_asid = max_sev_es_asid; + } else { + min_asid = min_sev_asid; + max_asid = max_sev_asid; + } + + /* + * The min ASID can end up larger than the max if basic SEV support is + * effectively disabled by disallowing use of ASIDs for SEV guests. + * Similarly for SEV-ES guests the min ASID can end up larger than the + * max when ciphertext hiding is enabled, effectively disabling SEV-ES + * support. + */ if (min_asid > max_asid) return -ENOTTY; @@ -406,6 +434,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_platform_init_args init_args = {0}; bool es_active = vm_type != KVM_X86_SEV_VM; + bool snp_active = vm_type == KVM_X86_SNP_VM; u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; int ret; @@ -415,12 +444,26 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (data->flags) return -EINVAL; + if (!snp_active) + valid_vmsa_features &= ~SVM_SEV_FEAT_SECURE_TSC; + if (data->vmsa_features & ~valid_vmsa_features) return -EINVAL; if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) return -EINVAL; + /* + * KVM supports the full range of mandatory features defined by version + * 2 of the GHCB protocol, so default to that for SEV-ES guests created + * via KVM_SEV_INIT2 (KVM_SEV_INIT forces version 1). + */ + if (es_active && !data->ghcb_version) + data->ghcb_version = 2; + + if (snp_active && data->ghcb_version < 2) + return -EINVAL; + if (unlikely(sev->active)) return -EINVAL; @@ -429,18 +472,10 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, sev->vmsa_features = data->vmsa_features; sev->ghcb_version = data->ghcb_version; - /* - * Currently KVM supports the full range of mandatory features defined - * by version 2 of the GHCB protocol, so default to that for SEV-ES - * guests created via KVM_SEV_INIT2. - */ - if (sev->es_active && !sev->ghcb_version) - sev->ghcb_version = GHCB_VERSION_DEFAULT; - - if (vm_type == KVM_X86_SNP_VM) + if (snp_active) sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; - ret = sev_asid_new(sev); + ret = sev_asid_new(sev, vm_type); if (ret) goto e_no_asid; @@ -455,7 +490,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, } /* This needs to happen after SEV/SNP firmware initialization. */ - if (vm_type == KVM_X86_SNP_VM) { + if (snp_active) { ret = snp_guest_req_init(kvm); if (ret) goto e_free; @@ -569,8 +604,6 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; - sev->policy = params.policy; - memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -618,6 +651,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_session; } + sev->policy = params.policy; sev->handle = start.handle; sev->fd = argp->sev_fd; @@ -1968,7 +2002,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) { dst_svm = to_svm(dst_vcpu); - sev_init_vmcb(dst_svm); + sev_init_vmcb(dst_svm, false); if (!dst->es_active) continue; @@ -2180,7 +2214,12 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO)) return -EINVAL; - sev->policy = params.policy; + if (snp_is_secure_tsc_enabled(kvm)) { + if (WARN_ON_ONCE(!kvm->arch.default_tsc_khz)) + return -EINVAL; + + start.desired_tsc_khz = kvm->arch.default_tsc_khz; + } sev->snp_context = snp_context_create(kvm, argp); if (!sev->snp_context) @@ -2188,6 +2227,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) start.gctx_paddr = __psp_pa(sev->snp_context); start.policy = params.policy; + memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); if (rc) { @@ -2196,6 +2236,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_context; } + sev->policy = params.policy; sev->fd = argp->sev_fd; rc = snp_bind_asid(kvm, &argp->error); if (rc) { @@ -2329,7 +2370,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, params.gfn_start, params.len, params.type, params.flags); - if (!PAGE_ALIGNED(params.len) || params.flags || + if (!params.len || !PAGE_ALIGNED(params.len) || params.flags || (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && @@ -2361,7 +2402,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) mutex_lock(&kvm->slots_lock); memslot = gfn_to_memslot(kvm, params.gfn_start); - if (!kvm_slot_can_be_private(memslot)) { + if (!kvm_slot_has_gmem(memslot)) { ret = -EINVAL; goto out; } @@ -3038,6 +3079,9 @@ void __init sev_hardware_setup(void) if (min_sev_asid == 1) goto out; + min_sev_es_asid = min_snp_asid = 1; + max_sev_es_asid = max_snp_asid = min_sev_asid - 1; + sev_es_asid_count = min_sev_asid - 1; WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; @@ -3046,10 +3090,32 @@ void __init sev_hardware_setup(void) out: if (sev_enabled) { init_args.probe = true; + + if (sev_is_snp_ciphertext_hiding_supported()) + init_args.max_snp_asid = min(nr_ciphertext_hiding_asids, + min_sev_asid - 1); + if (sev_platform_init(&init_args)) sev_supported = sev_es_supported = sev_snp_supported = false; else if (sev_snp_supported) sev_snp_supported = is_sev_snp_initialized(); + + if (sev_snp_supported) + nr_ciphertext_hiding_asids = init_args.max_snp_asid; + + /* + * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP + * ASID range is partitioned into separate SEV-ES and SEV-SNP + * ASID ranges, with the SEV-SNP range being [1..max_snp_asid] + * and the SEV-ES range being (max_snp_asid..max_sev_es_asid]. + * Note, SEV-ES may effectively be disabled if all ASIDs from + * the joint range are assigned to SEV-SNP. + */ + if (nr_ciphertext_hiding_asids) { + max_snp_asid = nr_ciphertext_hiding_asids; + min_sev_es_asid = max_snp_asid + 1; + pr_info("SEV-SNP ciphertext hiding enabled\n"); + } } if (boot_cpu_has(X86_FEATURE_SEV)) @@ -3060,12 +3126,14 @@ out: min_sev_asid, max_sev_asid); if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", - str_enabled_disabled(sev_es_supported), - min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + sev_es_supported ? min_sev_es_asid <= max_sev_es_asid ? "enabled" : + "unusable" : + "disabled", + min_sev_es_asid, max_sev_es_asid); if (boot_cpu_has(X86_FEATURE_SEV_SNP)) pr_info("SEV-SNP %s (ASIDs %u - %u)\n", str_enabled_disabled(sev_snp_supported), - min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + min_snp_asid, max_snp_asid); sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; @@ -3078,6 +3146,9 @@ out: sev_supported_vmsa_features = 0; if (sev_es_debug_swap_enabled) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; + + if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC)) + sev_supported_vmsa_features |= SVM_SEV_FEAT_SECURE_TSC; } void sev_hardware_unsetup(void) @@ -3193,7 +3264,7 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control) { return (((u64)control->exit_code_hi) << 32) | control->exit_code; } @@ -3219,7 +3290,7 @@ static void dump_ghcb(struct vcpu_svm *svm) */ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); + kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", @@ -3272,26 +3343,27 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap)); memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap)); - vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm); - svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb); + svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm); - if (kvm_ghcb_xcr0_is_valid(svm)) { - vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); - vcpu->arch.cpuid_dynamic_bits_dirty = true; - } + if (kvm_ghcb_xcr0_is_valid(svm)) + __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm)); + + if (kvm_ghcb_xss_is_valid(svm)) + __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); /* Copy the GHCB exit information into the VMCB fields */ - exit_code = ghcb_get_sw_exit_code(ghcb); + exit_code = kvm_ghcb_get_sw_exit_code(svm); control->exit_code = lower_32_bits(exit_code); control->exit_code_hi = upper_32_bits(exit_code); - control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb); - control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb); - svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb); + control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); + control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); + svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm); /* Clear the valid entries fields */ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); @@ -3308,7 +3380,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging. */ - exit_code = kvm_ghcb_get_sw_exit_code(control); + exit_code = kvm_get_cached_sw_exit_code(control); /* Only GHCB Usage code 0 is supported */ if (svm->sev_es.ghcb->ghcb_usage) { @@ -3880,7 +3952,7 @@ next_range: /* * Invoked as part of svm_vcpu_reset() processing of an init event. */ -void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) +static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_memory_slot *slot; @@ -3888,9 +3960,6 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) kvm_pfn_t pfn; gfn_t gfn; - if (!sev_snp_guest(vcpu->kvm)) - return; - guard(mutex)(&svm->sev_es.snp_vmsa_mutex); if (!svm->sev_es.snp_ap_waiting_for_reset) @@ -4316,7 +4385,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm_vmgexit_success(svm, 0); - exit_code = kvm_ghcb_get_sw_exit_code(control); + exit_code = kvm_get_cached_sw_exit_code(control); switch (exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); @@ -4448,6 +4517,9 @@ void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu) !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_GUEST_TSC_FREQ, MSR_TYPE_R, + !snp_is_secure_tsc_enabled(vcpu->kvm)); + /* * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if * the host/guest supports its use. @@ -4476,7 +4548,7 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } -static void sev_es_init_vmcb(struct vcpu_svm *svm) +static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event) { struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; @@ -4537,10 +4609,21 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); + + /* + * Set the GHCB MSR value as per the GHCB specification when emulating + * vCPU RESET for an SEV-ES guest. + */ + if (!init_event) + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, + GHCB_VERSION_MIN, + sev_enc_bit)); } -void sev_init_vmcb(struct vcpu_svm *svm) +void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) { + struct kvm_vcpu *vcpu = &svm->vcpu; + svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); @@ -4550,24 +4633,36 @@ void sev_init_vmcb(struct vcpu_svm *svm) */ clr_exception_intercept(svm, GP_VECTOR); - if (sev_es_guest(svm->vcpu.kvm)) - sev_es_init_vmcb(svm); + if (init_event && sev_snp_guest(vcpu->kvm)) + sev_snp_init_protected_guest_state(vcpu); + + if (sev_es_guest(vcpu->kvm)) + sev_es_init_vmcb(svm, init_event); } -void sev_es_vcpu_reset(struct vcpu_svm *svm) +int sev_vcpu_create(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); + struct vcpu_svm *svm = to_svm(vcpu); + struct page *vmsa_page; + + mutex_init(&svm->sev_es.snp_vmsa_mutex); + + if (!sev_es_guest(vcpu->kvm)) + return 0; /* - * Set the GHCB MSR value as per the GHCB specification when emulating - * vCPU RESET for an SEV-ES guest. + * SEV-ES guests require a separate (from the VMCB) VMSA page used to + * contain the encrypted register state of the guest. */ - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, - GHCB_VERSION_MIN, - sev_enc_bit)); + vmsa_page = snp_safe_alloc_page(); + if (!vmsa_page) + return -ENOMEM; - mutex_init(&svm->sev_es.snp_vmsa_mutex); + svm->sev_es.vmsa = page_address(vmsa_page); + + vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm); + + return 0; } void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) @@ -4618,6 +4713,16 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3); } + + /* + * TSC_AUX is always virtualized for SEV-ES guests when the feature is + * available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area. + * Set the save area to the current hardware value, i.e. the current + * user return value, so that the correct value is restored on #VMEXIT. + */ + if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) && + !WARN_ON_ONCE(tsc_aux_uret_slot < 0)) + hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot); } void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -4715,7 +4820,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) } slot = gfn_to_memslot(kvm, gfn); - if (!kvm_slot_can_be_private(slot)) { + if (!kvm_slot_has_gmem(slot)) { pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", gpa); return; @@ -4943,7 +5048,7 @@ next_pfn: } } -int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { int level, rc; bool assigned; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9931c6c4bc6..153c12dbf3eb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -158,14 +158,6 @@ module_param(lbrv, int, 0444); static int tsc_scaling = true; module_param(tsc_scaling, int, 0444); -/* - * enable / disable AVIC. Because the defaults differ for APICv - * support between VMX and SVM we cannot use module_param_named. - */ -static bool avic; -module_param(avic, bool, 0444); -module_param(enable_ipiv, bool, 0444); - module_param(enable_device_posted_irqs, bool, 0444); bool __read_mostly dump_invalid_vmcb; @@ -195,7 +187,7 @@ static DEFINE_MUTEX(vmcb_dump_mutex); * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to * defer the restoration of TSC_AUX until the CPU returns to userspace. */ -static int tsc_aux_uret_slot __read_mostly = -1; +int tsc_aux_uret_slot __ro_after_init = -1; static int get_npt_level(void) { @@ -577,18 +569,6 @@ static int svm_enable_virtualization_cpu(void) amd_pmu_enable_virt(); - /* - * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type - * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests. - * Since Linux does not change the value of TSC_AUX once set, prime the - * TSC_AUX field now to avoid a RDMSR on every vCPU run. - */ - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - u32 __maybe_unused msr_hi; - - rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); - } - return 0; } @@ -736,55 +716,6 @@ static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); } -void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) -{ - static const u32 x2avic_passthrough_msrs[] = { - X2APIC_MSR(APIC_ID), - X2APIC_MSR(APIC_LVR), - X2APIC_MSR(APIC_TASKPRI), - X2APIC_MSR(APIC_ARBPRI), - X2APIC_MSR(APIC_PROCPRI), - X2APIC_MSR(APIC_EOI), - X2APIC_MSR(APIC_RRR), - X2APIC_MSR(APIC_LDR), - X2APIC_MSR(APIC_DFR), - X2APIC_MSR(APIC_SPIV), - X2APIC_MSR(APIC_ISR), - X2APIC_MSR(APIC_TMR), - X2APIC_MSR(APIC_IRR), - X2APIC_MSR(APIC_ESR), - X2APIC_MSR(APIC_ICR), - X2APIC_MSR(APIC_ICR2), - - /* - * Note! Always intercept LVTT, as TSC-deadline timer mode - * isn't virtualized by hardware, and the CPU will generate a - * #GP instead of a #VMEXIT. - */ - X2APIC_MSR(APIC_LVTTHMR), - X2APIC_MSR(APIC_LVTPC), - X2APIC_MSR(APIC_LVT0), - X2APIC_MSR(APIC_LVT1), - X2APIC_MSR(APIC_LVTERR), - X2APIC_MSR(APIC_TMICT), - X2APIC_MSR(APIC_TMCCT), - X2APIC_MSR(APIC_TDCR), - }; - int i; - - if (intercept == svm->x2avic_msrs_intercepted) - return; - - if (!x2avic_enabled) - return; - - for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) - svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], - MSR_TYPE_RW, intercept); - - svm->x2avic_msrs_intercepted = intercept; -} - void svm_vcpu_free_msrpm(void *msrpm) { __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); @@ -844,6 +775,17 @@ static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); } + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + + svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled); + } + if (sev_es_guest(vcpu->kvm)) sev_es_recalc_msr_intercepts(vcpu); @@ -1077,13 +1019,13 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) } } -static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) +static void svm_recalc_intercepts(struct kvm_vcpu *vcpu) { svm_recalc_instruction_intercepts(vcpu); svm_recalc_msr_intercepts(vcpu); } -static void init_vmcb(struct kvm_vcpu *vcpu) +static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb01.ptr; @@ -1221,11 +1163,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_BUSLOCK); if (sev_guest(vcpu->kvm)) - sev_init_vmcb(svm); + sev_init_vmcb(svm, init_event); svm_hv_init_vmcb(vmcb); - svm_recalc_intercepts_after_set_cpuid(vcpu); + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); vmcb_mark_all_dirty(vmcb); @@ -1244,9 +1186,6 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm->nmi_masked = false; svm->awaiting_iret_completion = false; - - if (sev_es_guest(vcpu->kvm)) - sev_es_vcpu_reset(svm); } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -1256,10 +1195,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - if (init_event) - sev_snp_init_protected_guest_state(vcpu); - - init_vmcb(vcpu); + init_vmcb(vcpu, init_event); if (!init_event) __svm_vcpu_reset(vcpu); @@ -1275,7 +1211,6 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb01_page; - struct page *vmsa_page = NULL; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); @@ -1286,24 +1221,18 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) if (!vmcb01_page) goto out; - if (sev_es_guest(vcpu->kvm)) { - /* - * SEV-ES guests require a separate VMSA page used to contain - * the encrypted register state of the guest. - */ - vmsa_page = snp_safe_alloc_page(); - if (!vmsa_page) - goto error_free_vmcb_page; - } + err = sev_vcpu_create(vcpu); + if (err) + goto error_free_vmcb_page; err = avic_init_vcpu(svm); if (err) - goto error_free_vmsa_page; + goto error_free_sev; svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; - goto error_free_vmsa_page; + goto error_free_sev; } svm->x2avic_msrs_intercepted = true; @@ -1312,16 +1241,12 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); - if (vmsa_page) - svm->sev_es.vmsa = page_address(vmsa_page); - svm->guest_state_loaded = false; return 0; -error_free_vmsa_page: - if (vmsa_page) - __free_page(vmsa_page); +error_free_sev: + sev_free_vcpu(vcpu); error_free_vmcb_page: __free_page(vmcb01_page); out: @@ -1423,10 +1348,10 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); /* - * TSC_AUX is always virtualized for SEV-ES guests when the feature is - * available. The user return MSR support is not required in this case - * because TSC_AUX is restored on #VMEXIT from the host save area - * (which has been initialized in svm_enable_virtualization_cpu()). + * TSC_AUX is always virtualized (context switched by hardware) for + * SEV-ES guests when the feature is available. For non-SEV-ES guests, + * context switch TSC_AUX via the user_return MSR infrastructure (not + * all CPUs support TSC_AUX virtualization). */ if (likely(tsc_aux_uret_slot >= 0) && (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) @@ -2727,8 +2652,8 @@ static int svm_get_feature_msr(u32 msr, u64 *data) static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return sev_es_guest(vcpu->kvm) && - vcpu->arch.guest_state_protected && + return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected && + msr_info->index != MSR_IA32_XSS && !msr_write_intercepted(vcpu, msr_info->index); } @@ -2784,6 +2709,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; + case MSR_IA32_S_CET: + msr_info->data = svm->vmcb->save.s_cet; + break; + case MSR_IA32_INT_SSP_TAB: + msr_info->data = svm->vmcb->save.isst_addr; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + msr_info->data = svm->vmcb->save.ssp; + break; case MSR_TSC_AUX: msr_info->data = svm->tsc_aux; break; @@ -3016,13 +2950,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb01.ptr->save.sysenter_esp = (u32)data; svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; break; + case MSR_IA32_S_CET: + svm->vmcb->save.s_cet = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; + case MSR_IA32_INT_SSP_TAB: + svm->vmcb->save.isst_addr = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + svm->vmcb->save.ssp = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; case MSR_TSC_AUX: /* * TSC_AUX is always virtualized for SEV-ES guests when the * feature is available. The user return MSR support is not * required in this case because TSC_AUX is restored on #VMEXIT - * from the host save area (which has been initialized in - * svm_enable_virtualization_cpu()). + * from the host save area. */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) break; @@ -3407,6 +3352,10 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "rsp:", save->rsp, "rax:", save->rax); pr_err("%-15s %016llx %-13s %016llx\n", + "s_cet:", save->s_cet, "ssp:", save->ssp); + pr_err("%-15s %016llx\n", + "isst_addr:", save->isst_addr); + pr_err("%-15s %016llx %-13s %016llx\n", "star:", save01->star, "lstar:", save01->lstar); pr_err("%-15s %016llx %-13s %016llx\n", "cstar:", save01->cstar, "sfmask:", save01->sfmask); @@ -3431,6 +3380,13 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "sev_features", vmsa->sev_features); pr_err("%-15s %016llx %-13s %016llx\n", + "pl0_ssp:", vmsa->pl0_ssp, "pl1_ssp:", vmsa->pl1_ssp); + pr_err("%-15s %016llx %-13s %016llx\n", + "pl2_ssp:", vmsa->pl2_ssp, "pl3_ssp:", vmsa->pl3_ssp); + pr_err("%-15s %016llx\n", + "u_cet:", vmsa->u_cet); + + pr_err("%-15s %016llx %-13s %016llx\n", "rax:", vmsa->rax, "rbx:", vmsa->rbx); pr_err("%-15s %016llx %-13s %016llx\n", "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); @@ -4046,8 +4002,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (nested_svm_virtualize_tpr(vcpu) || - kvm_vcpu_apicv_active(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; cr8 = kvm_get_cr8(vcpu); @@ -4181,17 +4136,27 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + /* + * Next RIP must be provided as IRQs are disabled, and accessing guest + * memory to decode the instruction might fault, i.e. might sleep. + */ + if (!nrips || !control->next_rip) + return EXIT_FASTPATH_NONE; if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; - switch (svm->vmcb->control.exit_code) { + switch (control->exit_code) { case SVM_EXIT_MSR: - if (!svm->vmcb->control.exit_info_1) + if (!control->exit_info_1) break; - return handle_fastpath_set_msr_irqoff(vcpu); + return handle_fastpath_wrmsr(vcpu); case SVM_EXIT_HLT: return handle_fastpath_hlt(vcpu); + case SVM_EXIT_INVD: + return handle_fastpath_invd(vcpu); default: break; } @@ -4468,8 +4433,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); - - svm_recalc_intercepts_after_set_cpuid(vcpu); } static bool svm_has_wbinvd_exit(void) @@ -5042,7 +5005,7 @@ static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) return page_address(page); } -static struct kvm_x86_ops svm_x86_ops __initdata = { +struct kvm_x86_ops svm_x86_ops __initdata = { .name = KBUILD_MODNAME, .check_processor_compatibility = svm_check_processor_compat, @@ -5171,7 +5134,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .apic_init_signal_blocked = svm_apic_init_signal_blocked, - .recalc_msr_intercepts = svm_recalc_msr_intercepts, + .recalc_intercepts = svm_recalc_intercepts, .complete_emulated_msr = svm_complete_emulated_msr, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, @@ -5180,7 +5143,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .gmem_prepare = sev_gmem_prepare, .gmem_invalidate = sev_gmem_invalidate, - .private_max_mapping_level = sev_private_max_mapping_level, + .gmem_max_mapping_level = sev_gmem_max_mapping_level, }; /* @@ -5229,7 +5192,8 @@ static __init void svm_set_cpu_caps(void) kvm_set_cpu_caps(); kvm_caps.supported_perf_cap = 0; - kvm_caps.supported_xss = 0; + + kvm_cpu_cap_clear(X86_FEATURE_IBT); /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { @@ -5301,8 +5265,12 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); - /* Don't advertise Bus Lock Detect to guest if SVM support is absent */ + /* + * Clear capabilities that are automatically configured by common code, + * but that require explicit SVM support (that isn't yet implemented). + */ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); + kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); } static __init int svm_hardware_setup(void) @@ -5375,6 +5343,21 @@ static __init int svm_hardware_setup(void) get_npt_level(), PG_LEVEL_1G); pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5409,15 +5392,12 @@ static __init int svm_hardware_setup(void) goto err; } - enable_apicv = avic = avic && avic_hardware_setup(); - + enable_apicv = avic_hardware_setup(); if (!enable_apicv) { enable_ipiv = false; svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; - } else if (!x2avic_enabled) { - svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; } if (vls) { @@ -5454,21 +5434,6 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); - /* - * It seems that on AMD processors PTE's accessed bit is - * being set by the CPU hardware before the NPF vmexit. - * This is not expected behaviour and our tests fail because - * of it. - * A workaround here is to disable support for - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. - * In this case userspace can know if there is support using - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle - * it - * If future AMD CPU models change the behaviour described above, - * this variable can be changed accordingly - */ - allow_smaller_maxphyaddr = !npt_enabled; - kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 58b9d168e0c8..e4b04f435b3d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -48,10 +48,13 @@ extern bool npt_enabled; extern int nrips; extern int vgif; extern bool intercept_smi; -extern bool x2avic_enabled; extern bool vnmi; extern int lbrv; +extern int tsc_aux_uret_slot __ro_after_init; + +extern struct kvm_x86_ops svm_x86_ops __initdata; + /* * Clean bits in VMCB. * VMCB_ALL_CLEAN_MASK might also need to @@ -74,6 +77,7 @@ enum { * AVIC PHYSICAL_TABLE pointer, * AVIC LOGICAL_TABLE pointer */ + VMCB_CET, /* S_CET, SSP, ISST_ADDR */ VMCB_SW = 31, /* Reserved for hypervisor/software use */ }; @@ -82,7 +86,7 @@ enum { (1U << VMCB_ASID) | (1U << VMCB_INTR) | \ (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \ (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \ - (1U << VMCB_LBR) | (1U << VMCB_AVIC) | \ + (1U << VMCB_LBR) | (1U << VMCB_AVIC) | (1U << VMCB_CET) | \ (1U << VMCB_SW)) /* TPR and CR2 are always written before VMRUN */ @@ -699,7 +703,6 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); -void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vec); @@ -801,7 +804,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \ ) -bool avic_hardware_setup(void); +bool __init avic_hardware_setup(void); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -826,10 +829,9 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ int pre_sev_run(struct vcpu_svm *svm, int cpu); -void sev_init_vmcb(struct vcpu_svm *svm); +void sev_init_vmcb(struct vcpu_svm *svm, bool init_event); void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); -void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa); @@ -854,6 +856,7 @@ static inline struct page *snp_safe_alloc_page(void) return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); } +int sev_vcpu_create(struct kvm_vcpu *vcpu); void sev_free_vcpu(struct kvm_vcpu *vcpu); void sev_vm_destroy(struct kvm *kvm); void __init sev_set_cpu_caps(void); @@ -863,10 +866,9 @@ int sev_cpu_init(struct svm_cpu_data *sd); int sev_dev_get_attr(u32 group, u64 attr, u64 *val); extern unsigned int max_sev_asid; void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); -void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); -int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private); struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); #else @@ -880,6 +882,7 @@ static inline struct page *snp_safe_alloc_page(void) return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); } +static inline int sev_vcpu_create(struct kvm_vcpu *vcpu) { return 0; } static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {} static inline void sev_vm_destroy(struct kvm *kvm) {} static inline void __init sev_set_cpu_caps(void) {} @@ -889,13 +892,12 @@ static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } #define max_sev_asid 0 static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} -static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {} static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) { return 0; } static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {} -static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +static inline int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { return 0; } @@ -914,16 +916,21 @@ void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted, void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); #define DEFINE_KVM_GHCB_ACCESSORS(field) \ - static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \ - { \ - return test_bit(GHCB_BITMAP_IDX(field), \ - (unsigned long *)&svm->sev_es.valid_bitmap); \ - } \ - \ - static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \ - { \ - return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \ - } \ +static __always_inline u64 kvm_ghcb_get_##field(struct vcpu_svm *svm) \ +{ \ + return READ_ONCE(svm->sev_es.ghcb->save.field); \ +} \ + \ +static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \ +{ \ + return test_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&svm->sev_es.valid_bitmap); \ +} \ + \ +static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm) \ +{ \ + return kvm_ghcb_##field##_is_valid(svm) ? kvm_ghcb_get_##field(svm) : 0; \ +} DEFINE_KVM_GHCB_ACCESSORS(cpl) DEFINE_KVM_GHCB_ACCESSORS(rax) @@ -936,5 +943,6 @@ DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1) DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2) DEFINE_KVM_GHCB_ACCESSORS(sw_scratch) DEFINE_KVM_GHCB_ACCESSORS(xcr0) +DEFINE_KVM_GHCB_ACCESSORS(xss) #endif diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 3971b3ea5d04..a8e78c0e5956 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -15,7 +15,7 @@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" -int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) +static int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_vmcb_enlightenments *hve; hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); @@ -35,3 +35,29 @@ int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) return 0; } +__init void svm_hv_hardware_setup(void) +{ + if (npt_enabled && + ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { + pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); + svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; + svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; + } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { + int cpu; + + pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n"); + for_each_online_cpu(cpu) { + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->nested_control.features.directhypercall = 1; + } + svm_x86_ops.enable_l2_tlb_flush = + svm_hv_enable_l2_tlb_flush; + } +} diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index f85bc617ffe4..08f14e6f195c 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -13,9 +13,7 @@ #include "kvm_onhyperv.h" #include "svm/hyperv.h" -static struct kvm_x86_ops svm_x86_ops; - -int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu); +__init void svm_hv_hardware_setup(void); static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) { @@ -40,33 +38,6 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) hve->hv_enlightenments_control.msr_bitmap = 1; } -static inline __init void svm_hv_hardware_setup(void) -{ - if (npt_enabled && - ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { - pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); - svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; - svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; - } - - if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { - int cpu; - - pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n"); - for_each_online_cpu(cpu) { - struct hv_vp_assist_page *vp_ap = - hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->nested_control.features.directhypercall = 1; - } - svm_x86_ops.enable_l2_tlb_flush = - svm_hv_enable_l2_tlb_flush; - } -} - static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 57d79fd31df0..e79bc9cb7162 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -461,8 +461,9 @@ TRACE_EVENT(kvm_inj_virq, #define kvm_trace_sym_exc \ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ - EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(AC), EXS(MC) + EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), EXS(MF), \ + EXS(AC), EXS(MC), EXS(XM), EXS(VE), EXS(CP), \ + EXS(HV), EXS(VC), EXS(SX) /* * Tracepoint for kvm interrupt injection: diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 5316c27f6099..02aadb9d730e 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -20,9 +20,6 @@ extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 -#define PMU_CAP_FW_WRITES (1ULL << 13) -#define PMU_CAP_LBR_FMT 0x3f - struct nested_vmx_msrs { /* * We only store the "true" versions of the VMX capability MSRs. We @@ -76,6 +73,11 @@ static inline bool cpu_has_vmx_basic_inout(void) return vmcs_config.basic & VMX_BASIC_INOUT; } +static inline bool cpu_has_vmx_basic_no_hw_errcode_cc(void) +{ + return vmcs_config.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC; +} + static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS && @@ -103,6 +105,10 @@ static inline bool cpu_has_load_perf_global_ctrl(void) return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; } +static inline bool cpu_has_load_cet_ctrl(void) +{ + return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_CET_STATE); +} static inline bool cpu_has_vmx_mpx(void) { return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index dbab1c15b0cd..0eb2773b2ae2 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -188,18 +188,18 @@ static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return vmx_get_msr(vcpu, msr_info); } -static void vt_recalc_msr_intercepts(struct kvm_vcpu *vcpu) +static void vt_recalc_intercepts(struct kvm_vcpu *vcpu) { /* - * TDX doesn't allow VMM to configure interception of MSR accesses. - * TDX guest requests MSR accesses by calling TDVMCALL. The MSR - * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR - * if the userspace has set any. + * TDX doesn't allow VMM to configure interception of instructions or + * MSR accesses. TDX guest requests MSR accesses by calling TDVMCALL. + * The MSR filters will be applied when handling the TDVMCALL for + * RDMSR/WRMSR if the userspace has set any. */ if (is_td_vcpu(vcpu)) return; - vmx_recalc_msr_intercepts(vcpu); + vmx_recalc_intercepts(vcpu); } static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) @@ -831,10 +831,11 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return tdx_vcpu_ioctl(vcpu, argp); } -static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, + bool is_private) { if (is_td(kvm)) - return tdx_gmem_private_max_mapping_level(kvm, pfn); + return tdx_gmem_max_mapping_level(kvm, pfn, is_private); return 0; } @@ -995,7 +996,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .apic_init_signal_blocked = vt_op(apic_init_signal_blocked), .migrate_timers = vmx_migrate_timers, - .recalc_msr_intercepts = vt_op(recalc_msr_intercepts), + .recalc_intercepts = vt_op(recalc_intercepts), .complete_emulated_msr = vt_op(complete_emulated_msr), .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, @@ -1005,7 +1006,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), - .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level) + .gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level) }; struct kvm_x86_init_ops vt_init_ops __initdata = { diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b8ea1969113d..76271962cb70 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -721,6 +721,24 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_MPERF, MSR_TYPE_R); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_U_CET, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_S_CET, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL0_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL1_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL2_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL3_SSP, MSR_TYPE_RW); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; @@ -997,7 +1015,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); goto fail; } - if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { + if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); @@ -1033,7 +1051,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, } } - if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { + if (kvm_emulate_msr_read(vcpu, msr_index, data)) { pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, msr_index); return false; @@ -1272,9 +1290,10 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) { const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | VMX_BASIC_INOUT | - VMX_BASIC_TRUE_CTLS; + VMX_BASIC_TRUE_CTLS | + VMX_BASIC_NO_HW_ERROR_CODE_CC; - const u64 reserved_bits = GENMASK_ULL(63, 56) | + const u64 reserved_bits = GENMASK_ULL(63, 57) | GENMASK_ULL(47, 45) | BIT_ULL(31); @@ -2520,6 +2539,32 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 } } +static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, + u64 *ssp, u64 *ssp_tbl) +{ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || + guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + *s_cet = vmcs_readl(GUEST_S_CET); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + *ssp = vmcs_readl(GUEST_SSP); + *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); + } +} + +static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, + u64 ssp, u64 ssp_tbl) +{ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || + guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + vmcs_writel(GUEST_S_CET, s_cet); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + vmcs_writel(GUEST_SSP, ssp); + vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); + } +} + static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); @@ -2636,6 +2681,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) + vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet, + vmcs12->guest_ssp, vmcs12->guest_ssp_tbl); + set_cr4_guest_host_mask(vmx); } @@ -2675,6 +2724,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_set_dr(vcpu, 7, vcpu->arch.dr7); vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); } + + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) + vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, + vmx->nested.pre_vmenter_ssp, + vmx->nested.pre_vmenter_ssp_tbl); + if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); @@ -2770,8 +2826,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && - WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->guest_ia32_perf_global_ctrl))) { + WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -2949,7 +3005,6 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, u8 vector = intr_info & INTR_INFO_VECTOR_MASK; u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; - bool should_have_error_code; bool urg = nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST); bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; @@ -2966,12 +3021,19 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) return -EINVAL; - /* VM-entry interruption-info field: deliver error code */ - should_have_error_code = - intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && - x86_exception_has_error_code(vector); - if (CC(has_error_code != should_have_error_code)) - return -EINVAL; + /* + * Cannot deliver error code in real mode or if the interrupt + * type is not hardware exception. For other cases, do the + * consistency check only if the vCPU doesn't enumerate + * VMX_BASIC_NO_HW_ERROR_CODE_CC. + */ + if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { + if (CC(has_error_code)) + return -EINVAL; + } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { + if (CC(has_error_code != x86_exception_has_error_code(vector))) + return -EINVAL; + } /* VM-entry exception error code */ if (CC(has_error_code && @@ -3038,6 +3100,16 @@ static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) return !__is_canonical_address(la, l1_address_bits_on_exit); } +static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, + u64 ssp, u64 ssp_tbl) +{ + if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || + CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -3048,6 +3120,9 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; + if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) + return -EINVAL; + if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; @@ -3104,6 +3179,27 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, return -EINVAL; } + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { + if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, + vmcs12->host_ssp, + vmcs12->host_ssp_tbl)) + return -EINVAL; + + /* + * IA32_S_CET and SSP must be canonical if the host will + * enter 64-bit mode after VM-exit; otherwise, higher + * 32-bits must be all 0s. + */ + if (ia32e) { + if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) || + CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu))) + return -EINVAL; + } else { + if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32)) + return -EINVAL; + } + } + return 0; } @@ -3162,6 +3258,9 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; + if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) + return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) @@ -3211,6 +3310,23 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) return -EINVAL; + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) { + if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet, + vmcs12->guest_ssp, + vmcs12->guest_ssp_tbl)) + return -EINVAL; + + /* + * Guest SSP must have 63:N bits identical, rather than + * be canonical (i.e., 63:N-1 bits identical), where N is + * the CPU's maximum linear-address width. Similar to + * is_noncanonical_msr_address(), use the host's + * linear-address width. + */ + if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) + return -EINVAL; + } + if (nested_check_guest_non_reg_state(vmcs12)) return -EINVAL; @@ -3544,6 +3660,12 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) + vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, + &vmx->nested.pre_vmenter_ssp, + &vmx->nested.pre_vmenter_ssp_tbl); + /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* * nested early checks are disabled. In the event of a "late" VM-Fail, @@ -3690,7 +3812,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); + kvm_pmu_branch_retired(vcpu); if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu); @@ -4627,6 +4749,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; + + vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, + &vmcs12->guest_ssp, + &vmcs12->guest_ssp_tbl); } /* @@ -4752,14 +4878,26 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) vmcs_write64(GUEST_BNDCFGS, 0); + /* + * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set. + * otherwise CET state should be retained across VM-exit, i.e., + * guest values should be propagated from vmcs12 to vmcs01. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) + vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, + vmcs12->host_ssp_tbl); + else + vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, + vmcs12->guest_ssp_tbl); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) - WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl)); + WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl)); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ @@ -4937,7 +5075,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) goto vmabort; } - if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { + if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); @@ -6216,19 +6354,26 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, union vmx_exit_reason exit_reason) { - u32 msr_index = kvm_rcx_read(vcpu); + u32 msr_index; gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return true; + if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || + exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) + msr_index = vmx_get_exit_qual(vcpu); + else + msr_index = kvm_rcx_read(vcpu); + /* * The MSR_BITMAP page is divided into four 1024-byte bitmaps, * for the four combinations of read/write and low/high MSR numbers. * First we need to figure out which of the four to use: */ bitmap = vmcs12->msr_bitmap; - if (exit_reason.basic == EXIT_REASON_MSR_WRITE) + if (exit_reason.basic == EXIT_REASON_MSR_WRITE || + exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) bitmap += 2048; if (msr_index >= 0xc0000000) { msr_index -= 0xc0000000; @@ -6527,6 +6672,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: + case EXIT_REASON_MSR_READ_IMM: + case EXIT_REASON_MSR_WRITE_IMM: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); case EXIT_REASON_INVALID_STATE: return true; @@ -6561,14 +6708,17 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: return true; - case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + case EXIT_REASON_XSAVES: + case EXIT_REASON_XRSTORS: /* - * This should never happen, since it is not possible to - * set XSS to a non-zero value---neither in L1 nor in L2. - * If if it were, XSS would have to be checked against - * the XSS exit bitmap in vmcs12. + * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize + * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap + * verbatim, i.e. any exit is due to L1's bitmap. WARN if + * XSAVES isn't enabled, as the CPU is supposed to inject #UD + * in that case, before consulting the XSS-bitmap. */ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); + WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); + return true; case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, @@ -7029,13 +7179,17 @@ static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | - VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; + /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; } @@ -7051,11 +7205,16 @@ static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | + VM_ENTRY_LOAD_CET_STATE; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; + /* We support free control of debug control loading. */ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; } @@ -7205,6 +7364,8 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; + if (cpu_has_vmx_basic_no_hw_errcode_cc()) + msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; } static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 6eedcfc91070..983484d42ebf 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -309,6 +309,11 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) __kvm_is_valid_cr4(vcpu, val); } +static inline bool nested_cpu_has_no_hw_errcode_cc(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC; +} + /* No difference in the restrictions on guest and host CR4 in VMX operation. */ #define nested_guest_cr4_valid nested_cr4_valid #define nested_host_cr4_valid nested_cr4_valid diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 0b173602821b..de1d9785c01f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -138,7 +138,7 @@ static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) { - return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0; + return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; } static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) @@ -478,8 +478,8 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) }; u64 eventsel; - BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS); - BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS); + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUNTERS); + BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUNTERS); /* * Yell if perf reports support for a fixed counter but perf doesn't @@ -536,29 +536,44 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); eax.split.bit_width = min_t(int, eax.split.bit_width, kvm_pmu_cap.bit_width_gp); - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(eax.split.bit_width) - 1; eax.split.mask_length = min_t(int, eax.split.mask_length, kvm_pmu_cap.events_mask_len); - pmu->available_event_types = ~entry->ebx & - ((1ull << eax.split.mask_length) - 1); - - if (pmu->version == 1) { - pmu->nr_arch_fixed_counters = 0; - } else { - pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, - kvm_pmu_cap.num_counters_fixed); - edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, - kvm_pmu_cap.bit_width_fixed); - pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << edx.split.bit_width_fixed) - 1; + pmu->available_event_types = ~entry->ebx & (BIT_ULL(eax.split.mask_length) - 1); + + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { + pmu->reserved_bits ^= HSW_IN_TX; + pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); } + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (intel_pmu_lbr_is_compatible(vcpu) && + (perf_capabilities & PERF_CAP_LBR_FMT)) + memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); + else + lbr_desc->records.nr = 0; + + if (lbr_desc->records.nr) + bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + if (pmu->version == 1) + return; + + pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, + kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + kvm_pmu_cap.bit_width_fixed); + pmu->counter_bitmask[KVM_PMC_FIXED] = BIT_ULL(edx.split.bit_width_fixed) - 1; + intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | INTEL_FIXED_0_ENABLE_PMI); - counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); + counter_rsvd = ~((BIT_ULL(pmu->nr_arch_gp_counters) - 1) | + ((BIT_ULL(pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); pmu->global_ctrl_rsvd = counter_rsvd; /* @@ -573,29 +588,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_status_rsvd &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; - entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); - if (entry && - (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && - (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { - pmu->reserved_bits ^= HSW_IN_TX; - pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); - } - - bitmap_set(pmu->all_valid_pmc_idx, - 0, pmu->nr_arch_gp_counters); - bitmap_set(pmu->all_valid_pmc_idx, - INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - - perf_capabilities = vcpu_get_perf_capabilities(vcpu); - if (intel_pmu_lbr_is_compatible(vcpu) && - (perf_capabilities & PMU_CAP_LBR_FMT)) - memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); - else - lbr_desc->records.nr = 0; - - if (lbr_desc->records.nr) - bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); - if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { pmu->pebs_enable_rsvd = counter_rsvd; @@ -603,8 +595,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_data_cfg_rsvd = ~0xff00000full; intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE); } else { - pmu->pebs_enable_rsvd = - ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->pebs_enable_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1); } } } @@ -625,7 +616,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->gp_counters[i].current_config = 0; } - for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) { + for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUNTERS; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; @@ -762,7 +753,7 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) int bit, hw_idx; kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) { - if (!pmc_speculative_in_use(pmc) || + if (!pmc_is_locally_enabled(pmc) || !pmc_is_globally_enabled(pmc) || !pmc->perf_event) continue; diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 66744f5768c8..0a49c863c811 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -281,25 +281,6 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) vcpu->cpu = -1; } -static void tdx_clear_page(struct page *page) -{ - const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0)); - void *dest = page_to_virt(page); - unsigned long i; - - /* - * The page could have been poisoned. MOVDIR64B also clears - * the poison bit so the kernel can safely use the page again. - */ - for (i = 0; i < PAGE_SIZE; i += 64) - movdir64b(dest + i, zero_page); - /* - * MOVDIR64B store uses WC buffer. Prevent following memory reads - * from seeing potentially poisoned cache. - */ - __mb(); -} - static void tdx_no_vcpus_enter_start(struct kvm *kvm) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); @@ -345,7 +326,7 @@ static int tdx_reclaim_page(struct page *page) r = __tdx_reclaim_page(page); if (!r) - tdx_clear_page(page); + tdx_quirk_reset_page(page); return r; } @@ -442,6 +423,16 @@ void tdx_disable_virtualization_cpu(void) tdx_flush_vp(&arg); } local_irq_restore(flags); + + /* + * Flush cache now if kexec is possible: this is necessary to avoid + * having dirty private memory cachelines when the new kernel boots, + * but WBINVD is a relatively expensive operation and doing it during + * kexec can exacerbate races in native_stop_other_cpus(). Do it + * now, since this is a safe moment and there is going to be no more + * TDX activity on this CPU from this point on. + */ + tdx_cpu_flush_cache_for_kexec(); } #define TDX_SEAMCALL_RETRIES 10000 @@ -593,7 +584,7 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm) pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); return; } - tdx_clear_page(kvm_tdx->td.tdr_page); + tdx_quirk_reset_page(kvm_tdx->td.tdr_page); __free_page(kvm_tdx->td.tdr_page); kvm_tdx->td.tdr_page = NULL; @@ -629,6 +620,11 @@ int tdx_vm_init(struct kvm *kvm) struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); kvm->arch.has_protected_state = true; + /* + * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap, + * i.e. all EOIs are accelerated and never trigger exits. + */ + kvm->arch.has_protected_eoi = true; kvm->arch.has_private_mem = true; kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; @@ -861,6 +857,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu) if (tdx->vp.tdvpr_page) { tdx_reclaim_control_page(tdx->vp.tdvpr_page); tdx->vp.tdvpr_page = 0; + tdx->vp.tdvpr_pa = 0; } tdx->state = VCPU_TD_STATE_UNINITIALIZED; @@ -1714,7 +1711,7 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); return -EIO; } - tdx_clear_page(page); + tdx_quirk_reset_page(page); tdx_unpin(kvm, page); return 0; } @@ -2002,6 +1999,8 @@ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) * handle retries locally in their EPT violation handlers. */ while (1) { + struct kvm_memory_slot *slot; + ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); if (ret != RET_PF_RETRY || !local_retry) @@ -2015,6 +2014,15 @@ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) break; } + /* + * Bail if the memslot is invalid, i.e. is being deleted, as + * faulting in will never succeed and this task needs to drop + * SRCU in order to let memslot deletion complete. + */ + slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa)); + if (slot && slot->flags & KVM_MEMSLOT_INVALID) + break; + cond_resched(); } return ret; @@ -2480,7 +2488,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */ kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages), - GFP_KERNEL | __GFP_ZERO); + GFP_KERNEL); if (!tdcs_pages) goto free_tdr; @@ -2940,6 +2948,13 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) return -ENOMEM; tdx->vp.tdvpr_page = page; + /* + * page_to_phys() does not work in 'noinstr' code, like guest + * entry via tdh_vp_enter(). Precalculate and store it instead + * of doing it at runtime later. + */ + tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page); + tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), GFP_KERNEL); if (!tdx->vp.tdcx_pages) { @@ -3002,6 +3017,7 @@ free_tdvpr: if (tdx->vp.tdvpr_page) __free_page(tdx->vp.tdvpr_page); tdx->vp.tdvpr_page = 0; + tdx->vp.tdvpr_pa = 0; return ret; } @@ -3318,8 +3334,11 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return ret; } -int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { + if (!is_private) + return 0; + return PG_LEVEL_4K; } @@ -3457,12 +3476,11 @@ static int __init __tdx_bringup(void) if (r) goto tdx_bringup_err; + r = -EINVAL; /* Get TDX global information for later use */ tdx_sysinfo = tdx_get_sysinfo(); - if (WARN_ON_ONCE(!tdx_sysinfo)) { - r = -EINVAL; + if (WARN_ON_ONCE(!tdx_sysinfo)) goto get_sysinfo_err; - } /* Check TDX module and KVM capabilities */ if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || @@ -3505,14 +3523,11 @@ static int __init __tdx_bringup(void) if (td_conf->max_vcpus_per_td < num_present_cpus()) { pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", td_conf->max_vcpus_per_td, num_present_cpus()); - r = -EINVAL; goto get_sysinfo_err; } - if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { - r = -EINVAL; + if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) goto get_sysinfo_err; - } /* * Leave hardware virtualization enabled after TDX is enabled diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 106a72c923ca..4233b5ca9461 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -139,6 +139,9 @@ const unsigned short vmcs12_field_offsets[] = { FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), + FIELD(GUEST_S_CET, guest_s_cet), + FIELD(GUEST_SSP, guest_ssp), + FIELD(GUEST_INTR_SSP_TABLE, guest_ssp_tbl), FIELD(HOST_CR0, host_cr0), FIELD(HOST_CR3, host_cr3), FIELD(HOST_CR4, host_cr4), @@ -151,5 +154,8 @@ const unsigned short vmcs12_field_offsets[] = { FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), FIELD(HOST_RSP, host_rsp), FIELD(HOST_RIP, host_rip), + FIELD(HOST_S_CET, host_s_cet), + FIELD(HOST_SSP, host_ssp), + FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl), }; const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets); diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 56fd150a6f24..4ad6b16525b9 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -117,7 +117,13 @@ struct __packed vmcs12 { natural_width host_ia32_sysenter_eip; natural_width host_rsp; natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ + natural_width host_s_cet; + natural_width host_ssp; + natural_width host_ssp_tbl; + natural_width guest_s_cet; + natural_width guest_ssp; + natural_width guest_ssp_tbl; + natural_width paddingl[2]; /* room for future expansion */ u32 pin_based_vm_exec_control; u32 cpu_based_vm_exec_control; u32 exception_bitmap; @@ -294,6 +300,12 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(host_ia32_sysenter_eip, 656); CHECK_OFFSET(host_rsp, 664); CHECK_OFFSET(host_rip, 672); + CHECK_OFFSET(host_s_cet, 680); + CHECK_OFFSET(host_ssp, 688); + CHECK_OFFSET(host_ssp_tbl, 696); + CHECK_OFFSET(guest_s_cet, 704); + CHECK_OFFSET(guest_ssp, 712); + CHECK_OFFSET(guest_ssp_tbl, 720); CHECK_OFFSET(pin_based_vm_exec_control, 744); CHECK_OFFSET(cpu_based_vm_exec_control, 748); CHECK_OFFSET(exception_bitmap, 752); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa157fe5b7b3..546272a5d34d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1344,22 +1344,35 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) } #ifdef CONFIG_X86_64 -static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) +static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache) { preempt_disable(); if (vmx->vt.guest_state_loaded) - rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + *cache = read_msr(msr); preempt_enable(); - return vmx->msr_guest_kernel_gs_base; + return *cache; } -static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) +static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data, + u64 *cache) { preempt_disable(); if (vmx->vt.guest_state_loaded) - wrmsrq(MSR_KERNEL_GS_BASE, data); + wrmsrns(msr, data); preempt_enable(); - vmx->msr_guest_kernel_gs_base = data; + *cache = data; +} + +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) +{ + return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, + &vmx->msr_guest_kernel_gs_base); +} + +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) +{ + vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data, + &vmx->msr_guest_kernel_gs_base); } #endif @@ -2093,6 +2106,15 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; + case MSR_IA32_S_CET: + msr_info->data = vmcs_readl(GUEST_S_CET); + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + msr_info->data = vmcs_readl(GUEST_SSP); + break; + case MSR_IA32_INT_SSP_TAB: + msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE); + break; case MSR_IA32_DEBUGCTLMSR: msr_info->data = vmx_guest_debugctl_read(); break; @@ -2127,7 +2149,7 @@ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; - if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && + if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) && (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; @@ -2411,10 +2433,19 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else vmx->pt_desc.guest.addr_a[index / 2] = data; break; + case MSR_IA32_S_CET: + vmcs_writel(GUEST_S_CET, data); + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + vmcs_writel(GUEST_SSP, data); + break; + case MSR_IA32_INT_SSP_TAB: + vmcs_writel(GUEST_INTR_SSP_TABLE, data); + break; case MSR_IA32_PERF_CAPABILITIES: - if (data & PMU_CAP_LBR_FMT) { - if ((data & PMU_CAP_LBR_FMT) != - (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) + if (data & PERF_CAP_LBR_FMT) { + if ((data & PERF_CAP_LBR_FMT) != + (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; @@ -2584,6 +2615,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE }, }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); @@ -4068,8 +4100,10 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } -void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) +static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { + bool intercept; + if (!cpu_has_vmx_msr_bitmap()) return; @@ -4115,12 +4149,34 @@ void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept); + } + + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) { + intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + + vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); + } + /* * x2APIC and LBR MSR intercepts are modified on-demand and cannot be * filtered by userspace. */ } +void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) +{ + vmx_recalc_msr_intercepts(vcpu); +} + static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { @@ -4270,6 +4326,21 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) if (cpu_has_load_ia32_efer()) vmcs_write64(HOST_IA32_EFER, kvm_host.efer); + + /* + * Supervisor shadow stack is not enabled on host side, i.e., + * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM + * description(RDSSP instruction), SSP is not readable in CPL0, + * so resetting the two registers to 0s at VM-Exit does no harm + * to kernel execution. When execution flow exits to userspace, + * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter + * 3 and 4 for details. + */ + if (cpu_has_load_cet_ctrl()) { + vmcs_writel(HOST_S_CET, kvm_host.s_cet); + vmcs_writel(HOST_SSP, 0); + vmcs_writel(HOST_INTR_SSP_TABLE, 0); + } } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) @@ -4304,7 +4375,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) return pin_based_exec_ctrl; } -static u32 vmx_vmentry_ctrl(void) +static u32 vmx_get_initial_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; @@ -4321,7 +4392,7 @@ static u32 vmx_vmentry_ctrl(void) return vmentry_ctrl; } -static u32 vmx_vmexit_ctrl(void) +static u32 vmx_get_initial_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; @@ -4351,19 +4422,13 @@ void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (kvm_vcpu_apicv_active(vcpu)) { - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - if (enable_ipiv) - tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); - } else { - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - if (enable_ipiv) - tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); - } + secondary_exec_controls_changebit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY, + kvm_vcpu_apicv_active(vcpu)); + if (enable_ipiv) + tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT, + kvm_vcpu_apicv_active(vcpu)); vmx_update_msr_bitmap_x2apic(vcpu); } @@ -4686,10 +4751,10 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); + vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ - vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); + vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl()); vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); @@ -4817,6 +4882,14 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + vmcs_writel(GUEST_SSP, 0); + vmcs_writel(GUEST_INTR_SSP_TABLE, 0); + } + if (kvm_cpu_cap_has(X86_FEATURE_IBT) || + kvm_cpu_cap_has(X86_FEATURE_SHSTK)) + vmcs_writel(GUEST_S_CET, 0); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); vpid_sync_context(vmx->vpid); @@ -5785,6 +5858,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; + /* + * Ensure that any updates to kvm->buses[] observed by the + * previous instruction (emulated or otherwise) are also + * visible to the instruction KVM is about to emulate. + */ + smp_rmb(); + if (!kvm_emulate_instruction(vcpu, 0)) return 0; @@ -6003,6 +6083,23 @@ static int handle_notify(struct kvm_vcpu *vcpu) return 1; } +static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu) +{ + return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO)); +} + +static int handle_rdmsr_imm(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu), + vmx_get_msr_imm_reg(vcpu)); +} + +static int handle_wrmsr_imm(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), + vmx_get_msr_imm_reg(vcpu)); +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6061,6 +6158,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, [EXIT_REASON_NOTIFY] = handle_notify, + [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm, + [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm, }; static const int kvm_vmx_max_exit_handlers = @@ -6265,6 +6364,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); + if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) + pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", + vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP), + vmcs_readl(GUEST_INTR_SSP_TABLE)); pr_err("*** Host State ***\n"); pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); @@ -6295,6 +6398,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); + if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE) + pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", + vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP), + vmcs_readl(HOST_INTR_SSP_TABLE)); pr_err("*** Control State ***\n"); pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", @@ -6495,6 +6602,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) #ifdef CONFIG_MITIGATION_RETPOLINE if (exit_reason.basic == EXIT_REASON_MSR_WRITE) return kvm_emulate_wrmsr(vcpu); + else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) + return handle_wrmsr_imm(vcpu); else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) return handle_preemption_timer(vcpu); else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) @@ -7170,11 +7279,16 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: - return handle_fastpath_set_msr_irqoff(vcpu); + return handle_fastpath_wrmsr(vcpu); + case EXIT_REASON_MSR_WRITE_IMM: + return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), + vmx_get_msr_imm_reg(vcpu)); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); case EXIT_REASON_HLT: return handle_fastpath_hlt(vcpu); + case EXIT_REASON_INVD: + return handle_fastpath_invd(vcpu); default: return EXIT_FASTPATH_NONE; } @@ -7641,6 +7755,8 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); + cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK)); + cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT)); entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); @@ -7775,16 +7891,13 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_LC_ENABLED; - /* Recalc MSR interception to account for feature changes. */ - vmx_recalc_msr_intercepts(vcpu); - /* Refresh #PF interception to account for MAXPHYADDR changes. */ vmx_update_exception_bitmap(vcpu); } static __init u64 vmx_get_perf_capabilities(void) { - u64 perf_cap = PMU_CAP_FW_WRITES; + u64 perf_cap = PERF_CAP_FW_WRITES; u64 host_perf_cap = 0; if (!enable_pmu) @@ -7804,7 +7917,7 @@ static __init u64 vmx_get_perf_capabilities(void) if (!vmx_lbr_caps.has_callstack) memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); else if (vmx_lbr_caps.nr) - perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT; } if (vmx_pebs_supported()) { @@ -7872,7 +7985,6 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7884,6 +7996,18 @@ static __init void vmx_set_cpu_caps(void) if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); + + /* + * Disable CET if unrestricted_guest is unsupported as KVM doesn't + * enforce CET HW behaviors in emulator. On platforms with + * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code + * fails, so disable CET in this case too. + */ + if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || + !cpu_has_vmx_basic_no_hw_errcode_cc()) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + } } static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, @@ -8333,8 +8457,6 @@ __init int vmx_hardware_setup(void) vmx_setup_user_return_msrs(); - if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) - return -EIO; if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -8364,6 +8486,14 @@ __init int vmx_hardware_setup(void) return -EOPNOTSUPP; } + /* + * Shadow paging doesn't have a (further) performance penalty + * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it + * by default + */ + if (!enable_ept) + allow_smaller_maxphyaddr = true; + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; @@ -8489,6 +8619,13 @@ __init int vmx_hardware_setup(void) setup_default_sgx_lepubkeyhash(); + vmx_set_cpu_caps(); + + /* + * Configure nested capabilities after core CPU capabilities so that + * nested support can be conditional on base support, e.g. so that KVM + * can hide/show features based on kvm_cpu_cap_has(). + */ if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); @@ -8497,8 +8634,6 @@ __init int vmx_hardware_setup(void) return r; } - vmx_set_cpu_caps(); - r = alloc_kvm_area(); if (r && nested) nested_vmx_hardware_unsetup(); @@ -8525,7 +8660,9 @@ __init int vmx_hardware_setup(void) */ if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; - kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + return r; } @@ -8558,11 +8695,18 @@ int __init vmx_init(void) return -EOPNOTSUPP; /* - * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing - * to unwind if a later step fails. + * Note, VMCS and eVMCS configuration only touch VMX knobs/variables, + * i.e. there's nothing to unwind if a later step fails. */ hv_init_evmcs(); + /* + * Parse the VMCS config and VMX capabilities before anything else, so + * that the information is available to all setup flows. + */ + if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) + return -EIO; + r = kvm_x86_vendor_init(&vt_init_ops); if (r) return r; @@ -8586,14 +8730,6 @@ int __init vmx_init(void) vmx_check_vmcs12_offsets(); - /* - * Shadow paging doesn't have a (further) performance penalty - * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it - * by default - */ - if (!enable_ept) - allow_smaller_maxphyaddr = true; - return 0; err_l1d_flush: diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d3389baf3ab3..ea93121029f9 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -181,6 +181,9 @@ struct nested_vmx { */ u64 pre_vmenter_debugctl; u64 pre_vmenter_bndcfgs; + u64 pre_vmenter_s_cet; + u64 pre_vmenter_ssp; + u64 pre_vmenter_ssp_tbl; /* to migrate it to L1 if L2 writes to L1's CR8 directly */ int l1_tpr_threshold; @@ -484,7 +487,8 @@ static inline u8 vmx_get_rvi(void) VM_ENTRY_LOAD_IA32_EFER | \ VM_ENTRY_LOAD_BNDCFGS | \ VM_ENTRY_PT_CONCEAL_PIP | \ - VM_ENTRY_LOAD_IA32_RTIT_CTL) + VM_ENTRY_LOAD_IA32_RTIT_CTL | \ + VM_ENTRY_LOAD_CET_STATE) #define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ (VM_EXIT_SAVE_DEBUG_CONTROLS | \ @@ -506,7 +510,8 @@ static inline u8 vmx_get_rvi(void) VM_EXIT_LOAD_IA32_EFER | \ VM_EXIT_CLEAR_BNDCFGS | \ VM_EXIT_PT_CONCEAL_PIP | \ - VM_EXIT_CLEAR_IA32_RTIT_CTL) + VM_EXIT_CLEAR_IA32_RTIT_CTL | \ + VM_EXIT_LOAD_CET_STATE) #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ (PIN_BASED_EXT_INTR_MASK | \ @@ -608,6 +613,14 @@ static __always_inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##b { \ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \ lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +} \ +static __always_inline void lname##_controls_changebit(struct vcpu_vmx *vmx, u##bits val, \ + bool set) \ +{ \ + if (set) \ + lname##_controls_setbit(vmx, val); \ + else \ + lname##_controls_clearbit(vmx, val); \ } BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) @@ -706,6 +719,11 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) void dump_vmcs(struct kvm_vcpu *vcpu); +static inline int vmx_get_instr_info_reg(u32 vmx_instr_info) +{ + return (vmx_instr_info >> 3) & 0xf; +} + static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) { return (vmx_instr_info >> 28) & 0xf; diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 2b3424f638db..9697368d65b3 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -52,7 +52,7 @@ void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector); void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); -void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu); +void vmx_recalc_intercepts(struct kvm_vcpu *vcpu); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); int vmx_get_feature_msr(u32 msr, u64 *data); @@ -153,7 +153,7 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); -int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private); #endif #endif /* __KVM_X86_VMX_X86_OPS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 604490b1cb19..4b8138bd4857 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -97,10 +97,10 @@ * vendor module being reloaded with different module parameters. */ struct kvm_caps kvm_caps __read_mostly; -EXPORT_SYMBOL_GPL(kvm_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_caps); struct kvm_host_values kvm_host __read_mostly; -EXPORT_SYMBOL_GPL(kvm_host); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_host); #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) @@ -136,6 +136,9 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static DEFINE_MUTEX(vendor_module_lock); +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); + struct kvm_x86_ops kvm_x86_ops __read_mostly; #define KVM_X86_OP(func) \ @@ -152,7 +155,7 @@ module_param(ignore_msrs, bool, 0644); bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, 0644); -EXPORT_SYMBOL_GPL(report_ignored_msrs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs); unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, 0644); @@ -164,12 +167,9 @@ module_param(kvmclock_periodic_sync, bool, 0444); static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); -static bool __read_mostly vector_hashing = true; -module_param(vector_hashing, bool, 0444); - bool __read_mostly enable_vmware_backdoor = false; module_param(enable_vmware_backdoor, bool, 0444); -EXPORT_SYMBOL_GPL(enable_vmware_backdoor); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_vmware_backdoor); /* * Flags to manipulate forced emulation behavior (any non-zero value will @@ -184,7 +184,7 @@ module_param(pi_inject_timer, bint, 0644); /* Enable/disable PMU virtualization */ bool __read_mostly enable_pmu = true; -EXPORT_SYMBOL_GPL(enable_pmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); module_param(enable_pmu, bool, 0444); bool __read_mostly eager_page_split = true; @@ -211,7 +211,7 @@ struct kvm_user_return_msrs { }; u32 __read_mostly kvm_nr_uret_msrs; -EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs); static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; static struct kvm_user_return_msrs __percpu *user_return_msrs; @@ -220,17 +220,26 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) +#define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL) +/* + * Note, KVM supports exposing PT to the guest, but does not support context + * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping + * PT via guest XSTATE would clobber perf state), i.e. KVM doesn't support + * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs). + */ +#define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_ALL) + bool __read_mostly allow_smaller_maxphyaddr = 0; -EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; -EXPORT_SYMBOL_GPL(enable_apicv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_apicv); bool __read_mostly enable_ipiv = true; -EXPORT_SYMBOL_GPL(enable_ipiv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_ipiv); bool __read_mostly enable_device_posted_irqs = true; -EXPORT_SYMBOL_GPL(enable_device_posted_irqs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_device_posted_irqs); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -335,7 +344,11 @@ static const u32 msrs_to_save_base[] = { MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, - MSR_IA32_XFD, MSR_IA32_XFD_ERR, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS, + + MSR_IA32_U_CET, MSR_IA32_S_CET, + MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP, + MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB, }; static const u32 msrs_to_save_pmu[] = { @@ -367,6 +380,7 @@ static const u32 msrs_to_save_pmu[] = { MSR_AMD64_PERF_CNTR_GLOBAL_CTL, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + @@ -614,7 +628,7 @@ int kvm_add_user_return_msr(u32 msr) kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; return kvm_nr_uret_msrs++; } -EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_add_user_return_msr); int kvm_find_user_return_msr(u32 msr) { @@ -626,7 +640,7 @@ int kvm_find_user_return_msr(u32 msr) } return -1; } -EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { @@ -666,7 +680,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) kvm_user_return_register_notifier(msrs); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) { @@ -675,7 +689,13 @@ void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) msrs->values[slot].curr = value; kvm_user_return_register_notifier(msrs); } -EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache); + +u64 kvm_get_user_return_msr(unsigned int slot) +{ + return this_cpu_ptr(user_return_msrs)->values[slot].curr; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr); static void drop_user_return_notifiers(void) { @@ -697,7 +717,7 @@ noinstr void kvm_spurious_fault(void) /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); } -EXPORT_SYMBOL_GPL(kvm_spurious_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spurious_fault); #define EXCPT_BENIGN 0 #define EXCPT_CONTRIBUTORY 1 @@ -802,7 +822,7 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, ex->has_payload = false; ex->payload = 0; } -EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_deliver_exception_payload); static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector, bool has_error_code, u32 error_code, @@ -886,7 +906,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { kvm_multiple_exception(vcpu, nr, false, 0, false, 0); } -EXPORT_SYMBOL_GPL(kvm_queue_exception); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, @@ -894,7 +914,7 @@ void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, { kvm_multiple_exception(vcpu, nr, false, 0, true, payload); } -EXPORT_SYMBOL_GPL(kvm_queue_exception_p); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) @@ -929,7 +949,7 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, vcpu->arch.exception.has_payload = false; vcpu->arch.exception.payload = 0; } -EXPORT_SYMBOL_GPL(kvm_requeue_exception); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_requeue_exception); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { @@ -940,7 +960,7 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) return 1; } -EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_complete_insn_gp); static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) { @@ -990,7 +1010,7 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, fault_mmu->inject_page_fault(vcpu, fault); } -EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -1002,7 +1022,7 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { kvm_multiple_exception(vcpu, nr, true, error_code, false, 0); } -EXPORT_SYMBOL_GPL(kvm_queue_exception_e); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_e); /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue @@ -1024,7 +1044,7 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) kvm_queue_exception(vcpu, UD_VECTOR); return false; } -EXPORT_SYMBOL_GPL(kvm_require_dr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr); static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { @@ -1079,7 +1099,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) return 1; } -EXPORT_SYMBOL_GPL(load_pdptrs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(load_pdptrs); static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { @@ -1132,7 +1152,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) kvm_mmu_reset_context(vcpu); } -EXPORT_SYMBOL_GPL(kvm_post_set_cr0); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr0); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { @@ -1167,19 +1187,22 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) return 1; + if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) + return 1; + kvm_x86_call(set_cr0)(vcpu, cr0); kvm_post_set_cr0(vcpu, old_cr0, cr0); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr0); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } -EXPORT_SYMBOL_GPL(kvm_lmsw); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw); void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { @@ -1202,7 +1225,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) wrpkru(vcpu->arch.pkru); } -EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_guest_xsave_state); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { @@ -1228,7 +1251,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) } } -EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state); #ifdef CONFIG_X86_64 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) @@ -1237,7 +1260,7 @@ static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) } #endif -static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; @@ -1281,6 +1304,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.cpuid_dynamic_bits_dirty = true; return 0; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_set_xcr); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { @@ -1293,7 +1317,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_xsetbv); static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -1341,7 +1365,7 @@ void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned lon kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } -EXPORT_SYMBOL_GPL(kvm_post_set_cr4); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -1366,13 +1390,16 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } + if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP)) + return 1; + kvm_x86_call(set_cr4)(vcpu, cr4); kvm_post_set_cr4(vcpu, old_cr4, cr4); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr4); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr4); static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) { @@ -1464,7 +1491,7 @@ handle_tlb_flush: return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr3); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr3); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { @@ -1476,7 +1503,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) vcpu->arch.cr8 = cr8; return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr8); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { @@ -1485,7 +1512,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) else return vcpu->arch.cr8; } -EXPORT_SYMBOL_GPL(kvm_get_cr8); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8); static void kvm_update_dr0123(struct kvm_vcpu *vcpu) { @@ -1510,7 +1537,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu) if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } -EXPORT_SYMBOL_GPL(kvm_update_dr7); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7); static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { @@ -1551,7 +1578,7 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } -EXPORT_SYMBOL_GPL(kvm_set_dr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_dr); unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) { @@ -1568,14 +1595,14 @@ unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) return vcpu->arch.dr7; } } -EXPORT_SYMBOL_GPL(kvm_get_dr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr); int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); + u32 pmc = kvm_rcx_read(vcpu); u64 data; - if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { + if (kvm_pmu_rdpmc(vcpu, pmc, &data)) { kvm_inject_gp(vcpu, 0); return 1; } @@ -1584,7 +1611,7 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) kvm_rdx_write(vcpu, data >> 32); return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc); /* * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM @@ -1723,7 +1750,7 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) return __kvm_valid_efer(vcpu, efer); } -EXPORT_SYMBOL_GPL(kvm_valid_efer); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer); static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { @@ -1766,7 +1793,7 @@ void kvm_enable_efer_bits(u64 mask) { efer_reserved_bits &= ~mask; } -EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { @@ -1809,7 +1836,7 @@ out: return allowed; } -EXPORT_SYMBOL_GPL(kvm_msr_allowed); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed); /* * Write @data into the MSR specified by @index. Select MSR specific fault @@ -1870,6 +1897,44 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, data = (u32)data; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + if (!kvm_is_valid_u_s_cet(vcpu, data)) + return 1; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + /* + * Note that the MSR emulation here is flawed when a vCPU + * doesn't support the Intel 64 architecture. The expected + * architectural behavior in this case is that the upper 32 + * bits do not exist and should always read '0'. However, + * because the actual hardware on which the virtual CPU is + * running does support Intel 64, XRSTORS/XSAVES in the + * guest could observe behavior that violates the + * architecture. Intercepting XRSTORS/XSAVES for this + * special case isn't deemed worthwhile. + */ + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + /* + * MSR_IA32_INT_SSP_TAB is not present on processors that do + * not support Intel 64 architecture. + */ + if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) + return KVM_MSR_RET_UNSUPPORTED; + if (is_noncanonical_msr_address(data, vcpu)) + return 1; + /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */ + if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4)) + return 1; + break; } msr.data = data; @@ -1898,8 +1963,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) +static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; int ret; @@ -1914,6 +1979,20 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + break; } msr.index = index; @@ -1925,6 +2004,16 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, return ret; } +int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + return __kvm_set_msr(vcpu, index, data, true); +} + +int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + return __kvm_get_msr(vcpu, index, data, true); +} + static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { @@ -1932,33 +2021,36 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, __kvm_get_msr); } -int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) - return KVM_MSR_RET_FILTERED; return kvm_get_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_read); -int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) +int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) { - if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) - return KVM_MSR_RET_FILTERED; return kvm_set_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_write); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - return kvm_get_msr_ignored_check(vcpu, index, data, false); + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_read(vcpu, index, data); } -EXPORT_SYMBOL_GPL(kvm_get_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_read); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) { - return kvm_set_msr_ignored_check(vcpu, index, data, false); + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_write(vcpu, index, data); } -EXPORT_SYMBOL_GPL(kvm_set_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_write); + static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) { @@ -1990,6 +2082,15 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) return complete_fast_msr_access(vcpu); } +static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu) +{ + if (!vcpu->run->msr.error) + kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg, + vcpu->run->msr.data); + + return complete_fast_msr_access(vcpu); +} + static u64 kvm_msr_reason(int r) { switch (r) { @@ -2024,55 +2125,82 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, return 1; } -int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg, + int (*complete_rdmsr)(struct kvm_vcpu *)) { - u32 ecx = kvm_rcx_read(vcpu); u64 data; int r; - r = kvm_get_msr_with_filter(vcpu, ecx, &data); + r = kvm_emulate_msr_read(vcpu, msr, &data); if (!r) { - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(msr, data); - kvm_rax_write(vcpu, data & -1u); - kvm_rdx_write(vcpu, (data >> 32) & -1u); + if (reg < 0) { + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + } else { + kvm_register_write(vcpu, reg, data); + } } else { /* MSR read failed? See if we should ask user space */ - if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0, - complete_fast_rdmsr, r)) + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0, + complete_rdmsr, r)) return 0; - trace_kvm_msr_read_ex(ecx); + trace_kvm_msr_read_ex(msr); } return kvm_x86_call(complete_emulated_msr)(vcpu, r); } -EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); -int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); - int r; + return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1, + complete_fast_rdmsr); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr); + +int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + vcpu->arch.cui_rdmsr_imm_reg = reg; + + return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr_imm); - r = kvm_set_msr_with_filter(vcpu, ecx, data); +static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int r; + r = kvm_emulate_msr_write(vcpu, msr, data); if (!r) { - trace_kvm_msr_write(ecx, data); + trace_kvm_msr_write(msr, data); } else { /* MSR write failed? See if we should ask user space */ - if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data, + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data, complete_fast_msr_access, r)) return 0; /* Signal all other negative errors to userspace */ if (r < 0) return r; - trace_kvm_msr_write_ex(ecx, data); + trace_kvm_msr_write_ex(msr, data); } return kvm_x86_call(complete_emulated_msr)(vcpu, r); } -EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); + +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_wrmsr(vcpu, kvm_rcx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr); + +int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm); int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) { @@ -2084,14 +2212,23 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) /* Treat an INVD instruction as a NOP and just skip it. */ return kvm_emulate_as_nop(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_invd); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_invd); + +fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) +{ + if (!kvm_emulate_invd(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + return EXIT_FASTPATH_REENTER_GUEST; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_invd); int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } -EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invalid_op); static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) @@ -2117,13 +2254,13 @@ int kvm_emulate_mwait(struct kvm_vcpu *vcpu) { return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); } -EXPORT_SYMBOL_GPL(kvm_emulate_mwait); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_mwait); int kvm_emulate_monitor(struct kvm_vcpu *vcpu) { return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); } -EXPORT_SYMBOL_GPL(kvm_emulate_monitor); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { @@ -2133,74 +2270,41 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -/* - * The fast path for frequent and performance sensitive wrmsr emulation, - * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces - * the latency of virtual IPI by avoiding the expensive bits of transitioning - * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the - * other cases which must be called after interrupts are enabled on the host. - */ -static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) +static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) - return 1; - - if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && - ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && - ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && - ((u32)(data >> 32) != X2APIC_BROADCAST)) - return kvm_x2apic_icr_write(vcpu->arch.apic, data); - - return 1; -} - -static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) -{ - if (!kvm_can_use_hv_timer(vcpu)) - return 1; - - kvm_set_lapic_tscdeadline_msr(vcpu, data); - return 0; -} - -fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) -{ - u32 msr = kvm_rcx_read(vcpu); - u64 data; - fastpath_t ret; - bool handled; - - kvm_vcpu_srcu_read_lock(vcpu); - switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): - data = kvm_read_edx_eax(vcpu); - handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || + kvm_x2apic_icr_write_fast(vcpu->arch.apic, data)) + return EXIT_FASTPATH_NONE; break; case MSR_IA32_TSC_DEADLINE: - data = kvm_read_edx_eax(vcpu); - handled = !handle_fastpath_set_tscdeadline(vcpu, data); + kvm_set_lapic_tscdeadline_msr(vcpu, data); break; default: - handled = false; - break; + return EXIT_FASTPATH_NONE; } - if (handled) { - if (!kvm_skip_emulated_instruction(vcpu)) - ret = EXIT_FASTPATH_EXIT_USERSPACE; - else - ret = EXIT_FASTPATH_REENTER_GUEST; - trace_kvm_msr_write(msr, data); - } else { - ret = EXIT_FASTPATH_NONE; - } + trace_kvm_msr_write(msr, data); - kvm_vcpu_srcu_read_unlock(vcpu); + if (!kvm_skip_emulated_instruction(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; - return ret; + return EXIT_FASTPATH_REENTER_GUEST; } -EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); + +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) +{ + return __handle_fastpath_wrmsr(vcpu, kvm_rcx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr); + +fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm); /* * Adapt set_msr() to msr_io()'s calling convention @@ -2566,7 +2670,7 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } -EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_l1_tsc); u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { @@ -2581,7 +2685,7 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) nested_offset += l2_offset; return nested_offset; } -EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_offset); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { @@ -2591,7 +2695,7 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) return l1_multiplier; } -EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_multiplier); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { @@ -3669,7 +3773,7 @@ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); } -EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_service_local_tlb_flush_requests); static void record_steal_time(struct kvm_vcpu *vcpu) { @@ -3769,6 +3873,67 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } +/* + * Returns true if the MSR in question is managed via XSTATE, i.e. is context + * switched with the rest of guest FPU state. Note! S_CET is _not_ context + * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS. + * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields, + * the value saved/restored via XSTATE is always the host's value. That detail + * is _extremely_ important, as the guest's S_CET must _never_ be resident in + * hardware while executing in the host. Loading guest values for U_CET and + * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to + * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower + * privilege levels, i.e. are effectively only consumed by userspace as well. + */ +static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (!vcpu) + return false; + + switch (msr) { + case MSR_IA32_U_CET: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) || + guest_cpu_cap_has(vcpu, X86_FEATURE_IBT); + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + default: + return false; + } +} + +/* + * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an + * MSR that is managed via XSTATE. Note, the caller is responsible for doing + * the initial FPU load, this helper only ensures that guest state is resident + * in hardware (the kernel can load its FPU state in IRQ context). + */ +static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, + struct msr_data *msr_info, + int access) +{ + BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W); + + KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm); + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); + + kvm_fpu_get(); + if (access == MSR_TYPE_R) + rdmsrq(msr_info->index, msr_info->data); + else + wrmsrq(msr_info->index, msr_info->data); + kvm_fpu_put(); +} + +static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W); +} + +static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; @@ -3960,16 +4125,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_XSS: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - return 1; - /* - * KVM supports exposing PT to the guest, but does not support - * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than - * XSAVES/XRSTORS to save/restore PT MSRs. - */ - if (data & ~kvm_caps.supported_xss) + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return KVM_MSR_RET_UNSUPPORTED; + + if (data & ~vcpu->arch.guest_supported_xss) return 1; + if (vcpu->arch.ia32_xss == data) + break; vcpu->arch.ia32_xss = data; vcpu->arch.cpuid_dynamic_bits_dirty = true; break; @@ -4153,6 +4315,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + kvm_set_xstate_msr(vcpu, msr_info); + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -4161,7 +4327,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_msr_common); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_msr_common); static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { @@ -4502,6 +4668,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.guest_fpu.xfd_err; break; #endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + kvm_get_xstate_msr(vcpu, msr_info); + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); @@ -4510,7 +4680,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } return 0; } -EXPORT_SYMBOL_GPL(kvm_get_msr_common); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common); /* * Read or write a bunch of msrs. All parameters are kernel addresses. @@ -4522,11 +4692,25 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { + bool fpu_loaded = false; int i; - for (i = 0; i < msrs->nmsrs; ++i) + for (i = 0; i < msrs->nmsrs; ++i) { + /* + * If userspace is accessing one or more XSTATE-managed MSRs, + * temporarily load the guest's FPU state so that the guest's + * MSR value(s) is resident in hardware and thus can be accessed + * via RDMSR/WRMSR. + */ + if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) { + kvm_load_guest_fpu(vcpu); + fpu_loaded = true; + } if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; + } + if (fpu_loaded) + kvm_put_guest_fpu(vcpu); return i; } @@ -4711,6 +4895,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_MEMORY_FAULT_INFO: case KVM_CAP_X86_GUEST_MODE: + case KVM_CAP_ONE_REG: r = 1; break; case KVM_CAP_PRE_FAULT_MEMORY: @@ -5889,6 +6074,134 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, } } +struct kvm_x86_reg_id { + __u32 index; + __u8 type; + __u8 rsvd1; + __u8 rsvd2:4; + __u8 size:4; + __u8 x86; +}; + +static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu, + struct kvm_x86_reg_id *reg) +{ + switch (reg->index) { + case KVM_REG_GUEST_SSP: + /* + * FIXME: If host-initiated accesses are ever exempted from + * ignore_msrs (in kvm_do_msr_access()), drop this manual check + * and rely on KVM's standard checks to reject accesses to regs + * that don't exist. + */ + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return -EINVAL; + + reg->type = KVM_X86_REG_TYPE_MSR; + reg->index = MSR_KVM_INTERNAL_GUEST_SSP; + break; + default: + return -EINVAL; + } + return 0; +} + +static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (do_get_msr(vcpu, msr, &val)) + return -EINVAL; + + if (put_user(val, user_val)) + return -EFAULT; + + return 0; +} + +static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (get_user(val, user_val)) + return -EFAULT; + + if (do_set_msr(vcpu, msr, &val)) + return -EINVAL; + + return 0; +} + +static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, + void __user *argp) +{ + struct kvm_one_reg one_reg; + struct kvm_x86_reg_id *reg; + u64 __user *user_val; + bool load_fpu; + int r; + + if (copy_from_user(&one_reg, argp, sizeof(one_reg))) + return -EFAULT; + + if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86) + return -EINVAL; + + reg = (struct kvm_x86_reg_id *)&one_reg.id; + if (reg->rsvd1 || reg->rsvd2) + return -EINVAL; + + if (reg->type == KVM_X86_REG_TYPE_KVM) { + r = kvm_translate_kvm_reg(vcpu, reg); + if (r) + return r; + } + + if (reg->type != KVM_X86_REG_TYPE_MSR) + return -EINVAL; + + if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64) + return -EINVAL; + + guard(srcu)(&vcpu->kvm->srcu); + + load_fpu = is_xstate_managed_msr(vcpu, reg->index); + if (load_fpu) + kvm_load_guest_fpu(vcpu); + + user_val = u64_to_user_ptr(one_reg.addr); + if (ioctl == KVM_GET_ONE_REG) + r = kvm_get_one_msr(vcpu, reg->index, user_val); + else + r = kvm_set_one_msr(vcpu, reg->index, user_val); + + if (load_fpu) + kvm_put_guest_fpu(vcpu); + return r; +} + +static int kvm_get_reg_list(struct kvm_vcpu *vcpu, + struct kvm_reg_list __user *user_list) +{ + u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 1 : 0; + u64 user_nr_regs; + + if (get_user(user_nr_regs, &user_list->n)) + return -EFAULT; + + if (put_user(nr_regs, &user_list->n)) + return -EFAULT; + + if (user_nr_regs < nr_regs) + return -E2BIG; + + if (nr_regs && + put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0])) + return -EFAULT; + + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -6005,6 +6318,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } + case KVM_GET_ONE_REG: + case KVM_SET_ONE_REG: + r = kvm_get_set_one_reg(vcpu, ioctl, argp); + break; + case KVM_GET_REG_LIST: + r = kvm_get_reg_list(vcpu, argp); + break; case KVM_TPR_ACCESS_REPORTING: { struct kvm_tpr_access_ctl tac; @@ -6771,7 +7091,11 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, kvm_free_msr_filter(old_filter); - kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + /* + * Recalc MSR intercepts as userspace may want to intercept accesses to + * MSRs that KVM would otherwise pass through to the guest. + */ + kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS); return 0; } @@ -6966,6 +7290,15 @@ set_identity_unlock: if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + /* + * Disallow an in-kernel I/O APIC if the VM has protected EOIs, + * i.e. if KVM can't intercept EOIs and thus can't properly + * emulate level-triggered interrupts. + */ + r = -ENOTTY; + if (kvm->arch.has_protected_eoi) + goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; @@ -7353,6 +7686,7 @@ static void kvm_probe_msr_to_save(u32 msr_index) case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) return; break; @@ -7365,6 +7699,24 @@ static void kvm_probe_msr_to_save(u32 msr_index) if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)) return; break; + case MSR_IA32_XSS: + if (!kvm_caps.supported_xss) + return; + break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + return; + break; + case MSR_IA32_INT_SSP_TAB: + if (!kvm_cpu_cap_has(X86_FEATURE_LM)) + return; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK)) + return; + break; default: break; } @@ -7484,7 +7836,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } -EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) @@ -7495,7 +7847,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, access |= PFERR_WRITE_MASK; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } -EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write); /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, @@ -7581,7 +7933,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -EXPORT_SYMBOL_GPL(kvm_read_guest_virt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_virt); static int emulator_read_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -7653,7 +8005,7 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } -EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_virt_system); static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) @@ -7687,7 +8039,7 @@ int handle_ud(struct kvm_vcpu *vcpu) return kvm_emulate_instruction(vcpu, emul_type); } -EXPORT_SYMBOL_GPL(handle_ud); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_ud); static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t gpa, bool write) @@ -8166,7 +8518,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) kvm_emulate_wbinvd_noskip(vcpu); return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wbinvd); @@ -8353,7 +8705,7 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_get_msr_with_filter(vcpu, msr_index, pdata); + r = kvm_emulate_msr_read(vcpu, msr_index, pdata); if (r < 0) return X86EMUL_UNHANDLEABLE; @@ -8376,7 +8728,7 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_set_msr_with_filter(vcpu, msr_index, data); + r = kvm_emulate_msr_write(vcpu, msr_index, data); if (r < 0) return X86EMUL_UNHANDLEABLE; @@ -8396,7 +8748,16 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + /* + * Treat emulator accesses to the current shadow stack pointer as host- + * initiated, as they aren't true MSR accesses (SSP is a "just a reg"), + * and this API is used only for implicit accesses, i.e. not RDMSR, and + * so the index is fully KVM-controlled. + */ + if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP)) + return kvm_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); + + return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); } static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc) @@ -8470,11 +8831,6 @@ static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) return is_smm(emul_to_vcpu(ctxt)); } -static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt) -{ - return is_guest_mode(emul_to_vcpu(ctxt)); -} - #ifndef CONFIG_KVM_SMM static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) { @@ -8558,7 +8914,6 @@ static const struct x86_emulate_ops emulate_ops = { .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible, .set_nmi_mask = emulator_set_nmi_mask, .is_smm = emulator_is_smm, - .is_guest_mode = emulator_is_guest_mode, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, @@ -8661,7 +9016,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) kvm_set_rflags(vcpu, ctxt->eflags); } } -EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_realmode_interrupt); static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, u8 ndata, u8 *insn_bytes, u8 insn_size) @@ -8726,13 +9081,13 @@ void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, { prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0); } -EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_prepare_emulation_failure_exit); void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) { __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0); } -EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_emulation_failure_exit); void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa) { @@ -8754,7 +9109,7 @@ void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa) run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; run->internal.ndata = ndata; } -EXPORT_SYMBOL_GPL(kvm_prepare_event_vectoring_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit); static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { @@ -8864,7 +9219,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) if (unlikely(!r)) return 0; - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); + kvm_pmu_instruction_retired(vcpu); /* * rflags is the old, "raw" value of the flags. The new value has @@ -8878,7 +9233,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) r = kvm_vcpu_do_singlestep(vcpu); return r; } -EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_skip_emulated_instruction); static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu) { @@ -9009,7 +9364,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, return r; } -EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(x86_decode_emulated_instruction); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len) @@ -9143,7 +9498,14 @@ restart: ctxt->exception.address = 0; } - r = x86_emulate_insn(ctxt); + /* + * Check L1's instruction intercepts when emulating instructions for + * L2, unless KVM is re-emulating a previously decoded instruction, + * e.g. to complete userspace I/O, in which case KVM has already + * checked the intercepts. + */ + r = x86_emulate_insn(ctxt, is_guest_mode(vcpu) && + !(emulation_type & EMULTYPE_NO_DECODE)); if (r == EMULATION_INTERCEPTED) return 1; @@ -9198,9 +9560,9 @@ writeback: */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); + kvm_pmu_instruction_retired(vcpu); if (ctxt->is_branch) - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); + kvm_pmu_branch_retired(vcpu); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); @@ -9226,14 +9588,14 @@ int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); } -EXPORT_SYMBOL_GPL(kvm_emulate_instruction); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len) { return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len); } -EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction_from_buffer); static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu) { @@ -9328,7 +9690,7 @@ int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in) ret = kvm_fast_pio_out(vcpu, size, port); return ret && kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_fast_pio); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fast_pio); static int kvmclock_cpu_down_prep(unsigned int cpu) { @@ -9651,6 +10013,18 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return -EIO; } + if (boot_cpu_has(X86_FEATURE_SHSTK) || boot_cpu_has(X86_FEATURE_IBT)) { + rdmsrq(MSR_IA32_S_CET, kvm_host.s_cet); + /* + * Linux doesn't yet support supervisor shadow stacks (SSS), so + * KVM doesn't save/restore the associated MSRs, i.e. KVM may + * clobber the host values. Yell and refuse to load if SSS is + * unexpectedly enabled, e.g. to avoid crashing the host. + */ + if (WARN_ON_ONCE(kvm_host.s_cet & CET_SHSTK_EN)) + return -EIO; + } + memset(&kvm_caps, 0, sizeof(kvm_caps)); x86_emulator_cache = kvm_alloc_emulator_cache(); @@ -9678,14 +10052,17 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } + + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + rdmsrq(MSR_IA32_XSS, kvm_host.xss); + kvm_caps.supported_xss = kvm_host.xss & KVM_SUPPORTED_XSS; + } + kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; rdmsrq_safe(MSR_EFER, &kvm_host.efer); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrq(MSR_IA32_XSS, kvm_host.xss); - kvm_init_pmu_capability(ops->pmu_ops); if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) @@ -9734,6 +10111,16 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + + if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + } + if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that @@ -9760,7 +10147,7 @@ out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); return r; } -EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_init); void kvm_x86_vendor_exit(void) { @@ -9794,7 +10181,7 @@ void kvm_x86_vendor_exit(void) kvm_x86_ops.enable_virtualization_cpu = NULL; mutex_unlock(&vendor_module_lock); } -EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_exit); #ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, @@ -9858,7 +10245,7 @@ bool kvm_apicv_activated(struct kvm *kvm) { return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0); } -EXPORT_SYMBOL_GPL(kvm_apicv_activated); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apicv_activated); bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) { @@ -9868,7 +10255,7 @@ bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) return (vm_reasons | vcpu_reasons) == 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_apicv_activated); static void set_or_clear_apicv_inhibit(unsigned long *inhibits, enum kvm_apicv_inhibit reason, bool set) @@ -10044,7 +10431,7 @@ out: vcpu->run->hypercall.ret = ret; return 1; } -EXPORT_SYMBOL_GPL(____kvm_emulate_hypercall); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(____kvm_emulate_hypercall); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { @@ -10057,7 +10444,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu), complete_hypercall_exit); } -EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_hypercall); static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { @@ -10500,7 +10887,7 @@ out: preempt_enable(); up_read(&vcpu->kvm->arch.apicv_update_lock); } -EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_update_apicv); static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { @@ -10576,7 +10963,7 @@ void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, __kvm_set_or_clear_apicv_inhibit(kvm, reason, set); up_write(&kvm->arch.apicv_update_lock); } -EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_or_clear_apicv_inhibit); static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { @@ -10796,13 +11183,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); - /* - * Recalc MSR intercepts as userspace may want to intercept - * accesses to MSRs that KVM would otherwise pass through to - * the guest. - */ - if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) - kvm_x86_call(recalc_msr_intercepts)(vcpu); + if (kvm_check_request(KVM_REQ_RECALC_INTERCEPTS, vcpu)) + kvm_x86_call(recalc_intercepts)(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) kvm_x86_call(update_cpu_dirty_logging)(vcpu); @@ -11011,6 +11393,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) wrmsrq(MSR_IA32_XFD_ERR, 0); /* + * Mark this CPU as needing a branch predictor flush before running + * userspace. Must be done before enabling preemption to ensure it gets + * set for the CPU that actually ran the guest, and not the CPU that it + * may migrate to. + */ + if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER)) + this_cpu_write(x86_ibpb_exit_to_user, true); + + /* * Consume any pending interrupts, including the possible source of * VM-Exit on SVM and any ticks that occur between VM-Exit and now. * An instruction is required after local_irq_enable() to fully unblock @@ -11126,7 +11517,7 @@ bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return false; } -EXPORT_SYMBOL_GPL(kvm_vcpu_has_events); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_has_events); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { @@ -11279,7 +11670,7 @@ int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) { return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); } -EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt_noskip); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { @@ -11290,17 +11681,11 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) */ return kvm_emulate_halt_noskip(vcpu) && ret; } -EXPORT_SYMBOL_GPL(kvm_emulate_halt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) { - int ret; - - kvm_vcpu_srcu_read_lock(vcpu); - ret = kvm_emulate_halt(vcpu); - kvm_vcpu_srcu_read_unlock(vcpu); - - if (!ret) + if (!kvm_emulate_halt(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; if (kvm_vcpu_running(vcpu)) @@ -11308,7 +11693,7 @@ fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_EXIT_HANDLED; } -EXPORT_SYMBOL_GPL(handle_fastpath_hlt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_hlt); int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) { @@ -11317,7 +11702,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret; } -EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_ap_reset_hold); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { @@ -11828,6 +12213,25 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) { + u64 u_cet, s_cet; + + /* + * Check both User and Supervisor on task switches as inter- + * privilege level task switches are impacted by CET at both + * the current privilege level and the new privilege level, and + * that information is not known at this time. The expectation + * is that the guest won't require emulation of task switches + * while using IBT or Shadow Stacks. + */ + if (__kvm_emulate_msr_read(vcpu, MSR_IA32_U_CET, &u_cet) || + __kvm_emulate_msr_read(vcpu, MSR_IA32_S_CET, &s_cet)) + goto unhandled_task_switch; + + if ((u_cet | s_cet) & (CET_ENDBR_EN | CET_SHSTK_EN)) + goto unhandled_task_switch; + } + init_emulate_ctxt(vcpu); ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, @@ -11837,19 +12241,21 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, * Report an error userspace if MMIO is needed, as KVM doesn't support * MMIO during a task switch (or any other complex operation). */ - if (ret || vcpu->mmio_needed) { - vcpu->mmio_needed = false; - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } + if (ret || vcpu->mmio_needed) + goto unhandled_task_switch; kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); return 1; + +unhandled_task_switch: + vcpu->mmio_needed = false; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; } -EXPORT_SYMBOL_GPL(kvm_task_switch); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_task_switch); static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -12379,6 +12785,42 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvfree(vcpu->arch.cpuid_entries); } +static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; + u64 xfeatures_mask; + int i; + + /* + * Guest FPU state is zero allocated and so doesn't need to be manually + * cleared on RESET, i.e. during vCPU creation. + */ + if (!init_event || !fpstate) + return; + + /* + * On INIT, only select XSTATE components are zeroed, most components + * are unchanged. Currently, the only components that are zeroed and + * supported by KVM are MPX and CET related. + */ + xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) & + (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR | + XFEATURE_MASK_CET_ALL); + if (!xfeatures_mask) + return; + + BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX); + + /* + * All paths that lead to INIT are required to load the guest's FPU + * state (because most paths are buried in KVM_RUN). + */ + kvm_put_guest_fpu(vcpu); + for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX) + fpstate_clear_xstate_component(fpstate, i); + kvm_load_guest_fpu(vcpu); +} + void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_cpuid_entry2 *cpuid_0x1; @@ -12436,22 +12878,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { - struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; - - /* - * All paths that lead to INIT are required to load the guest's - * FPU state (because most paths are buried in KVM_RUN). - */ - if (init_event) - kvm_put_guest_fpu(vcpu); - - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); - - if (init_event) - kvm_load_guest_fpu(vcpu); - } + kvm_xstate_reset(vcpu, init_event); if (!init_event) { vcpu->arch.smbase = 0x30000; @@ -12463,7 +12890,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); - __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); + kvm_msr_write(vcpu, MSR_IA32_XSS, 0); } /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ @@ -12529,7 +12956,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (init_event) kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } -EXPORT_SYMBOL_GPL(kvm_vcpu_reset); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_reset); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { @@ -12541,7 +12968,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); kvm_rip_write(vcpu, 0); } -EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_deliver_sipi_vector); void kvm_arch_enable_virtualization(void) { @@ -12659,7 +13086,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } -EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { @@ -12823,7 +13250,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, return (void __user *)hva; } -EXPORT_SYMBOL_GPL(__x86_set_memory_region); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__x86_set_memory_region); void kvm_arch_pre_destroy_vm(struct kvm *kvm) { @@ -13231,13 +13658,13 @@ unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + kvm_rip_read(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_get_linear_rip); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_linear_rip); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) { return kvm_get_linear_rip(vcpu) == linear_rip; } -EXPORT_SYMBOL_GPL(kvm_is_linear_rip); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_linear_rip); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { @@ -13248,7 +13675,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) rflags &= ~X86_EFLAGS_TF; return rflags; } -EXPORT_SYMBOL_GPL(kvm_get_rflags); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_rflags); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { @@ -13263,7 +13690,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) __kvm_set_rflags(vcpu, rflags); kvm_make_request(KVM_REQ_EVENT, vcpu); } -EXPORT_SYMBOL_GPL(kvm_set_rflags); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags); static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { @@ -13495,31 +13922,33 @@ void kvm_arch_register_noncoherent_dma(struct kvm *kvm) if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1) kvm_noncoherent_dma_assignment_start_or_stop(kvm); } -EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) { if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count)) kvm_noncoherent_dma_assignment_start_or_stop(kvm); } -EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) { return atomic_read(&kvm->arch.noncoherent_dma_count); } -EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_arch_has_noncoherent_dma); -bool kvm_vector_hashing_enabled(void) +bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { - return vector_hashing; + return (vcpu->arch.msr_kvm_poll_control & 1) == 0; } -bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) +#ifdef CONFIG_KVM_GUEST_MEMFD +/* + * KVM doesn't yet support mmap() on guest_memfd for VMs with private memory + * (the private vs. shared tracking needs to be moved into guest_memfd). + */ +bool kvm_arch_supports_gmem_mmap(struct kvm *kvm) { - return (vcpu->arch.msr_kvm_poll_control & 1) == 0; + return !kvm_arch_has_private_mem(kvm); } -EXPORT_SYMBOL_GPL(kvm_arch_no_poll); #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) @@ -13534,6 +13963,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) kvm_x86_call(gmem_invalidate)(start, end); } #endif +#endif int kvm_spec_ctrl_test_value(u64 value) { @@ -13559,7 +13989,7 @@ int kvm_spec_ctrl_test_value(u64 value) return ret; } -EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) { @@ -13584,7 +14014,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c } vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault); } -EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error); /* * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns @@ -13613,7 +14043,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, return 0; } -EXPORT_SYMBOL_GPL(kvm_handle_memory_failure); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_memory_failure); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) { @@ -13677,7 +14107,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return 1; } } -EXPORT_SYMBOL_GPL(kvm_handle_invpcid); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invpcid); static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu) { @@ -13762,7 +14192,7 @@ int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, return 0; } -EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_write); int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, void *data) @@ -13800,7 +14230,7 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, return 0; } -EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_read); static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size) { @@ -13888,7 +14318,7 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, return in ? kvm_sev_es_ins(vcpu, size, port) : kvm_sev_es_outs(vcpu, size, port); } -EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_string_io); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index bcfd9b719ada..f3dc77f006f9 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -50,6 +50,7 @@ struct kvm_host_values { u64 efer; u64 xcr0; u64 xss; + u64 s_cet; u64 arch_capabilities; }; @@ -101,6 +102,16 @@ do { \ #define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX #define KVM_SVM_DEFAULT_PLE_WINDOW 3000 +/* + * KVM's internal, non-ABI indices for synthetic MSRs. The values themselves + * are arbitrary and have no meaning, the only requirement is that they don't + * conflict with "real" MSRs that KVM supports. Use values at the upper end + * of KVM's reserved paravirtual MSR range to minimize churn, i.e. these values + * will be usable until KVM exhausts its supply of paravirtual MSR indices. + */ + +#define MSR_KVM_INTERNAL_GUEST_SSP 0x4b564dff + static inline unsigned int __grow_ple_window(unsigned int val, unsigned int base, unsigned int modifier, unsigned int max) { @@ -431,14 +442,15 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -bool kvm_vector_hashing_enabled(void); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, void *insn, int insn_len); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); -fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; @@ -668,6 +680,9 @@ static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) __reserved_bits |= X86_CR4_PCIDE; \ if (!__cpu_has(__c, X86_FEATURE_LAM)) \ __reserved_bits |= X86_CR4_LAM_SUP; \ + if (!__cpu_has(__c, X86_FEATURE_SHSTK) && \ + !__cpu_has(__c, X86_FEATURE_IBT)) \ + __reserved_bits |= X86_CR4_CET; \ __reserved_bits; \ }) @@ -699,4 +714,27 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); +#define CET_US_RESERVED_BITS GENMASK(9, 6) +#define CET_US_SHSTK_MASK_BITS GENMASK(1, 0) +#define CET_US_IBT_MASK_BITS (GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10)) +#define CET_US_LEGACY_BITMAP_BASE(data) ((data) >> 12) + +static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data) +{ + if (data & CET_US_RESERVED_BITS) + return false; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + (data & CET_US_SHSTK_MASK_BITS)) + return false; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && + (data & CET_US_IBT_MASK_BITS)) + return false; + if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4)) + return false; + /* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. */ + if ((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR)) + return false; + + return true; +} #endif |