diff options
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 67 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/sgx.c | 502 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/sgx.h | 34 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs12.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs12.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 403 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 24 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx_ops.h | 4 |
9 files changed, 840 insertions, 204 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bcca0b80e0d0..8b111682fe5c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -11,6 +11,7 @@ #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "sgx.h" #include "trace.h" #include "vmx.h" #include "x86.h" @@ -21,13 +22,7 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested_early_check = 0; module_param(nested_early_check, bool, S_IRUGO); -#define CC(consistency_check) \ -({ \ - bool failed = (consistency_check); \ - if (failed) \ - trace_kvm_nested_vmenter_failed(#consistency_check, 0); \ - failed; \ -}) +#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK /* * Hyper-V requires all of these, so mark them as supported even though @@ -2306,6 +2301,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); + secondary_exec_controls_set(vmx, exec_control); } @@ -3453,6 +3451,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); enum nested_evmptrld_status evmptrld_status; + ++vcpu->stat.nested_run; + if (!nested_vmx_check_permission(vcpu)) return 1; @@ -3810,9 +3810,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) /* * Process any exceptions that are not debug traps before MTF. + * + * Note that only a pending nested run can block a pending exception. + * Otherwise an injected NMI/interrupt should either be + * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, + * while delivering the pending exception. */ + if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { - if (block_nested_events) + if (vmx->nested.nested_run_pending) return -EBUSY; if (!nested_vmx_check_exception(vcpu, &exit_qual)) goto no_vmexit; @@ -3829,7 +3835,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) } if (vcpu->arch.exception.pending) { - if (block_nested_events) + if (vmx->nested.nested_run_pending) return -EBUSY; if (!nested_vmx_check_exception(vcpu, &exit_qual)) goto no_vmexit; @@ -4105,6 +4111,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; + if (to_vmx(vcpu)->exit_reason.enclave_mode) + vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; @@ -4422,6 +4430,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* Similarly, triple faults in L2 should never escape. */ + WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* Service the TLB flush request for L2 before switching to L1. */ @@ -4558,6 +4569,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx->fail = 0; } +static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) +{ + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); +} + /* * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). @@ -5479,16 +5495,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1; - kvm_mmu_unload(vcpu); mmu->ept_ad = accessed_dirty; mmu->mmu_role.base.ad_disabled = !accessed_dirty; vmcs12->ept_pointer = new_eptp; - /* - * TODO: Check what's the correct approach in case - * mmu reload fails. Currently, we just let the next - * reload potentially fail - */ - kvm_mmu_reload(vcpu); + + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); } return 0; @@ -5705,6 +5716,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 encls_leaf; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) + return false; + + encls_leaf = kvm_rax_read(vcpu); + if (encls_leaf > 62) + encls_leaf = 63; + return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); +} + static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, gpa_t bitmap) { @@ -5801,9 +5827,6 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ return true; - case EXIT_REASON_ENCLS: - /* SGX is never exposed to L1 */ - return true; default: break; } @@ -5927,6 +5950,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); + case EXIT_REASON_ENCLS: + return nested_vmx_exit_handled_encls(vcpu, vmcs12); default: return true; } @@ -6502,6 +6527,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->secondary_ctls_high |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (enable_sgx) + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, @@ -6599,6 +6627,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) struct kvm_x86_nested_ops vmx_nested_ops = { .check_events = vmx_check_nested_events, .hv_timer_pending = nested_vmx_preemption_timer_pending, + .triple_fault = nested_vmx_triple_fault, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, .get_nested_state_pages = vmx_get_nested_state_pages, diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 197148d76b8f..184418baeb3c 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -244,6 +244,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu) PIN_BASED_EXT_INTR_MASK; } +static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING); +} + /* * if fixed0[i] == 1: val[i] must be 1 * if fixed1[i] == 0: val[i] must be 0 diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c new file mode 100644 index 000000000000..6693ebdc0770 --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.c @@ -0,0 +1,502 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. */ + +#include <asm/sgx.h> + +#include "cpuid.h" +#include "kvm_cache_regs.h" +#include "nested.h" +#include "sgx.h" +#include "vmx.h" +#include "x86.h" + +bool __read_mostly enable_sgx = 1; +module_param_named(sgx, enable_sgx, bool, 0444); + +/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */ +static u64 sgx_pubkey_hash[4] __ro_after_init; + +/* + * ENCLS's memory operands use a fixed segment (DS) and a fixed + * address size based on the mode. Related prefixes are ignored. + */ +static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, + int size, int alignment, gva_t *gva) +{ + struct kvm_segment s; + bool fault; + + /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */ + *gva = offset; + if (!is_long_mode(vcpu)) { + vmx_get_segment(vcpu, &s, VCPU_SREG_DS); + *gva += s.base; + } + + if (!IS_ALIGNED(*gva, alignment)) { + fault = true; + } else if (likely(is_long_mode(vcpu))) { + fault = is_noncanonical_address(*gva, vcpu); + } else { + *gva &= 0xffffffff; + fault = (s.unusable) || + (s.type != 2 && s.type != 3) || + (*gva > s.limit) || + ((s.base != 0 || s.limit != 0xffffffff) && + (((u64)*gva + size - 1) > s.limit + 1)); + } + if (fault) + kvm_inject_gp(vcpu, 0); + return fault ? -EINVAL : 0; +} + +static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr, + unsigned int size) +{ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = addr; + vcpu->run->internal.data[1] = size; +} + +static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, + unsigned int size) +{ + if (__copy_from_user(data, (void __user *)hva, size)) { + sgx_handle_emulation_failure(vcpu, hva, size); + return -EFAULT; + } + + return 0; +} + +static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, + gpa_t *gpa) +{ + struct x86_exception ex; + + if (write) + *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex); + else + *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); + + if (*gpa == UNMAPPED_GVA) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return -EFAULT; + } + + return 0; +} + +static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva) +{ + *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa)); + if (kvm_is_error_hva(*hva)) { + sgx_handle_emulation_failure(vcpu, gpa, 1); + return -EFAULT; + } + + *hva |= gpa & ~PAGE_MASK; + + return 0; +} + +static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) +{ + struct x86_exception ex; + + /* + * A non-EPCM #PF indicates a bad userspace HVA. This *should* check + * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC, + * but the error code isn't (yet) plumbed through the ENCLS helpers. + */ + if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + /* + * If the guest thinks it's running on SGX2 hardware, inject an SGX + * #PF if the fault matches an EPCM fault signature (#GP on SGX1, + * #PF on SGX2). The assumption is that EPCM faults are much more + * likely than a bad userspace address. + */ + if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) && + guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) { + memset(&ex, 0, sizeof(ex)); + ex.vector = PF_VECTOR; + ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | + PFERR_SGX_MASK; + ex.address = gva; + ex.error_code_valid = true; + ex.nested_page_fault = false; + kvm_inject_page_fault(vcpu, &ex); + } else { + kvm_inject_gp(vcpu, 0); + } + return 1; +} + +static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, + struct sgx_pageinfo *pageinfo, + unsigned long secs_hva, + gva_t secs_gva) +{ + struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents; + struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1; + u64 attributes, xfrm, size; + u32 miscselect; + u8 max_size_log2; + int trapnr, ret; + + sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + if (!sgx_12_0 || !sgx_12_1) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + miscselect = contents->miscselect; + attributes = contents->attributes; + xfrm = contents->xfrm; + size = contents->size; + + /* Enforce restriction of access to the PROVISIONKEY. */ + if (!vcpu->kvm->arch.sgx_provisioning_allowed && + (attributes & SGX_ATTR_PROVISIONKEY)) { + if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) + pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n"); + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */ + if ((u32)miscselect & ~sgx_12_0->ebx || + (u32)attributes & ~sgx_12_1->eax || + (u32)(attributes >> 32) & ~sgx_12_1->ebx || + (u32)xfrm & ~sgx_12_1->ecx || + (u32)(xfrm >> 32) & ~sgx_12_1->edx) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restriction on max enclave size. */ + max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : + sgx_12_0->edx; + if (size >= BIT_ULL(max_size_log2)) + kvm_inject_gp(vcpu, 0); + + /* + * sgx_virt_ecreate() returns: + * 1) 0: ECREATE was successful + * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the + * exception number. + * 3) -EINVAL: access_ok() on @secs_hva failed. This should never + * happen as KVM checks host addresses at memslot creation. + * sgx_virt_ecreate() has already warned in this case. + */ + ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr); + if (!ret) + return kvm_skip_emulated_instruction(vcpu); + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + return ret; +} + +static int handle_encls_ecreate(struct kvm_vcpu *vcpu) +{ + gva_t pageinfo_gva, secs_gva; + gva_t metadata_gva, contents_gva; + gpa_t metadata_gpa, contents_gpa, secs_gpa; + unsigned long metadata_hva, contents_hva, secs_hva; + struct sgx_pageinfo pageinfo; + struct sgx_secs *contents; + struct x86_exception ex; + int r; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva)) + return 1; + + /* + * Copy the PAGEINFO to local memory, its pointers need to be + * translated, i.e. we need to do a deep copy/translate. + */ + r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo, + sizeof(pageinfo), &ex); + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return 1; + } else if (r != X86EMUL_CONTINUE) { + sgx_handle_emulation_failure(vcpu, pageinfo_gva, + sizeof(pageinfo)); + return 0; + } + + if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) || + sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096, + &contents_gva)) + return 1; + + /* + * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) || + sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. + */ + if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) || + sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva)) + return 0; + + /* + * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the + * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and + * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to + * enforce restriction of access to the PROVISIONKEY. + */ + contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (!contents) + return -ENOMEM; + + /* Exit to userspace if copying from a host userspace address fails. */ + if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) { + free_page((unsigned long)contents); + return 0; + } + + pageinfo.metadata = metadata_hva; + pageinfo.contents = (u64)contents; + + r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva); + + free_page((unsigned long)contents); + + return r; +} + +static int handle_encls_einit(struct kvm_vcpu *vcpu) +{ + unsigned long sig_hva, secs_hva, token_hva, rflags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gva_t sig_gva, secs_gva, token_gva; + gpa_t sig_gpa, secs_gpa, token_gpa; + int ret, trapnr; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) || + sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva)) + return 1; + + /* + * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) || + sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. Note, all structures are aligned and + * cannot split pages. + */ + if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) || + sgx_gpa_to_hva(vcpu, token_gpa, &token_hva)) + return 0; + + ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva, + (void __user *)secs_hva, + vmx->msr_ia32_sgxlepubkeyhash, &trapnr); + + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + /* + * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva, + * @token_hva or @secs_hva. This should never happen as KVM checks host + * addresses at memslot creation. sgx_virt_einit() has already warned + * in this case, so just return. + */ + if (ret < 0) + return ret; + + rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | + X86_EFLAGS_AF | X86_EFLAGS_SF | + X86_EFLAGS_OF); + if (ret) + rflags |= X86_EFLAGS_ZF; + else + rflags &= ~X86_EFLAGS_ZF; + vmx_set_rflags(vcpu, rflags); + + kvm_rax_write(vcpu, ret); + return kvm_skip_emulated_instruction(vcpu); +} + +static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) +{ + if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + return false; + + if (leaf >= ECREATE && leaf <= ETRACK) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX1); + + if (leaf >= EAUG && leaf <= EMODT) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); + + return false; +} + +static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu) +{ + const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED; + + return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits; +} + +int handle_encls(struct kvm_vcpu *vcpu) +{ + u32 leaf = (u32)kvm_rax_read(vcpu); + + if (!encls_leaf_enabled_in_guest(vcpu, leaf)) { + kvm_queue_exception(vcpu, UD_VECTOR); + } else if (!sgx_enabled_in_guest_bios(vcpu)) { + kvm_inject_gp(vcpu, 0); + } else { + if (leaf == ECREATE) + return handle_encls_ecreate(vcpu); + if (leaf == EINIT) + return handle_encls_einit(vcpu); + WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; + return 0; + } + return 1; +} + +void setup_default_sgx_lepubkeyhash(void) +{ + /* + * Use Intel's default value for Skylake hardware if Launch Control is + * not supported, i.e. Intel's hash is hardcoded into silicon, or if + * Launch Control is supported and enabled, i.e. mimic the reset value + * and let the guest write the MSRs at will. If Launch Control is + * supported but disabled, then use the current MSR values as the hash + * MSRs exist but are read-only (locked and not writable). + */ + if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || + rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { + sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; + sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; + sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; + sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; + } else { + /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); + } +} + +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash, + sizeof(sgx_pubkey_hash)); +} + +/* + * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM + * restrictions if the guest's allowed-1 settings diverge from hardware. + */ +static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *guest_cpuid; + u32 eax, ebx, ecx, edx; + + if (!vcpu->kvm->arch.sgx_provisioning_allowed) + return true; + + guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) + return true; + + guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx || + guest_cpuid->ecx != ecx || guest_cpuid->edx != edx) + return true; + + return false; +} + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + /* + * There is no software enable bit for SGX that is virtualized by + * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the + * guest (either by the host or by the guest's BIOS) but enabled in the + * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate + * the expected system behavior for ENCLS. + */ + u64 bitmap = -1ull; + + /* Nothing to do if hardware doesn't support SGX */ + if (!cpu_has_vmx_encls_vmexit()) + return; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) && + sgx_enabled_in_guest_bios(vcpu)) { + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + bitmap &= ~GENMASK_ULL(ETRACK, ECREATE); + if (sgx_intercept_encls_ecreate(vcpu)) + bitmap |= (1 << ECREATE); + } + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) + bitmap &= ~GENMASK_ULL(EMODT, EAUG); + + /* + * Trap and execute EINIT if launch control is enabled in the + * host using the guest's values for launch control MSRs, even + * if the guest's values are fixed to hardware default values. + * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing + * the MSRs is extraordinarily expensive. + */ + if (boot_cpu_has(X86_FEATURE_SGX_LC)) + bitmap |= (1 << EINIT); + + if (!vmcs12 && is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + if (vmcs12 && nested_cpu_has_encls_exit(vmcs12)) + bitmap |= vmcs12->encls_exiting_bitmap; + } + vmcs_write64(ENCLS_EXITING_BITMAP, bitmap); +} diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h new file mode 100644 index 000000000000..a400888b376d --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_SGX_H +#define __KVM_X86_SGX_H + +#include <linux/kvm_host.h> + +#include "capabilities.h" +#include "vmx_ops.h" + +#ifdef CONFIG_X86_SGX_KVM +extern bool __read_mostly enable_sgx; + +int handle_encls(struct kvm_vcpu *vcpu); + +void setup_default_sgx_lepubkeyhash(void); +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu); + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); +#else +#define enable_sgx 0 + +static inline void setup_default_sgx_lepubkeyhash(void) { } +static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { } + +static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + /* Nothing to do if hardware doesn't support SGX */ + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); +} +#endif + +#endif /* __KVM_X86_SGX_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index c8e51c004f78..034adb6404dc 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VMREAD_BITMAP, vmread_bitmap), FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), + FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 80232daf00ff..13494956d0e9 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -69,7 +69,8 @@ struct __packed vmcs12 { u64 vm_function_control; u64 eptp_list_address; u64 pml_address; - u64 padding64[3]; /* room for future expansion */ + u64 encls_exiting_bitmap; + u64 padding64[2]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(vm_function_control, 296); CHECK_OFFSET(eptp_list_address, 304); CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(encls_exiting_bitmap, 320); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 32cf8287d4a7..6501d66167b8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -57,6 +57,7 @@ #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "sgx.h" #include "trace.h" #include "vmcs.h" #include "vmcs12.h" @@ -472,26 +473,6 @@ static const u32 vmx_uret_msrs_list[] = { static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); -/* check_ept_pointer() should be under protection of ept_pointer_lock. */ -static void check_ept_pointer_match(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - u64 tmp_eptp = INVALID_PAGE; - int i; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!VALID_PAGE(tmp_eptp)) { - tmp_eptp = to_vmx(vcpu)->ept_pointer; - } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { - to_kvm_vmx(kvm)->ept_pointers_match - = EPT_POINTERS_MISMATCH; - return; - } - } - - to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; -} - static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, void *data) { @@ -501,47 +482,70 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush range->pages); } -static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm, - struct kvm_vcpu *vcpu, struct kvm_tlb_range *range) +static inline int hv_remote_flush_root_ept(hpa_t root_ept, + struct kvm_tlb_range *range) { - u64 ept_pointer = to_vmx(vcpu)->ept_pointer; - - /* - * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address - * of the base of EPT PML4 table, strip off EPT configuration - * information. - */ if (range) - return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK, + return hyperv_flush_guest_mapping_range(root_ept, kvm_fill_hv_flush_list_func, (void *)range); else - return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK); + return hyperv_flush_guest_mapping(root_ept); } static int hv_remote_flush_tlb_with_range(struct kvm *kvm, struct kvm_tlb_range *range) { + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); struct kvm_vcpu *vcpu; - int ret = 0, i; + int ret = 0, i, nr_unique_valid_roots; + hpa_t root; - spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + spin_lock(&kvm_vmx->hv_root_ept_lock); - if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) - check_ept_pointer_match(kvm); + if (!VALID_PAGE(kvm_vmx->hv_root_ept)) { + nr_unique_valid_roots = 0; - if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + /* + * Flush all valid roots, and see if all vCPUs have converged + * on a common root, in which case future flushes can skip the + * loop and flush the common root. + */ kvm_for_each_vcpu(i, vcpu, kvm) { - /* If ept_pointer is invalid pointer, bypass flush request. */ - if (VALID_PAGE(to_vmx(vcpu)->ept_pointer)) - ret |= __hv_remote_flush_tlb_with_range( - kvm, vcpu, range); + root = to_vmx(vcpu)->hv_root_ept; + if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept) + continue; + + /* + * Set the tracked root to the first valid root. Keep + * this root for the entirety of the loop even if more + * roots are encountered as a low effort optimization + * to avoid flushing the same (first) root again. + */ + if (++nr_unique_valid_roots == 1) + kvm_vmx->hv_root_ept = root; + + if (!ret) + ret = hv_remote_flush_root_ept(root, range); + + /* + * Stop processing roots if a failure occurred and + * multiple valid roots have already been detected. + */ + if (ret && nr_unique_valid_roots > 1) + break; } + + /* + * The optimized flush of a single root can't be used if there + * are multiple valid roots (obviously). + */ + if (nr_unique_valid_roots > 1) + kvm_vmx->hv_root_ept = INVALID_PAGE; } else { - ret = __hv_remote_flush_tlb_with_range(kvm, - kvm_get_vcpu(kvm, 0), range); + ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range); } - spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + spin_unlock(&kvm_vmx->hv_root_ept_lock); return ret; } static int hv_remote_flush_tlb(struct kvm *kvm) @@ -559,7 +563,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) * evmcs in singe VM shares same assist page. */ if (!*p_hv_pa_pg) - *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL); + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); if (!*p_hv_pa_pg) return -ENOMEM; @@ -576,6 +580,21 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) #endif /* IS_ENABLED(CONFIG_HYPERV) */ +static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept) +{ +#if IS_ENABLED(CONFIG_HYPERV) + struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); + + if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { + spin_lock(&kvm_vmx->hv_root_ept_lock); + to_vmx(vcpu)->hv_root_ept = root_ept; + if (root_ept != kvm_vmx->hv_root_ept) + kvm_vmx->hv_root_ept = INVALID_PAGE; + spin_unlock(&kvm_vmx->hv_root_ept_lock); + } +#endif +} + /* * Comment's format: document - errata name - stepping - processor name. * Refer from @@ -1570,12 +1589,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { + /* + * Emulation of instructions in SGX enclaves is impossible as RIP does + * not point tthe failing instruction, and even if it did, the code + * stream is inaccessible. Inject #UD instead of exiting to userspace + * so that guest userspace can't DoS the guest simply by triggering + * emulation (enclaves are CPL3 only). + */ + if (to_vmx(vcpu)->exit_reason.enclave_mode) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } return true; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { + union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; unsigned long rip, orig_rip; + u32 instr_len; /* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on @@ -1586,9 +1618,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) * i.e. we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + /* + * Emulating an enclave's instructions isn't supported as KVM + * cannot access the enclave's memory or its true RIP, e.g. the + * vmcs.GUEST_RIP points at the exit point of the enclave, not + * the RIP that actually triggered the VM-Exit. But, because + * most instructions that cause VM-Exit will #UD in an enclave, + * most instruction-based VM-Exits simply do not occur. + * + * There are a few exceptions, notably the debug instructions + * INT1ICEBRK and INT3, as they are allowed in debug enclaves + * and generate #DB/#BP as expected, which KVM might intercept. + * But again, the CPU does the dirty work and saves an instr + * length of zero so VMMs don't shoot themselves in the foot. + * WARN if KVM tries to skip a non-zero length instruction on + * a VM-Exit from an enclave. + */ + if (!instr_len) + goto rip_updated; + + WARN(exit_reason.enclave_mode, + "KVM: skipping instruction after SGX enclave VM-Exit"); + orig_rip = kvm_rip_read(vcpu); - rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + rip = orig_rip + instr_len; #ifdef CONFIG_X86_64 /* * We need to mask out the high 32 bits of RIP if not in 64-bit @@ -1604,6 +1660,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) return 0; } +rip_updated: /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); @@ -1865,6 +1922,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_FEAT_CTL: msr_info->data = vmx->msr_ia32_feature_control; break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + return 1; + msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash + [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; + break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; @@ -2158,6 +2222,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); + + /* SGX may be enabled/disabled by guest's firmware */ + vmx_write_encls_bitmap(vcpu, NULL); + break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + /* + * On real hardware, the LE hash MSRs are writable before + * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), + * at which point SGX related bits in IA32_FEATURE_CONTROL + * become writable. + * + * KVM does not emulate SGX activation for simplicity, so + * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL + * is unlocked. This is technically not architectural + * behavior, but it's close enough. + */ + if (!msr_info->host_initiated && + (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || + ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && + !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) + return 1; + vmx->msr_ia32_sgxlepubkeyhash + [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!msr_info->host_initiated) @@ -3088,8 +3175,7 @@ static int vmx_get_max_tdp_level(void) return 4; } -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, - int root_level) +u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { u64 eptp = VMX_EPTP_MT_WB; @@ -3098,13 +3184,13 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, if (enable_ept_ad_bits && (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) eptp |= VMX_EPTP_AD_ENABLE_BIT; - eptp |= (root_hpa & PAGE_MASK); + eptp |= root_hpa; return eptp; } -static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, - int pgd_level) +static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int root_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3112,16 +3198,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, u64 eptp; if (enable_ept) { - eptp = construct_eptp(vcpu, pgd, pgd_level); + eptp = construct_eptp(vcpu, root_hpa, root_level); vmcs_write64(EPT_POINTER, eptp); - if (kvm_x86_ops.tlb_remote_flush) { - spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); - to_vmx(vcpu)->ept_pointer = eptp; - to_kvm_vmx(kvm)->ept_pointers_match - = EPT_POINTERS_CHECK; - spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); - } + hv_track_root_ept(vcpu, root_hpa); if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; @@ -3131,7 +3211,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { - guest_cr3 = pgd; + guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu); } if (update_guest_cr3) @@ -4314,15 +4394,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) vmx->secondary_exec_control = exec_control; } -static void ept_set_mmio_spte_mask(void) -{ - /* - * EPT Misconfigurations can be generated if the value of bits 2:0 - * of an EPT paging-structure entry is 110b (write/execute). - */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0); -} - #define VMX_XSS_EXIT_BITMAP 0 /* @@ -4410,8 +4481,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } - if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmx_write_encls_bitmap(&vmx->vcpu, NULL); if (vmx_pt_mode_is_host_guest()) { memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); @@ -5184,17 +5254,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) return 1; } -static int handle_vmcall(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_hypercall(vcpu); -} - -static int handle_invd(struct kvm_vcpu *vcpu) -{ - /* Treat an INVD instruction as a NOP and just skip it. */ - return kvm_skip_emulated_instruction(vcpu); -} - static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); @@ -5203,28 +5262,6 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -static int handle_rdpmc(struct kvm_vcpu *vcpu) -{ - int err; - - err = kvm_rdpmc(vcpu); - return kvm_complete_insn_gp(vcpu, err); -} - -static int handle_wbinvd(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_wbinvd(vcpu); -} - -static int handle_xsetbv(struct kvm_vcpu *vcpu) -{ - u64 new_bv = kvm_read_edx_eax(vcpu); - u32 index = kvm_rcx_read(vcpu); - - int err = kvm_set_xcr(vcpu, index, new_bv); - return kvm_complete_insn_gp(vcpu, err); -} - static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { @@ -5384,6 +5421,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; + if (!vmx_can_emulate_instruction(vcpu, NULL, 0)) + return 1; + /* * A nested guest cannot optimize MMIO vmexits, because we have an * nGPA here instead of the required GPA. @@ -5485,18 +5525,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -static void vmx_enable_tdp(void) -{ - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, - enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - VMX_EPT_RWX_MASK, 0ull); - - ept_set_mmio_spte_mask(); -} - /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -5516,34 +5544,11 @@ static int handle_pause(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -static int handle_nop(struct kvm_vcpu *vcpu) -{ - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_mwait(struct kvm_vcpu *vcpu) -{ - printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); - return handle_nop(vcpu); -} - -static int handle_invalid_op(struct kvm_vcpu *vcpu) -{ - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; -} - static int handle_monitor_trap(struct kvm_vcpu *vcpu) { return 1; } -static int handle_monitor(struct kvm_vcpu *vcpu) -{ - printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); - return handle_nop(vcpu); -} - static int handle_invpcid(struct kvm_vcpu *vcpu) { u32 vmx_instruction_info; @@ -5632,16 +5637,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) return 1; } +#ifndef CONFIG_X86_SGX_KVM static int handle_encls(struct kvm_vcpu *vcpu) { /* - * SGX virtualization is not yet supported. There is no software - * enable bit for SGX, so we have to trap ENCLS and inject a #UD - * to prevent the guest from executing ENCLS. + * SGX virtualization is disabled. There is no software enable bit for + * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent + * the guest from executing ENCLS (when SGX is supported by hardware). */ kvm_queue_exception(vcpu, UD_VECTOR); return 1; } +#endif /* CONFIG_X86_SGX_KVM */ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) { @@ -5668,10 +5675,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, [EXIT_REASON_HLT] = kvm_emulate_halt, - [EXIT_REASON_INVD] = handle_invd, + [EXIT_REASON_INVD] = kvm_emulate_invd, [EXIT_REASON_INVLPG] = handle_invlpg, - [EXIT_REASON_RDPMC] = handle_rdpmc, - [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, + [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, @@ -5685,8 +5692,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_APIC_WRITE] = handle_apic_write, [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, - [EXIT_REASON_WBINVD] = handle_wbinvd, - [EXIT_REASON_XSETBV] = handle_xsetbv, + [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, + [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_GDTR_IDTR] = handle_desc, @@ -5694,13 +5701,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, + [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, [EXIT_REASON_INVEPT] = handle_vmx_instruction, [EXIT_REASON_INVVPID] = handle_vmx_instruction, - [EXIT_REASON_RDRAND] = handle_invalid_op, - [EXIT_REASON_RDSEED] = handle_invalid_op, + [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, + [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, @@ -5787,12 +5794,23 @@ static void vmx_dump_dtsel(char *name, uint32_t limit) vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } -void dump_vmcs(void) +static void vmx_dump_msrs(char *name, struct vmx_msrs *m) +{ + unsigned int i; + struct vmx_msr_entry *e; + + pr_err("MSR %s:\n", name); + for (i = 0, e = m->val; i < m->nr; ++i, ++e) + pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); +} + +void dump_vmcs(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; unsigned long cr4; - u64 efer; + int efer_slot; if (!dump_invalid_vmcs) { pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); @@ -5804,7 +5822,6 @@ void dump_vmcs(void) cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); - efer = vmcs_read64(GUEST_IA32_EFER); secondary_exec_control = 0; if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -5816,9 +5833,7 @@ void dump_vmcs(void) pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); - if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && - (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) - { + if (cpu_has_vmx_ept()) { pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", @@ -5841,10 +5856,20 @@ void dump_vmcs(void) vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); - if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || - (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) - pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - efer, vmcs_read64(GUEST_IA32_PAT)); + efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) + pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); + else if (efer_slot >= 0) + pr_err("EFER= 0x%016llx (autoload)\n", + vmx->msr_autoload.guest.val[efer_slot].value); + else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) + pr_err("EFER= 0x%016llx (effective)\n", + vcpu->arch.efer | (EFER_LMA | EFER_LME)); + else + pr_err("EFER= 0x%016llx (effective)\n", + vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) + pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", vmcs_read64(GUEST_IA32_DEBUGCTL), vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); @@ -5860,6 +5885,10 @@ void dump_vmcs(void) if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) pr_err("InterruptStatus = %04x\n", vmcs_read16(GUEST_INTR_STATUS)); + if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) + vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); + if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) + vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); pr_err("*** Host State ***\n"); pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", @@ -5881,14 +5910,16 @@ void dump_vmcs(void) vmcs_readl(HOST_IA32_SYSENTER_ESP), vmcs_read32(HOST_IA32_SYSENTER_CS), vmcs_readl(HOST_IA32_SYSENTER_EIP)); - if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) - pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - vmcs_read64(HOST_IA32_EFER), - vmcs_read64(HOST_IA32_PAT)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) + pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) + pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); if (cpu_has_load_perf_global_ctrl() && vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); + if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) + vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); pr_err("*** Control State ***\n"); pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", @@ -5997,7 +6028,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) } if (exit_reason.failed_vmentry) { - dump_vmcs(); + dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; @@ -6006,7 +6037,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) } if (unlikely(vmx->fail)) { - dump_vmcs(); + dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); @@ -6092,7 +6123,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) unexpected_vmexit: vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason.full); - dump_vmcs(); + dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; @@ -6976,6 +7007,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); + vcpu_setup_sgx_lepubkeyhash(vcpu); + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -6989,8 +7022,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->pi_desc.nv = POSTED_INTR_VECTOR; vmx->pi_desc.sn = 1; - vmx->ept_pointer = INVALID_PAGE; - +#if IS_ENABLED(CONFIG_HYPERV) + vmx->hv_root_ept = INVALID_PAGE; +#endif return 0; free_vmcs: @@ -7007,7 +7041,9 @@ free_vpid: static int vmx_vm_init(struct kvm *kvm) { - spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); +#if IS_ENABLED(CONFIG_HYPERV) + spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock); +#endif if (!ple_gap) kvm->arch.pause_in_guest = true; @@ -7302,6 +7338,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) set_cr4_guest_host_mask(vmx); + vmx_write_encls_bitmap(vcpu, NULL); + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + vmx->msr_ia32_feature_control_valid_bits |= + FEAT_CTL_SGX_LC_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= + ~FEAT_CTL_SGX_LC_ENABLED; + /* Refresh #PF interception to account for MAXPHYADDR changes. */ vmx_update_exception_bitmap(vcpu); } @@ -7322,6 +7371,13 @@ static __init void vmx_set_cpu_caps(void) if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (!enable_sgx) { + kvm_cpu_cap_clear(X86_FEATURE_SGX); + kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); + kvm_cpu_cap_clear(X86_FEATURE_SGX1); + kvm_cpu_cap_clear(X86_FEATURE_SGX2); + } + if (vmx_umip_emulated()) kvm_cpu_cap_set(X86_FEATURE_UMIP); @@ -7848,7 +7904,8 @@ static __init int hardware_setup(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ if (enable_ept) - vmx_enable_tdp(); + kvm_mmu_set_ept_masks(enable_ept_ad_bits, + cpu_has_vmx_ept_execute_only()); if (!enable_ept) ept_lpage_level = 0; @@ -7909,6 +7966,8 @@ static __init int hardware_setup(void) if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + setup_default_sgx_lepubkeyhash(); + if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 89da5e1251f1..19fe09fad2fe 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -325,7 +325,12 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; - u64 ept_pointer; + /* SGX Launch Control public key hash */ + u64 msr_ia32_sgxlepubkeyhash[4]; + +#if IS_ENABLED(CONFIG_HYPERV) + u64 hv_root_ept; +#endif struct pt_desc pt_desc; struct lbr_desc lbr_desc; @@ -338,12 +343,6 @@ struct vcpu_vmx { } shadow_msr_intercept; }; -enum ept_pointers_status { - EPT_POINTERS_CHECK = 0, - EPT_POINTERS_MATCH = 1, - EPT_POINTERS_MISMATCH = 2 -}; - struct kvm_vmx { struct kvm kvm; @@ -351,8 +350,10 @@ struct kvm_vmx { bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; - enum ept_pointers_status ept_pointers_match; - spinlock_t ept_pointer_lock; +#if IS_ENABLED(CONFIG_HYPERV) + hpa_t hv_root_ept; + spinlock_t hv_root_ept_lock; +#endif }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -376,8 +377,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, - int root_level); +u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); @@ -543,6 +543,6 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu); } -void dump_vmcs(void); +void dump_vmcs(struct kvm_vcpu *vcpu); #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 692b0c31c9c8..164b64f65a8f 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -37,6 +37,10 @@ static __always_inline void vmcs_check32(unsigned long field) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, "32-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "32-bit accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "32-bit accessor invalid for 64-bit high field"); BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, "32-bit accessor invalid for natural width field"); } |