diff options
Diffstat (limited to 'arch/x86/kvm/svm/svm.c')
| -rw-r--r-- | arch/x86/kvm/svm/svm.c | 2332 |
1 files changed, 1353 insertions, 979 deletions
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 58f0077d9357..f56c2d895011 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1,4 +1,4 @@ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> @@ -6,6 +6,7 @@ #include "mmu.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "smm.h" #include "cpuid.h" #include "pmu.h" @@ -26,8 +27,12 @@ #include <linux/swap.h> #include <linux/rwsem.h> #include <linux/cc_platform.h> +#include <linux/smp.h> +#include <linux/string_choices.h> +#include <linux/mutex.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -37,9 +42,11 @@ #include <asm/spec-ctrl.h> #include <asm/cpu_device_id.h> #include <asm/traps.h> +#include <asm/reboot.h> #include <asm/fpu/api.h> -#include <asm/virtext.h> +#include <trace/events/ipi.h> + #include "trace.h" #include "svm.h" @@ -49,6 +56,7 @@ #include "svm_onhyperv.h" MODULE_AUTHOR("Qumranet"); +MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions"); MODULE_LICENSE("GPL"); #ifdef MODULE @@ -64,8 +72,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); static bool erratum_383_found __read_mostly; -u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; - /* * Set osvw_len to higher value when updated Revision Guides * are published and we know what the new status bits are @@ -74,69 +80,6 @@ static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); -#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) - -static const struct svm_direct_access_msrs { - u32 index; /* Index of the MSR */ - bool always; /* True if intercept is initially cleared */ -} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { - { .index = MSR_STAR, .always = true }, - { .index = MSR_IA32_SYSENTER_CS, .always = true }, - { .index = MSR_IA32_SYSENTER_EIP, .always = false }, - { .index = MSR_IA32_SYSENTER_ESP, .always = false }, -#ifdef CONFIG_X86_64 - { .index = MSR_GS_BASE, .always = true }, - { .index = MSR_FS_BASE, .always = true }, - { .index = MSR_KERNEL_GS_BASE, .always = true }, - { .index = MSR_LSTAR, .always = true }, - { .index = MSR_CSTAR, .always = true }, - { .index = MSR_SYSCALL_MASK, .always = true }, -#endif - { .index = MSR_IA32_SPEC_CTRL, .always = false }, - { .index = MSR_IA32_PRED_CMD, .always = false }, - { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, - { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, - { .index = MSR_IA32_LASTINTFROMIP, .always = false }, - { .index = MSR_IA32_LASTINTTOIP, .always = false }, - { .index = MSR_EFER, .always = false }, - { .index = MSR_IA32_CR_PAT, .always = false }, - { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, - { .index = MSR_TSC_AUX, .always = false }, - { .index = X2APIC_MSR(APIC_ID), .always = false }, - { .index = X2APIC_MSR(APIC_LVR), .always = false }, - { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, - { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, - { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, - { .index = X2APIC_MSR(APIC_EOI), .always = false }, - { .index = X2APIC_MSR(APIC_RRR), .always = false }, - { .index = X2APIC_MSR(APIC_LDR), .always = false }, - { .index = X2APIC_MSR(APIC_DFR), .always = false }, - { .index = X2APIC_MSR(APIC_SPIV), .always = false }, - { .index = X2APIC_MSR(APIC_ISR), .always = false }, - { .index = X2APIC_MSR(APIC_TMR), .always = false }, - { .index = X2APIC_MSR(APIC_IRR), .always = false }, - { .index = X2APIC_MSR(APIC_ESR), .always = false }, - { .index = X2APIC_MSR(APIC_ICR), .always = false }, - { .index = X2APIC_MSR(APIC_ICR2), .always = false }, - - /* - * Note: - * AMD does not virtualize APIC TSC-deadline timer mode, but it is - * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, - * the AVIC hardware would generate GP fault. Therefore, always - * intercept the MSR 0x832, and do not setup direct_access_msr. - */ - { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, - { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, - { .index = X2APIC_MSR(APIC_LVT0), .always = false }, - { .index = X2APIC_MSR(APIC_LVT1), .always = false }, - { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, - { .index = X2APIC_MSR(APIC_TMICT), .always = false }, - { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, - { .index = X2APIC_MSR(APIC_TDCR), .always = false }, - { .index = MSR_INVALID, .always = false }, -}; - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -194,10 +137,10 @@ module_param_named(npt, npt_enabled, bool, 0444); /* allow nested virtualization in KVM/SVM */ static int nested = true; -module_param(nested, int, S_IRUGO); +module_param(nested, int, 0444); /* enable/disable Next RIP Save */ -static int nrips = true; +int nrips = true; module_param(nrips, int, 0444); /* enable/disable Virtual VMLOAD VMSAVE */ @@ -209,18 +152,13 @@ int vgif = true; module_param(vgif, int, 0444); /* enable/disable LBR virtualization */ -static int lbrv = true; +int lbrv = true; module_param(lbrv, int, 0444); static int tsc_scaling = true; module_param(tsc_scaling, int, 0444); -/* - * enable / disable AVIC. Because the defaults differ for APICv - * support between VMX and SVM we cannot use module_param_named. - */ -static bool avic; -module_param(avic, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -229,6 +167,8 @@ module_param(dump_invalid_vmcb, bool, 0644); bool intercept_smi = true; module_param(intercept_smi, bool, 0444); +bool vnmi = true; +module_param(vnmi, bool, 0444); static bool svm_gp_erratum_intercept = true; @@ -236,16 +176,9 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa"; static unsigned long iopm_base; -struct kvm_ldttss_desc { - u16 limit0; - u16 base0; - unsigned base1:8, type:5, dpl:2, p:1; - unsigned limit1:4, zero0:3, g:1, base2:8; - u32 base3; - u32 zero1; -} __attribute__((packed)); +DEFINE_PER_CPU(struct svm_cpu_data, svm_data); -DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +static DEFINE_MUTEX(vmcb_dump_mutex); /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via @@ -254,36 +187,7 @@ DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to * defer the restoration of TSC_AUX until the CPU returns to userspace. */ -static int tsc_aux_uret_slot __read_mostly = -1; - -static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; - -#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) -#define MSRS_RANGE_SIZE 2048 -#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) - -u32 svm_msrpm_offset(u32 msr) -{ - u32 offset; - int i; - - for (i = 0; i < NUM_MSR_MAPS; i++) { - if (msr < msrpm_ranges[i] || - msr >= msrpm_ranges[i] + MSRS_IN_RANGE) - continue; - - offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ - offset += (i * MSRS_RANGE_SIZE); /* add range offset */ - - /* Now we have the u8 offset - but need the u32 offset */ - return offset / 4; - } - - /* MSR not in any range */ - return MSR_INVALID; -} - -static void svm_flush_tlb_current(struct kvm_vcpu *vcpu); +int tsc_aux_uret_slot __ro_after_init = -1; static int get_npt_level(void) { @@ -346,12 +250,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) return 0; } -static int is_external_interrupt(u32 info) -{ - info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; - return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); -} - static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -374,6 +272,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, + int emul_type, bool commit_side_effects) { struct vcpu_svm *svm = to_svm(vcpu); @@ -395,7 +294,7 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, if (unlikely(!commit_side_effects)) old_rflags = svm->vmcb->save.rflags; - if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + if (!kvm_emulate_instruction(vcpu, emul_type)) return 0; if (unlikely(!commit_side_effects)) @@ -413,11 +312,13 @@ done: static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - return __svm_skip_emulated_instruction(vcpu, true); + return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true); } -static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector) { + const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT | + EMULTYPE_SET_SOFT_INT_VECTOR(vector); unsigned long rip, old_rip = kvm_rip_read(vcpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -433,7 +334,7 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) * in use, the skip must not commit any side effects such as clearing * the interrupt shadow or RFLAGS.RF. */ - if (!__svm_skip_emulated_instruction(vcpu, !nrips)) + if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips)) return -EIO; rip = kvm_rip_read(vcpu); @@ -469,7 +370,7 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(vcpu, ex); if (kvm_exception_is_soft(ex->vector) && - svm_update_soft_interrupt_rip(vcpu)) + svm_update_soft_interrupt_rip(vcpu, ex->vector)) return; svm->vmcb->control.event_inj = ex->vector @@ -481,24 +382,18 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) static void svm_init_erratum_383(void) { - u32 low, high; - int err; u64 val; if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) return; /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) + if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) return; val |= (1ULL << 47); - low = lower_32_bits(val); - high = upper_32_bits(val); - - native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + native_write_msr_safe(MSR_AMD64_DC_CFG, val); erratum_383_found = true; } @@ -524,80 +419,117 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) vcpu->arch.osvw.status |= 1; } -static int has_svm(void) +static bool __kvm_is_svm_supported(void) { - const char *msg; + int cpu = smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu); - if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svm: %s\n", msg); - return 0; + if (c->x86_vendor != X86_VENDOR_AMD && + c->x86_vendor != X86_VENDOR_HYGON) { + pr_err("CPU %d isn't AMD or Hygon\n", cpu); + return false; + } + + if (!cpu_has(c, X86_FEATURE_SVM)) { + pr_err("SVM not supported by CPU %d\n", cpu); + return false; } if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { pr_info("KVM is unsupported when running as an SEV guest\n"); - return 0; + return false; } - return 1; + return true; } -void __svm_write_tsc_multiplier(u64 multiplier) +static bool kvm_is_svm_supported(void) { - preempt_disable(); + bool supported; + + migrate_disable(); + supported = __kvm_is_svm_supported(); + migrate_enable(); + + return supported; +} + +static int svm_check_processor_compat(void) +{ + if (!__kvm_is_svm_supported()) + return -EIO; + + return 0; +} +static void __svm_write_tsc_multiplier(u64 multiplier) +{ if (multiplier == __this_cpu_read(current_tsc_ratio)) - goto out; + return; - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); -out: - preempt_enable(); } -static void svm_hardware_disable(void) +static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) +{ + return &sd->save_area->host_sev_es_save; +} + +static inline void kvm_cpu_svm_disable(void) +{ + uint64_t efer; + + wrmsrq(MSR_VM_HSAVE_PA, 0); + rdmsrq(MSR_EFER, efer); + if (efer & EFER_SVME) { + /* + * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and + * NMI aren't blocked. + */ + stgi(); + wrmsrq(MSR_EFER, efer & ~EFER_SVME); + } +} + +static void svm_emergency_disable_virtualization_cpu(void) +{ + kvm_rebooting = true; + + kvm_cpu_svm_disable(); +} + +static void svm_disable_virtualization_cpu(void) { /* Make sure we clean up behind us */ if (tsc_scaling) __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); - cpu_svm_disable(); + kvm_cpu_svm_disable(); amd_pmu_disable_virt(); } -static int svm_hardware_enable(void) +static int svm_enable_virtualization_cpu(void) { struct svm_cpu_data *sd; uint64_t efer; - struct desc_struct *gdt; int me = raw_smp_processor_id(); - rdmsrl(MSR_EFER, efer); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) return -EBUSY; - if (!has_svm()) { - pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); - return -EINVAL; - } - sd = per_cpu(svm_data, me); - if (!sd) { - pr_err("%s: svm_data is NULL on %d\n", __func__, me); - return -EINVAL; - } - + sd = per_cpu_ptr(&svm_data, me); sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - gdt = get_current_gdt_rw(); - sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); - - wrmsrl(MSR_EFER, efer | EFER_SVME); + wrmsrq(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area)); + wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* @@ -618,13 +550,12 @@ static int svm_hardware_enable(void) * erratum is present everywhere). */ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { - uint64_t len, status = 0; + u64 len, status = 0; int err; - len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); if (!err) - status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, - &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); if (err) osvw_status = osvw_len = 0; @@ -646,261 +577,231 @@ static int svm_hardware_enable(void) static void svm_cpu_uninit(int cpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - if (!sd) + if (!sd->save_area) return; - per_cpu(svm_data, cpu) = NULL; kfree(sd->sev_vmcbs); - __free_page(sd->save_area); - kfree(sd); + __free_page(__sme_pa_to_page(sd->save_area_pa)); + sd->save_area_pa = 0; + sd->save_area = NULL; } static int svm_cpu_init(int cpu) { - struct svm_cpu_data *sd; + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); + struct page *save_area_page; int ret = -ENOMEM; - sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); - if (!sd) + memset(sd, 0, sizeof(struct svm_cpu_data)); + save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); + if (!save_area_page) return ret; - sd->cpu = cpu; - sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!sd->save_area) - goto free_cpu_data; ret = sev_cpu_init(sd); if (ret) goto free_save_area; - per_cpu(svm_data, cpu) = sd; - + sd->save_area = page_address(save_area_page); + sd->save_area_pa = __sme_page_pa(save_area_page); return 0; free_save_area: - __free_page(sd->save_area); -free_cpu_data: - kfree(sd); + __free_page(save_area_page); return ret; } -static int direct_access_msr_slot(u32 msr) +static void set_dr_intercepts(struct vcpu_svm *svm) { - u32 i; + struct vmcb *vmcb = svm->vmcb01.ptr; - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) - if (direct_access_msrs[i].index == msr) - return i; + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - return -ENOENT; + recalc_intercepts(svm); } -static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read, - int write) +static void clr_dr_intercepts(struct vcpu_svm *svm) { - struct vcpu_svm *svm = to_svm(vcpu); - int slot = direct_access_msr_slot(msr); - - if (slot == -ENOENT) - return; - - /* Set the shadow bitmaps to the desired intercept states */ - if (read) - set_bit(slot, svm->shadow_msr_intercept.read); - else - clear_bit(slot, svm->shadow_msr_intercept.read); + struct vmcb *vmcb = svm->vmcb01.ptr; - if (write) - set_bit(slot, svm->shadow_msr_intercept.write); - else - clear_bit(slot, svm->shadow_msr_intercept.write); -} + vmcb->control.intercepts[INTERCEPT_DR] = 0; -static bool valid_msr_intercept(u32 index) -{ - return direct_access_msr_slot(index) != -ENOENT; + recalc_intercepts(svm); } static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) { - u8 bit_write; - unsigned long tmp; - u32 offset; - u32 *msrpm; - - msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: - to_svm(vcpu)->msrpm; - - offset = svm_msrpm_offset(msr); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - BUG_ON(offset == MSR_INVALID); + /* + * For non-nested case: + * If the L01 MSR bitmap does not intercept the MSR, then we need to + * save it. + * + * For nested case: + * If the L02 MSR bitmap does not intercept the MSR, then we need to + * save it. + */ + void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm : + to_svm(vcpu)->msrpm; - return !!test_bit(bit_write, &tmp); + return svm_test_msr_bitmap_write(msrpm, msr); } -static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, - u32 msr, int read, int write) +void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) { struct vcpu_svm *svm = to_svm(vcpu); - u8 bit_read, bit_write; - unsigned long tmp; - u32 offset; - - /* - * If this warning triggers extend the direct_access_msrs list at the - * beginning of the file - */ - WARN_ON(!valid_msr_intercept(msr)); - - /* Enforce non allowed MSRs to trap */ - if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) - read = 0; - - if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) - write = 0; + void *msrpm = svm->msrpm; - offset = svm_msrpm_offset(msr); - bit_read = 2 * (msr & 0x0f); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - BUG_ON(offset == MSR_INVALID); - - read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); - write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); + /* Don't disable interception for MSRs userspace wants to handle. */ + if (type & MSR_TYPE_R) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) + svm_clear_msr_bitmap_read(msrpm, msr); + else + svm_set_msr_bitmap_read(msrpm, msr); + } - msrpm[offset] = tmp; + if (type & MSR_TYPE_W) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) + svm_clear_msr_bitmap_write(msrpm, msr); + else + svm_set_msr_bitmap_write(msrpm, msr); + } svm_hv_vmcb_dirty_nested_enlightenments(vcpu); svm->nested.force_msr_bitmap_recalc = true; } -void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, - int read, int write) +void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) { - set_shadow_msr_intercept(vcpu, msr, read, write); - set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); -} - -u32 *svm_vcpu_alloc_msrpm(void) -{ - unsigned int order = get_order(MSRPM_SIZE); - struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order); - u32 *msrpm; + unsigned int order = get_order(size); + struct page *pages = alloc_pages(gfp_mask, order); + void *pm; if (!pages) return NULL; - msrpm = page_address(pages); - memset(msrpm, 0xff, PAGE_SIZE * (1 << order)); - - return msrpm; -} - -void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) -{ - int i; + /* + * Set all bits in the permissions map so that all MSR and I/O accesses + * are intercepted by default. + */ + pm = page_address(pages); + memset(pm, 0xff, PAGE_SIZE * (1 << order)); - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - if (!direct_access_msrs[i].always) - continue; - set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); - } + return pm; } -void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) +static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) { - int i; - - if (intercept == svm->x2avic_msrs_intercepted) - return; + struct vcpu_svm *svm = to_svm(vcpu); + bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); - if (avic_mode != AVIC_MODE_X2 || - !apic_x2apic_mode(svm->vcpu.arch.apic)) + if (intercept == svm->lbr_msrs_intercepted) return; - for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { - int index = direct_access_msrs[i].index; + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept); - if ((index < APIC_BASE_MSR) || - (index > APIC_BASE_MSR + 0xff)) - continue; - set_msr_interception(&svm->vcpu, svm->msrpm, index, - !intercept, !intercept); - } + if (sev_es_guest(vcpu->kvm)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); - svm->x2avic_msrs_intercepted = intercept; + svm->lbr_msrs_intercepted = intercept; } -void svm_vcpu_free_msrpm(u32 *msrpm) +void svm_vcpu_free_msrpm(void *msrpm) { __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); } -static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) +static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - u32 i; - - /* - * Set intercept permissions for all direct access MSRs again. They - * will automatically get filtered through the MSR filter, so we are - * back in sync after this. - */ - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 msr = direct_access_msrs[i].index; - u32 read = test_bit(i, svm->shadow_msr_intercept.read); - u32 write = test_bit(i, svm->shadow_msr_intercept.write); - - set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); - } -} - -static void add_msr_offset(u32 offset) -{ - int i; - for (i = 0; i < MSRPM_OFFSETS; ++i) { + svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - /* Offset already in list? */ - if (msrpm_offsets[i] == offset) - return; +#ifdef CONFIG_X86_64 + svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW); +#endif - /* Slot used by another offset? */ - if (msrpm_offsets[i] != MSR_INVALID) - continue; + if (lbrv) + svm_recalc_lbr_msr_intercepts(vcpu); - /* Add offset to list */ - msrpm_offsets[i] = offset; + if (cpu_feature_enabled(X86_FEATURE_IBPB)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, + !guest_has_pred_cmd_msr(vcpu)); - return; - } + if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); /* - * If this BUG triggers the msrpm_offsets table has an overflow. Just - * increase MSRPM_OFFSETS in this case. + * Disable interception of SPEC_CTRL if KVM doesn't need to manually + * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if + * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively + * using SPEC_CTRL. */ - BUG(); -} + if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, + !guest_has_spec_ctrl_msr(vcpu)); + else + svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, + !svm->spec_ctrl); -static void init_msrpm_offsets(void) -{ - int i; + /* + * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU, + * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits. + */ + svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW, + guest_cpuid_is_intel_compatible(vcpu)); + svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW, + guest_cpuid_is_intel_compatible(vcpu)); + + if (kvm_aperfmperf_in_guest(vcpu->kvm)) { + svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); + } - memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 offset; + svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled); + } - offset = svm_msrpm_offset(direct_access_msrs[i].index); - BUG_ON(offset == MSR_INVALID); + if (sev_es_guest(vcpu->kvm)) + sev_es_recalc_msr_intercepts(vcpu); - add_msr_offset(offset); - } + /* + * x2APIC intercepts are modified on-demand and cannot be filtered by + * userspace. + */ } void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) @@ -914,91 +815,43 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) vmcb_mark_dirty(to_vmcb, VMCB_LBR); } -static void svm_enable_lbrv(struct kvm_vcpu *vcpu) +static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); - - /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ - if (is_guest_mode(vcpu)) - svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); + to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; } -static void svm_disable_lbrv(struct kvm_vcpu *vcpu) +void svm_enable_lbrv(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); - - /* - * Move the LBR msrs back to the vmcb01 to avoid copying them - * on nested guest entries. - */ - if (is_guest_mode(vcpu)) - svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); + __svm_enable_lbrv(vcpu); + svm_recalc_lbr_msr_intercepts(vcpu); } -static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index) +static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) { - /* - * If the LBR virtualization is disabled, the LBR msrs are always - * kept in the vmcb01 to avoid copying them on nested guest entries. - * - * If nested, and the LBR virtualization is enabled/disabled, the msrs - * are moved between the vmcb01 and vmcb02 as needed. - */ - struct vmcb *vmcb = - (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ? - svm->vmcb : svm->vmcb01.ptr; - - switch (index) { - case MSR_IA32_DEBUGCTLMSR: - return vmcb->save.dbgctl; - case MSR_IA32_LASTBRANCHFROMIP: - return vmcb->save.br_from; - case MSR_IA32_LASTBRANCHTOIP: - return vmcb->save.br_to; - case MSR_IA32_LASTINTFROMIP: - return vmcb->save.last_excp_from; - case MSR_IA32_LASTINTTOIP: - return vmcb->save.last_excp_to; - default: - KVM_BUG(false, svm->vcpu.kvm, - "%s: Unknown MSR 0x%x", __func__, index); - return 0; - } + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; } void svm_update_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; + bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) || + (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); - bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) & - DEBUGCTLMSR_LBR; + if (enable_lbrv && !current_enable_lbrv) + __svm_enable_lbrv(vcpu); + else if (!enable_lbrv && current_enable_lbrv) + __svm_disable_lbrv(vcpu); - bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext & - LBR_CTL_ENABLE_MASK); - - if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled)) - if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)) - enable_lbrv = true; - - if (enable_lbrv == current_enable_lbrv) - return; - - if (enable_lbrv) - svm_enable_lbrv(vcpu); - else - svm_disable_lbrv(vcpu); + /* + * During nested transitions, it is possible that the current VMCB has + * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa). + * In this case, even though LBR_CTL does not need an update, intercepts + * do, so always recalculate the intercepts here. + */ + svm_recalc_lbr_msr_intercepts(vcpu); } void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -1060,13 +913,14 @@ static void svm_hardware_unsetup(void) { int cpu; + avic_hardware_unsetup(); + sev_hardware_unsetup(); for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), - get_order(IOPM_SIZE)); + __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE)); iopm_base = 0; } @@ -1101,63 +955,51 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return svm->tsc_ratio_msr; } -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset; - svm->vmcb->control.tsc_offset = offset; + svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) { - __svm_write_tsc_multiplier(multiplier); + preempt_disable(); + if (to_svm(vcpu)->guest_state_loaded) + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + preempt_enable(); } - /* Evaluate instruction intercepts that depend on guest CPUID features. */ -static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, - struct vcpu_svm *svm) +static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + /* * Intercept INVPCID if shadow paging is enabled to sync/free shadow * roots, or if INVPCID is disabled in the guest to inject #UD. */ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { if (!npt_enabled || - !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID)) + !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID)) svm_set_intercept(svm, INTERCEPT_INVPCID); else svm_clr_intercept(svm, INTERCEPT_INVPCID); } if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { - if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP)) svm_clr_intercept(svm, INTERCEPT_RDTSCP); else svm_set_intercept(svm, INTERCEPT_RDTSCP); } -} - -static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - if (guest_cpuid_is_intel(vcpu)) { - /* - * We must intercept SYSENTER_EIP and SYSENTER_ESP - * accesses because the processor only stores 32 bits. - * For the same reason we cannot use virtual VMLOAD/VMSAVE. - */ + if (guest_cpuid_is_intel_compatible(vcpu)) { svm_set_intercept(svm, INTERCEPT_VMLOAD); svm_set_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); - - svm->v_vmload_vmsave_enabled = false; } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -1168,13 +1010,16 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } - /* No need to intercept these MSRs */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); } } -static void init_vmcb(struct kvm_vcpu *vcpu) +static void svm_recalc_intercepts(struct kvm_vcpu *vcpu) +{ + svm_recalc_instruction_intercepts(vcpu); + svm_recalc_msr_intercepts(vcpu); +} + +static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb01.ptr; @@ -1201,10 +1046,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway - * as VMware does. Don't intercept #GP for SEV guests as KVM can't - * decrypt guest memory to decode the faulting instruction. + * as VMware does. */ - if (enable_vmware_backdoor && !sev_guest(vcpu->kvm)) + if (enable_vmware_backdoor) set_exception_intercept(svm, GP_VECTOR); svm_set_intercept(svm, INTERCEPT_INTR); @@ -1240,10 +1084,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_MWAIT); } - if (!kvm_hlt_in_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_HLT); + if (!kvm_hlt_in_guest(vcpu->kvm)) { + if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT)) + svm_set_intercept(svm, INTERCEPT_IDLE_HLT); + else + svm_set_intercept(svm, INTERCEPT_HLT); + } - control->iopm_base_pa = __sme_set(iopm_base); + control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); control->int_ctl = V_INTR_MASKING_MASK; @@ -1293,29 +1141,27 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_PAUSE); } - svm_recalc_instruction_intercepts(vcpu, svm); - - /* - * If the host supports V_SPEC_CTRL then disable the interception - * of MSR_IA32_SPEC_CTRL. - */ - if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); - if (kvm_vcpu_apicv_active(vcpu)) avic_init_vmcb(svm, vmcb); + if (vnmi) + svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK; + if (vgif) { svm_clr_intercept(svm, INTERCEPT_STGI); svm_clr_intercept(svm, INTERCEPT_CLGI); svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vcpu->kvm->arch.bus_lock_detection_enabled) + svm_set_intercept(svm, INTERCEPT_BUSLOCK); + if (sev_guest(vcpu->kvm)) - sev_init_vmcb(svm); + sev_init_vmcb(svm, init_event); svm_hv_init_vmcb(vmcb); - init_vmcb_after_set_cpuid(vcpu); + + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); vmcb_mark_all_dirty(vmcb); @@ -1326,14 +1172,14 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm_init_osvw(vcpu); - vcpu->arch.microcode_version = 0x01000065; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) + vcpu->arch.microcode_version = 0x01000065; svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; - if (sev_es_guest(vcpu->kvm)) - sev_es_vcpu_reset(svm); + svm->nmi_masked = false; + svm->awaiting_iret_completion = false; } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -1343,7 +1189,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - init_vmcb(vcpu); + init_vmcb(vcpu, init_event); if (!init_event) __svm_vcpu_reset(vcpu); @@ -1355,102 +1201,134 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) svm->vmcb = target_vmcb->ptr; } +static int svm_vcpu_precreate(struct kvm *kvm) +{ + return avic_alloc_physical_id_table(kvm); +} + static int svm_vcpu_create(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb01_page; - struct page *vmsa_page = NULL; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); svm = to_svm(vcpu); err = -ENOMEM; - vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + vmcb01_page = snp_safe_alloc_page(); if (!vmcb01_page) goto out; - if (sev_es_guest(vcpu->kvm)) { - /* - * SEV-ES guests require a separate VMSA page used to contain - * the encrypted register state of the guest. - */ - vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!vmsa_page) - goto error_free_vmcb_page; - - /* - * SEV-ES guests maintain an encrypted version of their FPU - * state which is restored and saved on VMRUN and VMEXIT. - * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't - * do xsave/xrstor on it. - */ - fpstate_set_confidential(&vcpu->arch.guest_fpu); - } + err = sev_vcpu_create(vcpu); + if (err) + goto error_free_vmcb_page; err = avic_init_vcpu(svm); if (err) - goto error_free_vmsa_page; + goto error_free_sev; svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; - goto error_free_vmsa_page; + goto error_free_sev; } svm->x2avic_msrs_intercepted = true; + svm->lbr_msrs_intercepted = true; svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); - if (vmsa_page) - svm->sev_es.vmsa = page_address(vmsa_page); - svm->guest_state_loaded = false; return 0; -error_free_vmsa_page: - if (vmsa_page) - __free_page(vmsa_page); +error_free_sev: + sev_free_vcpu(vcpu); error_free_vmcb_page: __free_page(vmcb01_page); out: return err; } -static void svm_clear_current_vmcb(struct vmcb *vmcb) +static void svm_vcpu_free(struct kvm_vcpu *vcpu) { - int i; + struct vcpu_svm *svm = to_svm(vcpu); + + WARN_ON_ONCE(!list_empty(&svm->ir_list)); + + svm_leave_nested(vcpu); + svm_free_nested(svm); - for_each_online_cpu(i) - cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); + sev_free_vcpu(vcpu); + + __free_page(__sme_pa_to_page(svm->vmcb01.pa)); + svm_vcpu_free_msrpm(svm->msrpm); } -static void svm_vcpu_free(struct kvm_vcpu *vcpu) +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) { - struct vcpu_svm *svm = to_svm(vcpu); + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So, ensure that no logical CPU has this - * vmcb page recorded as its current vmcb. + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. */ - svm_clear_current_vmcb(svm->vmcb); + if (atomic_read(&srso_nr_vms)) + return; - svm_free_nested(svm); + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} - sev_free_vcpu(vcpu); +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; - __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT)); - __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); } +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); if (sev_es_guest(vcpu->kvm)) sev_es_unmap_ghcb(svm); @@ -1462,20 +1340,28 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * Save additional host state that will be restored on VMEXIT (sev-es) * or subsequent vmload of host save area. */ - vmsave(__sme_page_pa(sd->save_area)); - if (sev_es_guest(vcpu->kvm)) { - struct sev_es_save_area *hostsa; - hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); - - sev_es_prepare_switch_to_guest(hostsa); - } + vmsave(sd->save_area_pa); + if (sev_es_guest(vcpu->kvm)) + sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd)); if (tsc_scaling) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); - if (likely(tsc_aux_uret_slot >= 0)) + /* + * TSC_AUX is always virtualized (context switched by hardware) for + * SEV-ES guests when the feature is available. For non-SEV-ES guests, + * context switch TSC_AUX via the user_return MSR infrastructure (not + * all CPUs support TSC_AUX virtualization). + */ + if (likely(tsc_aux_uret_slot >= 0) && + (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -1486,13 +1372,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) + shrink_ple_window(vcpu); - if (sd->current_vmcb != svm->vmcb) { - sd->current_vmcb = svm->vmcb; - indirect_branch_prediction_barrier(); - } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); } @@ -1574,6 +1456,16 @@ static void svm_set_vintr(struct vcpu_svm *svm) svm_set_intercept(svm, INTERCEPT_VINTR); /* + * Recalculating intercepts may have cleared the VINTR intercept. If + * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF + * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN. + * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as + * interrupts will never be unblocked while L2 is running. + */ + if (!svm_is_intercept(svm, INTERCEPT_VINTR)) + return; + + /* * This is just a dummy VINTR to actually cause a vmexit to happen. * Actual injection of virtual interrupts happens through EVENTINJ. */ @@ -1769,6 +1661,11 @@ static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } } +static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + return true; +} + void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1776,15 +1673,17 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) bool old_paging = is_paging(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { vcpu->arch.efer |= EFER_LMA; - svm->vmcb->save.efer |= EFER_LMA | EFER_LME; + if (!vcpu->arch.guest_state_protected) + svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { vcpu->arch.efer &= ~EFER_LMA; - svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); + if (!vcpu->arch.guest_state_protected) + svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } #endif @@ -1834,9 +1733,6 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; unsigned long old_cr4 = vcpu->arch.cr4; - if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb_current(vcpu); - vcpu->arch.cr4 = cr4; if (!npt_enabled) { cr4 |= X86_CR4_PAE; @@ -1849,7 +1745,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1908,11 +1804,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -1925,7 +1821,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (vcpu->arch.guest_state_protected) + if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm))) return; get_debugreg(vcpu->arch.db[0], 0); @@ -1969,15 +1865,33 @@ static int pf_interception(struct kvm_vcpu *vcpu) static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + int rc; u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; + /* + * WARN if hardware generates a fault with an error code that collides + * with KVM-defined sythentic flags. Clear the flags and continue on, + * i.e. don't terminate the VM, as KVM can't possibly be relying on a + * flag that KVM doesn't know about. + */ + if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) + error_code &= ~PFERR_SYNTHETIC_MASK; + + if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) + error_code |= PFERR_PRIVATE_ACCESS; + trace_kvm_page_fault(vcpu, fault_address, error_code); - return kvm_mmu_page_fault(vcpu, fault_address, error_code, - static_cpu_has(X86_FEATURE_DECODEASSISTS) ? - svm->vmcb->control.insn_bytes : NULL, - svm->vmcb->control.insn_len); + rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? + svm->vmcb->control.insn_bytes : NULL, + svm->vmcb->control.insn_len); + + if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) + sev_handle_rmp_fault(vcpu, fault_address, error_code); + + return rc; } static int db_interception(struct kvm_vcpu *vcpu) @@ -2037,14 +1951,13 @@ static int ac_interception(struct kvm_vcpu *vcpu) static bool is_erratum_383(void) { - int err, i; + int i; u64 value; if (!erratum_383_found) return false; - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) + if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) return false; /* Bit 62 may or may not be set for this mce */ @@ -2055,17 +1968,11 @@ static bool is_erratum_383(void) /* Clear MCi_STATUS registers */ for (i = 0; i < 6; ++i) - native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); + if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + native_write_msr_safe(MSR_IA32_MCG_STATUS, value); } /* Flush tlb to evict multi-match entries */ @@ -2081,7 +1988,7 @@ static void svm_handle_mce(struct kvm_vcpu *vcpu) * Erratum 383 triggered. Guest state is corrupt so kill the * guest. */ - pr_err("KVM: Guest triggered AMD Erratum 383\n"); + pr_err("Guest triggered AMD Erratum 383\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -2105,12 +2012,6 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; struct vcpu_svm *svm = to_svm(vcpu); - /* - * The VM save area has already been encrypted so it - * cannot be reinitialized - just terminate. - */ - if (sev_es_guest(vcpu->kvm)) - return -EINVAL; /* * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put @@ -2119,9 +2020,18 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) * userspace. At a platform view, INIT is acceptable behavior as * there exist bare metal platforms that automatically INIT the CPU * in response to shutdown. + * + * The VM save area for SEV-ES guests has already been encrypted so it + * cannot be reinitialized, i.e. synthesizing INIT is futile. */ - clear_page(svm->vmcb); - kvm_vcpu_reset(vcpu, true); + if (!sev_es_guest(vcpu->kvm)) { + clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif + kvm_vcpu_reset(vcpu, true); + } kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -2197,7 +2107,7 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) svm_copy_vmloadsave_state(vmcb12, svm->vmcb); } - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -2470,16 +2380,30 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) has_error_code, error_code); } +static void svm_clr_iret_intercept(struct vcpu_svm *svm) +{ + if (!sev_es_guest(svm->vcpu.kvm)) + svm_clr_intercept(svm, INTERCEPT_IRET); +} + +static void svm_set_iret_intercept(struct vcpu_svm *svm) +{ + if (!sev_es_guest(svm->vcpu.kvm)) + svm_set_intercept(svm, INTERCEPT_IRET); +} + static int iret_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON_ONCE(sev_es_guest(vcpu->kvm)); + ++vcpu->stat.nmi_window_exits; - vcpu->arch.hflags |= HF_IRET_MASK; - if (!sev_es_guest(vcpu->kvm)) { - svm_clr_intercept(svm, INTERCEPT_IRET); - svm->nmi_iret_rip = kvm_rip_read(vcpu); - } + svm->awaiting_iret_completion = true; + + svm_clr_iret_intercept(svm); + svm->nmi_iret_rip = kvm_rip_read(vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -2640,9 +2564,15 @@ static int dr_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); int reg, dr; - unsigned long val; int err = 0; + /* + * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT + * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early. + */ + if (sev_es_guest(vcpu->kvm)) + return 1; + if (vcpu->guest_debug == 0) { /* * No more DR vmexits; force a reload of the debug registers @@ -2661,11 +2591,9 @@ static int dr_interception(struct kvm_vcpu *vcpu) dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; if (dr >= 16) { /* mov to DRn */ dr -= 16; - val = kvm_register_read(vcpu, reg); - err = kvm_set_dr(vcpu, dr, val); + err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); } else { - kvm_get_dr(vcpu, dr, &val); - kvm_register_write(vcpu, reg, val); + kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); } return kvm_complete_insn_gp(vcpu, err); @@ -2705,31 +2633,43 @@ static int efer_trap(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, ret); } -static int svm_get_msr_feature(struct kvm_msr_entry *msr) +static int svm_get_feature_msr(u32 msr, u64 *data) { - msr->data = 0; + *data = 0; - switch (msr->index) { - case MSR_F10H_DECFG: - if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; + switch (msr) { + case MSR_AMD64_DE_CFG: + if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) + *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; - case MSR_IA32_PERF_CAPABILITIES: - return 0; default: - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; } +static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, + struct msr_data *msr_info) +{ + return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected && + msr_info->index != MSR_IA32_XSS && + !msr_write_intercepted(vcpu, msr_info->index); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); + if (sev_es_prevent_msr_access(vcpu, msr_info)) { + msr_info->data = 0; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; + } + switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: - if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) return 1; msr_info->data = svm->tsc_ratio_msr; break; @@ -2743,6 +2683,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CSTAR: msr_info->data = svm->vmcb01.ptr->save.cstar; break; + case MSR_GS_BASE: + msr_info->data = svm->vmcb01.ptr->save.gs.base; + break; + case MSR_FS_BASE: + msr_info->data = svm->vmcb01.ptr->save.fs.base; + break; case MSR_KERNEL_GS_BASE: msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; break; @@ -2755,23 +2701,40 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SYSENTER_EIP: msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; - if (guest_cpuid_is_intel(vcpu)) + if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_eip_hi << 32; break; case MSR_IA32_SYSENTER_ESP: msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; - if (guest_cpuid_is_intel(vcpu)) + if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; + case MSR_IA32_S_CET: + msr_info->data = svm->vmcb->save.s_cet; + break; + case MSR_IA32_INT_SSP_TAB: + msr_info->data = svm->vmcb->save.isst_addr; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + msr_info->data = svm->vmcb->save.ssp; + break; case MSR_TSC_AUX: msr_info->data = svm->tsc_aux; break; case MSR_IA32_DEBUGCTLMSR: + msr_info->data = svm->vmcb->save.dbgctl; + break; case MSR_IA32_LASTBRANCHFROMIP: + msr_info->data = svm->vmcb->save.br_from; + break; case MSR_IA32_LASTBRANCHTOIP: + msr_info->data = svm->vmcb->save.br_to; + break; case MSR_IA32_LASTINTFROMIP: + msr_info->data = svm->vmcb->save.last_excp_from; + break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm_get_lbr_msr(svm, msr_info->index); + msr_info->data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; @@ -2791,7 +2754,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; msr_info->data = svm->virt_spec_ctrl; @@ -2813,7 +2776,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x1E; } break; - case MSR_F10H_DECFG: + case MSR_AMD64_DE_CFG: msr_info->data = svm->msr_decfg; break; default: @@ -2828,11 +2791,7 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, - X86_TRAP_GP | - SVM_EVTINJ_TYPE_EXEPT | - SVM_EVTINJ_VALID); + svm_vmgexit_inject_exception(svm, X86_TRAP_GP); return 1; } @@ -2864,14 +2823,18 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct vcpu_svm *svm = to_svm(vcpu); - int r; + int ret = 0; u32 ecx = msr->index; u64 data = msr->data; + + if (sev_es_prevent_msr_access(vcpu, msr)) + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; + switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!svm->tsc_scaling_enabled) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) { if (!msr->host_initiated) return 1; @@ -2893,14 +2856,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_ratio_msr = data; - if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && + is_guest_mode(vcpu)) nested_svm_update_tsc_ratio_msr(vcpu); break; case MSR_IA32_CR_PAT: - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) - return 1; - vcpu->arch.pat = data; + ret = kvm_set_msr_common(vcpu, msr); + if (ret) + break; + svm->vmcb01.ptr->save.g_pat = data; if (is_guest_mode(vcpu)) nested_vmcb02_compute_g_pat(svm); @@ -2928,30 +2893,15 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * * For nested: * The handling of the MSR bitmap for L2 guests is done in - * nested_svm_vmrun_msrpm. + * nested_svm_merge_msrpm(). * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); - break; - case MSR_IA32_PRED_CMD: - if (!msr->host_initiated && - !guest_has_pred_cmd_msr(vcpu)) - return 1; - - if (data & ~PRED_CMD_IBPB) - return 1; - if (!boot_cpu_has(X86_FEATURE_IBPB)) - return 1; - if (!data) - break; - - wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; if (data & ~SPEC_CTRL_SSBD) @@ -2969,6 +2919,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_CSTAR: svm->vmcb01.ptr->save.cstar = data; break; + case MSR_GS_BASE: + svm->vmcb01.ptr->save.gs.base = data; + break; + case MSR_FS_BASE: + svm->vmcb01.ptr->save.fs.base = data; + break; case MSR_KERNEL_GS_BASE: svm->vmcb01.ptr->save.kernel_gs_base = data; break; @@ -2988,42 +2944,70 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * 32 bit part of these msrs to support Intel's * implementation of SYSENTER/SYSEXIT. */ - svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; + svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; break; case MSR_IA32_SYSENTER_ESP: svm->vmcb01.ptr->save.sysenter_esp = (u32)data; - svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; + svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; + break; + case MSR_IA32_S_CET: + svm->vmcb->save.s_cet = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; + case MSR_IA32_INT_SSP_TAB: + svm->vmcb->save.isst_addr = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + svm->vmcb->save.ssp = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); break; case MSR_TSC_AUX: /* + * TSC_AUX is always virtualized for SEV-ES guests when the + * feature is available. The user return MSR support is not + * required in this case because TSC_AUX is restored on #VMEXIT + * from the host save area. + */ + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) + break; + + /* * TSC_AUX is usually changed only during boot and never read - * directly. Intercept TSC_AUX instead of exposing it to the - * guest via direct_access_msrs, and switch it via user return. + * directly. Intercept TSC_AUX and switch it via user return. */ preempt_disable(); - r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); + ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); preempt_enable(); - if (r) - return 1; + if (ret) + break; svm->tsc_aux = data; break; case MSR_IA32_DEBUGCTLMSR: if (!lbrv) { - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } + + /* + * Suppress BTF as KVM doesn't virtualize BTF, but there's no + * way to communicate lack of support to the guest. + */ + if (data & DEBUGCTLMSR_BTF) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + data &= ~DEBUGCTLMSR_BTF; + } + if (data & DEBUGCTL_RESERVED_BITS) return 1; - if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) - svm->vmcb->save.dbgctl = data; - else - svm->vmcb01.ptr->save.dbgctl = data; + if (svm->vmcb->save.dbgctl == data) + break; + svm->vmcb->save.dbgctl = data; + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); svm_update_lbrv(vcpu); - break; case MSR_VM_HSAVE_PA: /* @@ -3040,21 +3024,15 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; - case MSR_F10H_DECFG: { - struct kvm_msr_entry msr_entry; - - msr_entry.index = msr->index; - if (svm_get_msr_feature(&msr_entry)) - return 1; + case MSR_AMD64_DE_CFG: { + u64 supported_de_cfg; - /* Check the supported bits */ - if (data & ~msr_entry.data) + if (svm_get_feature_msr(ecx, &supported_de_cfg)) return 1; - /* Don't allow the guest to change a bit, #GP */ - if (!msr->host_initiated && (data ^ msr_entry.data)) + if (data & ~supported_de_cfg) return 1; svm->msr_decfg = data; @@ -3063,7 +3041,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) default: return kvm_set_msr_common(vcpu, msr); } - return 0; + return ret; } static int msr_interception(struct kvm_vcpu *vcpu) @@ -3119,7 +3097,7 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) unsigned long type; gva_t gva; - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -3132,9 +3110,51 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; + /* + * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the + * stack segment is used. The intercept takes priority over all + * #GP checks except CPL>0, but somehow still generates a linear + * address? The APM is sorely lacking. + */ + if (is_noncanonical_address(gva, vcpu, 0)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + return kvm_handle_invpcid(vcpu, type, gva); } +static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest + * execute the bus-locking instruction. Set the bus lock counter to '1' + * to effectively step past the bus lock. + */ + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) + svm->vmcb->control.bus_lock_counter = 1; + + return 1; +} + +static int bus_lock_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = complete_userspace_buslock; + + if (is_guest_mode(vcpu)) + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + + return 0; +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3202,11 +3222,15 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, [SVM_EXIT_INVPCID] = invpcid_interception, + [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, +#ifdef CONFIG_KVM_AMD_SEV [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, +#endif }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -3215,14 +3239,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; + char *vm_type; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return; } - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + guard(mutex)(&vmcb_dump_mutex); + + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : + sev_es_guest(vcpu->kvm) ? "SEV-ES" : + sev_guest(vcpu->kvm) ? "SEV" : "SVM"; + + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3260,6 +3291,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); + + if (sev_es_guest(vcpu->kvm)) { + save = sev_decrypt_vmsa(vcpu); + if (!save) + goto no_vmsa; + + save01 = save; + } + pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3314,6 +3356,10 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "rsp:", save->rsp, "rax:", save->rax); pr_err("%-15s %016llx %-13s %016llx\n", + "s_cet:", save->s_cet, "ssp:", save->ssp); + pr_err("%-15s %016llx\n", + "isst_addr:", save->isst_addr); + pr_err("%-15s %016llx %-13s %016llx\n", "star:", save01->star, "lstar:", save01->lstar); pr_err("%-15s %016llx %-13s %016llx\n", "cstar:", save01->cstar, "sfmask:", save01->sfmask); @@ -3330,6 +3376,70 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + + if (sev_es_guest(vcpu->kvm)) { + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; + + pr_err("%-15s %016llx\n", + "sev_features", vmsa->sev_features); + + pr_err("%-15s %016llx %-13s %016llx\n", + "pl0_ssp:", vmsa->pl0_ssp, "pl1_ssp:", vmsa->pl1_ssp); + pr_err("%-15s %016llx %-13s %016llx\n", + "pl2_ssp:", vmsa->pl2_ssp, "pl3_ssp:", vmsa->pl3_ssp); + pr_err("%-15s %016llx\n", + "u_cet:", vmsa->u_cet); + + pr_err("%-15s %016llx %-13s %016llx\n", + "rax:", vmsa->rax, "rbx:", vmsa->rbx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); + pr_err("%-15s %016llx %-13s %016llx\n", + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); + pr_err("%-15s %016llx %-13s %016llx\n", + "r8:", vmsa->r8, "r9:", vmsa->r9); + pr_err("%-15s %016llx %-13s %016llx\n", + "r10:", vmsa->r10, "r11:", vmsa->r11); + pr_err("%-15s %016llx %-13s %016llx\n", + "r12:", vmsa->r12, "r13:", vmsa->r13); + pr_err("%-15s %016llx %-13s %016llx\n", + "r14:", vmsa->r14, "r15:", vmsa->r15); + pr_err("%-15s %016llx %-13s %016llx\n", + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); + } else { + pr_err("%-15s %016llx %-13s %016lx\n", + "rax:", save->rax, "rbx:", + vcpu->arch.regs[VCPU_REGS_RBX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); + pr_err("%-15s %016lx %-13s %016llx\n", + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], + "rsp:", save->rsp); +#ifdef CONFIG_X86_64 + pr_err("%-15s %016lx %-13s %016lx\n", + "r8:", vcpu->arch.regs[VCPU_REGS_R8], + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r10:", vcpu->arch.regs[VCPU_REGS_R10], + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r12:", vcpu->arch.regs[VCPU_REGS_R12], + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r14:", vcpu->arch.regs[VCPU_REGS_R14], + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); +#endif + } + +no_vmsa: + if (sev_es_guest(vcpu->kvm)) + sev_free_decrypted_vmsa(vcpu, save); } static bool svm_check_exit_valid(u64 exit_code) @@ -3340,13 +3450,8 @@ static bool svm_check_exit_valid(u64 exit_code) static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) { - vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); dump_vmcb(vcpu); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_code; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + kvm_prepare_unexpected_reason_exit(vcpu, exit_code); return 0; } @@ -3355,17 +3460,21 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) if (!svm_check_exit_valid(exit_code)) return svm_handle_invalid_exit(vcpu, exit_code); -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE if (exit_code == SVM_EXIT_MSR) return msr_interception(vcpu); else if (exit_code == SVM_EXIT_VINTR) return interrupt_window_interception(vcpu); else if (exit_code == SVM_EXIT_INTR) return intr_interception(vcpu); - else if (exit_code == SVM_EXIT_HLT) + else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT) return kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); +#ifdef CONFIG_KVM_AMD_SEV + else if (exit_code == SVM_EXIT_VMGEXIT) + return sev_handle_vmgexit(vcpu); +#endif #endif return svm_exit_handlers[exit_code](vcpu); } @@ -3387,14 +3496,27 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, *error_code = 0; } +static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, + u32 *error_code) +{ + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + *intr_info = control->event_inj; + + if ((*intr_info & SVM_EXITINTINFO_VALID) && + (*intr_info & SVM_EXITINTINFO_VALID_ERR)) + *error_code = control->event_inj_err; + else + *error_code = 0; + +} + static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(vcpu, KVM_ISA_SVM); - /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) @@ -3426,32 +3548,15 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return 0; } - if (is_external_interrupt(svm->vmcb->control.exit_int_info) && - exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && - exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) - printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " - "exit_code 0x%x\n", - __func__, svm->vmcb->control.exit_int_info, - exit_code); - if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; return svm_invoke_exit_handler(vcpu, exit_code); } -static void reload_tss(struct kvm_vcpu *vcpu) +static int pre_svm_run(struct kvm_vcpu *vcpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); - - sd->tss_desc->type = 9; /* available 32/64-bit TSS */ - load_TR_desc(); -} - -static void pre_svm_run(struct kvm_vcpu *vcpu) -{ - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); /* @@ -3471,6 +3576,8 @@ static void pre_svm_run(struct kvm_vcpu *vcpu) /* FIXME: handle wraparound of asid_generation */ if (svm->current_vmcb->asid_generation != sd->asid_generation) new_asid(svm, sd); + + return 0; } static void svm_inject_nmi(struct kvm_vcpu *vcpu) @@ -3482,19 +3589,59 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) if (svm->nmi_l1_to_l2) return; - vcpu->arch.hflags |= HF_NMI_MASK; - if (!sev_es_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_IRET); + /* + * No need to manually track NMI masking when vNMI is enabled, hardware + * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the + * case where software directly injects an NMI. + */ + if (!is_vnmi_enabled(svm)) { + svm->nmi_masked = true; + svm_set_iret_intercept(svm); + } ++vcpu->stat.nmi_injections; } +static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!is_vnmi_enabled(svm)) + return false; + + return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK); +} + +static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!is_vnmi_enabled(svm)) + return false; + + if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK) + return false; + + svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK; + vmcb_mark_dirty(svm->vmcb, VMCB_INTR); + + /* + * Because the pending NMI is serviced by hardware, KVM can't know when + * the NMI is "injected", but for all intents and purposes, passing the + * NMI off to hardware counts as injection. + */ + ++vcpu->stat.nmi_injections; + + return true; +} + static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { + struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt; struct vcpu_svm *svm = to_svm(vcpu); u32 type; - if (vcpu->arch.interrupt.soft) { - if (svm_update_soft_interrupt_rip(vcpu)) + if (intr->soft) { + if (svm_update_soft_interrupt_rip(vcpu, intr->nr)) return; type = SVM_EVTINJ_TYPE_SOFT; @@ -3502,12 +3649,10 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) type = SVM_EVTINJ_TYPE_INTR; } - trace_kvm_inj_virq(vcpu->arch.interrupt.nr, - vcpu->arch.interrupt.soft, reinjected); + trace_kvm_inj_virq(intr->nr, intr->soft, reinjected); ++vcpu->stat.irq_injections; - svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | - SVM_EVTINJ_VALID | type; + svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type; } void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, @@ -3583,11 +3728,39 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) svm_set_intercept(svm, INTERCEPT_CR8_WRITE); } +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_vnmi_enabled(svm)) + return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK; + else + return svm->nmi_masked; +} + +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_vnmi_enabled(svm)) { + if (masked) + svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK; + else + svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK; + + } else { + svm->nmi_masked = masked; + if (masked) + svm_set_iret_intercept(svm); + else + svm_clr_iret_intercept(svm); + } +} + bool svm_nmi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - bool ret; if (!gif_set(svm)) return true; @@ -3595,10 +3768,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return false; - ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (vcpu->arch.hflags & HF_NMI_MASK); + if (svm_get_nmi_mask(vcpu)) + return true; - return ret; + return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK; } static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) @@ -3616,26 +3789,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return 1; } -static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) -{ - return !!(vcpu->arch.hflags & HF_NMI_MASK); -} - -static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (masked) { - vcpu->arch.hflags |= HF_NMI_MASK; - if (!sev_es_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_IRET); - } else { - vcpu->arch.hflags &= ~HF_NMI_MASK; - if (!sev_es_guest(vcpu->kvm)) - svm_clr_intercept(svm, INTERCEPT_IRET); - } -} - bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3716,8 +3869,41 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) - return; /* IRET will cause a vm exit */ + /* + * If NMIs are outright masked, i.e. the vCPU is already handling an + * NMI, and KVM has not yet intercepted an IRET, then there is nothing + * more to do at this time as KVM has already enabled IRET intercepts. + * If KVM has already intercepted IRET, then single-step over the IRET, + * as NMIs aren't architecturally unmasked until the IRET completes. + * + * If vNMI is enabled, KVM should never request an NMI window if NMIs + * are masked, as KVM allows at most one to-be-injected NMI and one + * pending NMI. If two NMIs arrive simultaneously, KVM will inject one + * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are + * unmasked. KVM _will_ request an NMI window in some situations, e.g. + * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately + * inject the NMI. In those situations, KVM needs to single-step over + * the STI shadow or intercept STGI. + */ + if (svm_get_nmi_mask(vcpu)) { + WARN_ON_ONCE(is_vnmi_enabled(svm)); + + if (!svm->awaiting_iret_completion) + return; /* IRET will cause a vm exit */ + } + + /* + * SEV-ES guests are responsible for signaling when a vCPU is ready to + * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e. + * KVM can't intercept and single-step IRET to detect when NMIs are + * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE. + * + * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware + * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not + * supported NAEs in the GHCB protocol. + */ + if (sev_es_guest(vcpu->kvm)) + return; if (!gif_set(svm)) { if (vgif) @@ -3734,11 +3920,18 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } -static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) +static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); /* + * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries. + * A TLB flush for the current ASID flushes both "host" and "guest" TLB + * entries, and thus is a superset of Hyper-V's fine grained flushing. + */ + kvm_hv_vcpu_purge_flush_tlb(vcpu); + + /* * Flush only the current ASID even if the TLB flush was invoked via * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and @@ -3751,6 +3944,37 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) svm->current_vmcb->asid_generation--; } +static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + hpa_t root_tdp = vcpu->arch.mmu->root.hpa; + + /* + * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly + * flush the NPT mappings via hypercall as flushing the ASID only + * affects virtual to physical mappings, it does not invalidate guest + * physical to host physical mappings. + */ + if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp)) + hyperv_flush_guest_mapping(root_tdp); + + svm_flush_tlb_asid(vcpu); +} + +static void svm_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + /* + * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB + * flushes should be routed to hv_flush_remote_tlbs() without requesting + * a "regular" remote flush. Reaching this point means either there's + * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of + * which might be fatal to the guest. Yell, but try to recover. + */ + if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu))) + hv_flush_remote_tlbs(vcpu->kvm); + + svm_flush_tlb_asid(vcpu); +} + static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3776,8 +4000,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (nested_svm_virtualize_tpr(vcpu) || - kvm_vcpu_apicv_active(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; cr8 = kvm_get_cr8(vcpu); @@ -3830,13 +4053,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) svm->soft_int_injected = false; /* - * If we've made progress since setting HF_IRET_MASK, we've + * If we've made progress since setting awaiting_iret_completion, we've * executed an IRET and can allow NMI injection. */ - if ((vcpu->arch.hflags & HF_IRET_MASK) && - (sev_es_guest(vcpu->kvm) || - kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { - vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + if (svm->awaiting_iret_completion && + kvm_rip_read(vcpu) != svm->nmi_iret_rip) { + svm->awaiting_iret_completion = false; + svm->nmi_masked = false; kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -3860,20 +4083,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; - case SVM_EXITINTINFO_TYPE_EXEPT: + case SVM_EXITINTINFO_TYPE_EXEPT: { + u32 error_code = 0; + /* * Never re-inject a #VC exception. */ if (vector == X86_TRAP_VC) break; - if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { - u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(vcpu, vector, err); + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) + error_code = svm->vmcb->control.exit_int_info_err; - } else - kvm_requeue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector, + exitintinfo & SVM_EXITINTINFO_VALID_ERR, + error_code); break; + } case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; @@ -3899,51 +4125,82 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) { + if (to_kvm_sev_info(vcpu->kvm)->need_init) + return -EINVAL; + return 1; } static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - return handle_fastpath_set_msr_irqoff(vcpu); + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + /* + * Next RIP must be provided as IRQs are disabled, and accessing guest + * memory to decode the instruction might fault, i.e. might sleep. + */ + if (!nrips || !control->next_rip) + return EXIT_FASTPATH_NONE; + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + switch (control->exit_code) { + case SVM_EXIT_MSR: + if (!control->exit_info_1) + break; + return handle_fastpath_wrmsr(vcpu); + case SVM_EXIT_HLT: + return handle_fastpath_hlt(vcpu); + case SVM_EXIT_INVD: + return handle_fastpath_invd(vcpu); + default: + break; + } return EXIT_FASTPATH_NONE; } -static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) +static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) { + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); - unsigned long vmcb_pa = svm->current_vmcb->pa; guest_state_enter_irqoff(); - if (sev_es_guest(vcpu->kvm)) { - __svm_sev_es_vcpu_run(vmcb_pa); - } else { - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. + */ + raw_local_irq_enable(); - /* - * Use a single vmcb (vmcb01 because it's always valid) for - * context switching guest state via VMLOAD/VMSAVE, that way - * the state doesn't need to be copied between vmcb01 and - * vmcb02 when switching vmcbs for nested virtualization. - */ - vmload(svm->vmcb01.pa); - __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs); - vmsave(svm->vmcb01.pa); + amd_clear_divider(); - vmload(__sme_page_pa(sd->save_area)); - } + if (sev_es_guest(vcpu->kvm)) + __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, + sev_es_host_save_area(sd)); + else + __svm_vcpu_run(svm, spec_ctrl_intercepted); + + raw_local_irq_disable(); guest_state_exit_irqoff(); } -static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) +static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) { + bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; struct vcpu_svm *svm = to_svm(vcpu); + bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); - trace_kvm_entry(vcpu); + trace_kvm_entry(vcpu, force_immediate_exit); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -3962,10 +4219,18 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * is enough to force an immediate vmexit. */ disable_nmi_singlestep(svm); - smp_send_reschedule(vcpu->cpu); + force_immediate_exit = true; } - pre_svm_run(vcpu); + if (force_immediate_exit) + smp_send_reschedule(vcpu->cpu); + + if (pre_svm_run(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR; + vcpu->run->fail_entry.cpu = vcpu->cpu; + return EXIT_FASTPATH_EXIT_USERSPACE; + } sync_lapic_to_cr8(vcpu); @@ -3978,16 +4243,26 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_hv_update_vp_id(svm->vmcb, vcpu); /* - * Run with all-zero DR6 unless needed, so that we can get the exact cause - * of a #DB. + * Run with all-zero DR6 unless the guest can write DR6 freely, so that + * KVM can get the exact cause of a #DB. Note, loading guest DR6 from + * KVM's snapshot is only necessary when DR accesses won't exit. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6)) + svm_set_dr6(vcpu, vcpu->arch.dr6); + else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); - kvm_load_guest_xsave_state(vcpu); + + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. + */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); kvm_wait_lapic_expire(vcpu); @@ -3998,34 +4273,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * being speculatively taken. */ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - - svm_vcpu_enter_exit(vcpu); - - /* - * We do not use IBRS in the kernel. If this vCPU has used the - * SPEC_CTRL MSR it may have left it on; save the value and - * turn it off. This is much more efficient than blindly adding - * it to the atomic save/restore list. Especially as the former - * (Saving guest MSRs on vmexit) doesn't even exist in KVM. - * - * For non-nested case: - * If the L01 MSR bitmap does not intercept the MSR, then we need to - * save it. - * - * For nested case: - * If the L02 MSR bitmap does not intercept the MSR, then we need to - * save it. - */ - if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) && - unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) - svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + x86_spec_ctrl_set_guest(svm->virt_spec_ctrl); - if (!sev_es_guest(vcpu->kvm)) - reload_tss(vcpu); + svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); + x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); if (!sev_es_guest(vcpu->kvm)) { vcpu->arch.cr2 = svm->vmcb->save.cr2; @@ -4038,7 +4291,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - kvm_load_host_xsave_state(vcpu); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + stgi(); /* Any pending NMI will happen here */ @@ -4070,19 +4326,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; - /* - * We need to handle MC intercepts here before the vcpu has a chance to - * change the physical cpu - */ - if (unlikely(svm->vmcb->control.exit_code == - SVM_EXIT_EXCP_BASE + MC_VECTOR)) - svm_handle_mce(vcpu); + trace_kvm_exit(vcpu, KVM_ISA_SVM); svm_complete_interrupts(vcpu); - if (is_guest_mode(vcpu)) - return EXIT_FASTPATH_NONE; - return svm_exit_handlers_fastpath(vcpu); } @@ -4111,17 +4358,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcb_mark_dirty(svm->vmcb, VMCB_CR); } -static int is_disabled(void) -{ - u64 vm_cr; - - rdmsrl(MSR_VM_CR, vm_cr); - if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) - return 1; - - return 0; -} - static void svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) { @@ -4133,11 +4369,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xd9; } -static int __init svm_check_processor_compat(void) -{ - return 0; -} - /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. @@ -4146,9 +4377,11 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_MCG_EXT_CTL: - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: return false; case MSR_IA32_SMBASE: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + return false; /* SEV-ES guests do not support SMM, so report false */ if (kvm && sev_es_guest(kvm)) return false; @@ -4163,39 +4396,31 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_cpuid_entry2 *best; - - vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVES); - - /* Update nrips enabled cache */ - svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && - guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); - - svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); - svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); - - svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - - svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) && - guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER); - - svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) && - guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD); - svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); - - svm_recalc_instruction_intercepts(vcpu, svm); + /* + * SVM doesn't provide a way to disable just XSAVES in the guest, KVM + * can only disable all variants of by disallowing CR4.OSXSAVE from + * being set. As a result, if the host has XSAVE and XSAVES, and the + * guest has XSAVE enabled, the guest can execute XSAVES without + * faulting. Treat XSAVES as enabled in this case regardless of + * whether it's advertised to the guest so that KVM context switches + * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give + * the guest read/write access to the host's XSS. + */ + guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES, + boot_cpu_has(X86_FEATURE_XSAVES) && + guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)); - /* For sev guests, the memory encryption bit is not reserved in CR3. */ - if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F); - if (best) - vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); - } + /* + * Intercept VMLOAD if the vCPU model is Intel in order to emulate that + * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing + * SVM on Intel is bonkers and extremely unlikely to work). + */ + if (guest_cpuid_is_intel_compatible(vcpu)) + guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - init_vmcb_after_set_cpuid(vcpu); + if (sev_guest(vcpu->kvm)) + sev_vcpu_after_set_cpuid(svm); } static bool svm_has_wbinvd_exit(void) @@ -4293,31 +4518,45 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, case SVM_EXIT_WRITE_CR0: { unsigned long cr0, val; - if (info->intercept == x86_intercept_cr_write) + /* + * Adjust the exit code accordingly if a CR other than CR0 is + * being written, and skip straight to the common handling as + * only CR0 has an additional selective intercept. + */ + if (info->intercept == x86_intercept_cr_write && info->modrm_reg) { icpt_info.exit_code += info->modrm_reg; - - if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || - info->intercept == x86_intercept_clts) break; + } - if (!(vmcb12_is_intercept(&svm->nested.ctl, - INTERCEPT_SELECTIVE_CR0))) + /* + * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a + * selective CR0 intercept is triggered (the common logic will + * treat the selective intercept as being enabled). Note, the + * unconditional intercept has higher priority, i.e. this is + * only relevant if *only* the selective intercept is enabled. + */ + if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) || + !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))) break; - cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; - val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; + /* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */ + if (info->intercept == x86_intercept_clts) + break; + /* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */ if (info->intercept == x86_intercept_lmsw) { - cr0 &= 0xfUL; - val &= 0xfUL; - /* lmsw can't clear PE - catch this here */ - if (cr0 & X86_CR0_PE) - val |= X86_CR0_PE; + icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; + break; } + /* + * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit + * other than SVM_CR0_SELECTIVE_MASK is changed. + */ + cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; + val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; if (cr0 ^ val) icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; - break; } case SVM_EXIT_READ_DR0: @@ -4389,14 +4628,16 @@ out: static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) + switch (to_svm(vcpu)->vmcb->control.exit_code) { + case SVM_EXIT_EXCP_BASE + MC_VECTOR: + svm_handle_mce(vcpu); + break; + case SVM_EXIT_INTR: vcpu->arch.at_instruction_boundary = true; -} - -static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - if (!kvm_pause_in_guest(vcpu->kvm)) - shrink_ple_window(vcpu); + break; + default: + break; + } } static void svm_setup_mce(struct kvm_vcpu *vcpu) @@ -4405,6 +4646,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } +#ifdef CONFIG_KVM_SMM bool svm_smi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4432,7 +4674,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return 1; } -static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map_save; @@ -4441,10 +4683,16 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) if (!is_guest_mode(vcpu)) return 0; - /* FED8h - SVM Guest */ - put_smstate(u64, smstate, 0x7ed8, 1); - /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); + /* + * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is + * responsible for ensuring nested SVM and SMIs are mutually exclusive. + */ + + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) + return 1; + + smram->smram64.svm_guest_flag = 1; + smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa; svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -4466,8 +4714,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) * that, see svm_prepare_switch_to_guest()) which must be * preserved. */ - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), - &map_save) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) return 1; BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); @@ -4475,38 +4722,37 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) svm_copy_vmrun_state(map_save.hva + 0x400, &svm->vmcb01.ptr->save); - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); return 0; } -static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map, map_save; - u64 saved_efer, vmcb12_gpa; struct vmcb *vmcb12; int ret; - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + const struct kvm_smram_state_64 *smram64 = &smram->smram64; + + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 0; /* Non-zero if SMI arrived while vCPU was in guest mode. */ - if (!GET_SMSTATE(u64, smstate, 0x7ed8)) + if (!smram64->svm_guest_flag) return 0; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) return 1; - saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); - if (!(saved_efer & EFER_SVME)) + if (!(smram64->efer & EFER_SVME)) return 1; - vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map)) return 1; ret = 1; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) goto unmap_map; if (svm_allocate_nested(svm)) @@ -4528,7 +4774,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) vmcb12 = map.hva; nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false); if (ret) goto unmap_save; @@ -4536,9 +4782,9 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) svm->nested.nested_run_pending = 1; unmap_save: - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); unmap_map: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -4554,17 +4800,23 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) /* We must be in SMM; RSM will cause a vmexit anyway. */ } } +#endif -static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len) +static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { + struct vcpu_svm *svm = to_svm(vcpu); bool smep, smap, is_user; - unsigned long cr4; u64 error_code; + /* Check that emulation is possible during event vectoring */ + if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) && + !kvm_can_emulate_event_vectoring(emul_type)) + return X86EMUL_UNHANDLEABLE_VECTORING; + /* Emulation is always possible when KVM has access to all guest state. */ if (!sev_guest(vcpu->kvm)) - return true; + return X86EMUL_CONTINUE; /* #UD and #GP should never be intercepted for SEV guests. */ WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD | @@ -4576,33 +4828,44 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * to guest register state. */ if (sev_es_guest(vcpu->kvm)) - return false; + return X86EMUL_RETRY_INSTR; /* * Emulation is possible if the instruction is already decoded, e.g. * when completing I/O after returning from userspace. */ if (emul_type & EMULTYPE_NO_DECODE) - return true; + return X86EMUL_CONTINUE; /* * Emulation is possible for SEV guests if and only if a prefilled * buffer containing the bytes of the intercepted instruction is * available. SEV guest memory is encrypted with a guest specific key - * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and + * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and * decode garbage. * - * Inject #UD if KVM reached this point without an instruction buffer. - * In practice, this path should never be hit by a well-behaved guest, - * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path - * is still theoretically reachable, e.g. via unaccelerated fault-like - * AVIC access, and needs to be handled by KVM to avoid putting the - * guest into an infinite loop. Injecting #UD is somewhat arbitrary, - * but its the least awful option given lack of insight into the guest. + * If KVM is NOT trying to simply skip an instruction, inject #UD if + * KVM reached this point without an instruction buffer. In practice, + * this path should never be hit by a well-behaved guest, e.g. KVM + * doesn't intercept #UD or #GP for SEV guests, but this path is still + * theoretically reachable, e.g. via unaccelerated fault-like AVIC + * access, and needs to be handled by KVM to avoid putting the guest + * into an infinite loop. Injecting #UD is somewhat arbitrary, but + * its the least awful option given lack of insight into the guest. + * + * If KVM is trying to skip an instruction, simply resume the guest. + * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM + * will attempt to re-inject the INT3/INTO and skip the instruction. + * In that scenario, retrying the INT3/INTO and hoping the guest will + * make forward progress is the only option that has a chance of + * success (and in practice it will work the vast majority of the time). */ if (unlikely(!insn)) { + if (emul_type & EMULTYPE_SKIP) + return X86EMUL_UNHANDLEABLE; + kvm_queue_exception(vcpu, UD_VECTOR); - return false; + return X86EMUL_PROPAGATE_FAULT; } /* @@ -4613,7 +4876,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * table used to translate CS:RIP resides in emulated MMIO. */ if (likely(insn_len)) - return true; + return X86EMUL_CONTINUE; /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. @@ -4647,16 +4910,15 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * In addition, don't apply the erratum workaround if the #NPF occurred * while translating guest page tables (see below). */ - error_code = to_svm(vcpu)->vmcb->control.exit_info_1; + error_code = svm->vmcb->control.exit_info_1; if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) goto resume_guest; - cr4 = kvm_read_cr4(vcpu); - smep = cr4 & X86_CR4_SMEP; - smap = cr4 & X86_CR4_SMAP; + smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP); + smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP); is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { - pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); + pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n"); /* * If the fault occurred in userspace, arbitrarily inject #GP @@ -4672,6 +4934,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, kvm_inject_gp(vcpu, 0); else kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return X86EMUL_PROPAGATE_FAULT; } resume_guest: @@ -4689,7 +4952,7 @@ resume_guest: * doesn't explicitly define "ignored", i.e. doing nothing and letting * the guest spin is technically "ignoring" the access. */ - return false; + return X86EMUL_RETRY_INSTR; } static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) @@ -4711,12 +4974,26 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) { + int type = kvm->arch.vm_type; + + if (type != KVM_X86_DEFAULT_VM && + type != KVM_X86_SW_PROTECTED_VM) { + kvm->arch.has_protected_state = + (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM); + to_kvm_sev_info(kvm)->need_init = true; + + kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); + kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; + } + if (!pause_filter_count || !pause_filter_thresh) - kvm->arch.pause_in_guest = true; + kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); if (enable_apicv) { int ret = avic_vm_init(kvm); @@ -4724,17 +5001,32 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } -static struct kvm_x86_ops svm_x86_ops __initdata = { - .name = "kvm_amd", +static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) +{ + struct page *page = snp_safe_alloc_page(); + + if (!page) + return NULL; + + return page_address(page); +} + +struct kvm_x86_ops svm_x86_ops __initdata = { + .name = KBUILD_MODNAME, + + .check_processor_compatibility = svm_check_processor_compat, .hardware_unsetup = svm_hardware_unsetup, - .hardware_enable = svm_hardware_enable, - .hardware_disable = svm_hardware_disable, + .enable_virtualization_cpu = svm_enable_virtualization_cpu, + .disable_virtualization_cpu = svm_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, + .vcpu_precreate = svm_vcpu_precreate, .vcpu_create = svm_vcpu_create, .vcpu_free = svm_vcpu_free, .vcpu_reset = svm_vcpu_reset, @@ -4750,14 +5042,16 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, - .get_msr_feature = svm_get_msr_feature, + .get_feature_msr = svm_get_feature_msr, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, .get_segment = svm_get_segment, .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, + .get_cpl_no_cache = svm_get_cpl, .get_cs_db_l_bits = svm_get_cs_db_l_bits, + .is_valid_cr0 = svm_is_valid_cr0, .set_cr0 = svm_set_cr0, .post_set_cr3 = sev_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, @@ -4774,10 +5068,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_rflags = svm_set_rflags, .get_if_flag = svm_get_if_flag, - .flush_tlb_all = svm_flush_tlb_current, + .flush_tlb_all = svm_flush_tlb_all, .flush_tlb_current = svm_flush_tlb_current, .flush_tlb_gva = svm_flush_tlb_gva, - .flush_tlb_guest = svm_flush_tlb_current, + .flush_tlb_guest = svm_flush_tlb_asid, .vcpu_pre_run = svm_vcpu_pre_run, .vcpu_run = svm_vcpu_run, @@ -4789,6 +5083,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .patch_hypercall = svm_patch_hypercall, .inject_irq = svm_inject_irq, .inject_nmi = svm_inject_nmi, + .is_vnmi_pending = svm_is_vnmi_pending, + .set_vnmi_pending = svm_set_vnmi_pending, .inject_exception = svm_inject_exception, .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, @@ -4798,12 +5094,15 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, - .set_virtual_apic_mode = avic_set_virtual_apic_mode, + + .x2apic_icr_is_split = true, + .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, - .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, + .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, .get_exit_info = svm_get_exit_info, + .get_entry_info = svm_get_entry_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -4819,21 +5118,21 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_intercept = svm_check_intercept, .handle_exit_irqoff = svm_handle_exit_irqoff, - .request_immediate_exit = __kvm_request_immediate_exit, - - .sched_in = svm_sched_in, - .nested_ops = &svm_nested_ops, .deliver_interrupt = svm_deliver_interrupt, .pi_update_irte = avic_pi_update_irte, .setup_mce = svm_setup_mce, +#ifdef CONFIG_KVM_SMM .smi_allowed = svm_smi_allowed, .enter_smm = svm_enter_smm, .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, +#endif +#ifdef CONFIG_KVM_AMD_SEV + .dev_get_attr = sev_dev_get_attr, .mem_enc_ioctl = sev_mem_enc_ioctl, .mem_enc_register_region = sev_mem_enc_register_region, .mem_enc_unregister_region = sev_mem_enc_unregister_region, @@ -4841,16 +5140,21 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, .vm_move_enc_context_from = sev_vm_move_enc_context_from, - - .can_emulate_instruction = svm_can_emulate_instruction, +#endif + .check_emulate_instruction = svm_check_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, - .msr_filter_changed = svm_msr_filter_changed, + .recalc_intercepts = svm_recalc_intercepts, .complete_emulated_msr = svm_complete_emulated_msr, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, + .alloc_apic_backing_page = svm_alloc_apic_backing_page, + + .gmem_prepare = sev_gmem_prepare, + .gmem_invalidate = sev_gmem_invalidate, + .gmem_max_mapping_level = sev_gmem_max_mapping_level, }; /* @@ -4869,7 +5173,7 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; @@ -4898,13 +5202,22 @@ static __init void svm_set_cpu_caps(void) { kvm_set_cpu_caps(); - kvm_caps.supported_xss = 0; + kvm_caps.supported_perf_cap = 0; + + kvm_cpu_cap_clear(X86_FEATURE_IBT); /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { kvm_cpu_cap_set(X86_FEATURE_SVM); kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); + /* + * KVM currently flushes TLBs on *every* nested SVM transition, + * and so for all intents and purposes KVM supports flushing by + * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush. + */ + kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID); + if (nrips) kvm_cpu_cap_set(X86_FEATURE_NRIPS); @@ -4928,30 +5241,53 @@ static __init void svm_set_cpu_caps(void) if (vgif) kvm_cpu_cap_set(X86_FEATURE_VGIF); + if (vnmi) + kvm_cpu_cap_set(X86_FEATURE_VNMI); + /* Nested VM can receive #VMEXIT instead of triggering #GP */ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) + kvm_caps.has_bus_lock_exit = true; + /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - /* AMD PMU PERFCTR_CORE CPUID */ - if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + if (enable_pmu) { + /* + * Enumerate support for PERFCTR_CORE if and only if KVM has + * access to enough counters to virtualize "core" support, + * otherwise limit vPMU support to the legacy number of counters. + */ + if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE) + kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS, + kvm_pmu_cap.num_counters_gp); + else + kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); + + if (kvm_pmu_cap.version != 2 || + !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2); + } /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); + + /* + * Clear capabilities that are automatically configured by common code, + * but that require explicit SVM support (that isn't yet implemented). + */ + kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); + kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); } static __init int svm_hardware_setup(void) { - int cpu; - struct page *iopm_pages; void *iopm_va; - int r; - unsigned int order = get_order(IOPM_SIZE); + int cpu, r; /* * NX is required for shadow paging and for NPT if the NX huge pages @@ -4963,17 +5299,6 @@ static __init int svm_hardware_setup(void) } kvm_enable_efer_bits(EFER_NX); - iopm_pages = alloc_pages(GFP_KERNEL, order); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - - init_msrpm_offsets(); - kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); @@ -4993,6 +5318,9 @@ static __init int svm_hardware_setup(void) tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); + if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) + kvm_enable_efer_bits(EFER_AUTOIBRS); + /* Check for pause filtering support */ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { pause_filter_count = 0; @@ -5002,8 +5330,14 @@ static __init int svm_hardware_setup(void) } if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + pr_info("Nested Virtualization enabled\n"); + kvm_enable_efer_bits(EFER_SVME); + if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ)) + kvm_enable_efer_bits(EFER_LMSLE); + + r = nested_svm_init_msrpm_merge_offsets(); + if (r) + return r; } /* @@ -5020,35 +5354,54 @@ static __init int svm_hardware_setup(void) /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); + + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); svm_adjust_mmio_mask(); + nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); + + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } + + iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL); + if (!iopm_va) + return -ENOMEM; + + iopm_base = __sme_set(__pa(iopm_va)); + /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which - * may be modified by svm_adjust_mmio_mask()). + * may be modified by svm_adjust_mmio_mask()), as well as nrips. */ sev_hardware_setup(); svm_hv_hardware_setup(); - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - - if (nrips) { - if (!boot_cpu_has(X86_FEATURE_NRIPS)) - nrips = false; - } - - enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops); - + enable_apicv = avic_hardware_setup(); if (!enable_apicv) { + enable_ipiv = false; svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; @@ -5074,11 +5427,13 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); + vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI); + if (vnmi) + pr_info("Virtual NMI enabled\n"); + + if (!vnmi) { + svm_x86_ops.is_vnmi_pending = NULL; + svm_x86_ops.set_vnmi_pending = NULL; } if (!enable_pmu) @@ -5086,20 +5441,13 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); - /* - * It seems that on AMD processors PTE's accessed bit is - * being set by the CPU hardware before the NPF vmexit. - * This is not expected behaviour and our tests fail because - * of it. - * A workaround here is to disable support for - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. - * In this case userspace can know if there is support using - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle - * it - * If future AMD CPU models change the behaviour described above, - * this variable can be changed accordingly - */ - allow_smaller_maxphyaddr = !npt_enabled; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } return 0; @@ -5110,26 +5458,52 @@ err: static struct kvm_x86_init_ops svm_init_ops __initdata = { - .cpu_has_kvm_support = has_svm, - .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, - .check_processor_compatibility = svm_check_processor_compat, .runtime_ops = &svm_x86_ops, .pmu_ops = &amd_pmu_ops, }; +static void __svm_exit(void) +{ + kvm_x86_vendor_exit(); +} + static int __init svm_init(void) { + int r; + + KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm); + __unused_size_checks(); - return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + if (!kvm_is_svm_supported()) + return -EOPNOTSUPP; + + r = kvm_x86_vendor_init(&svm_init_ops); + if (r) + return r; + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), + THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + __svm_exit(); + return r; } static void __exit svm_exit(void) { kvm_exit(); + __svm_exit(); } module_init(svm_init) |
