Diffstat (limited to 'arch/x86/kvm/svm/svm.c')
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 506
1 file changed, 170 insertions(+), 336 deletions(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ab9b947dbf4f..d9931c6c4bc6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -72,8 +72,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); static bool erratum_383_found __read_mostly; -u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; - /* * Set osvw_len to higher value when updated Revision Guides * are published and we know what the new status bits are @@ -82,72 +80,6 @@ static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); -#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) - -static const struct svm_direct_access_msrs { - u32 index; /* Index of the MSR */ - bool always; /* True if intercept is initially cleared */ -} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { - { .index = MSR_STAR, .always = true }, - { .index = MSR_IA32_SYSENTER_CS, .always = true }, - { .index = MSR_IA32_SYSENTER_EIP, .always = false }, - { .index = MSR_IA32_SYSENTER_ESP, .always = false }, -#ifdef CONFIG_X86_64 - { .index = MSR_GS_BASE, .always = true }, - { .index = MSR_FS_BASE, .always = true }, - { .index = MSR_KERNEL_GS_BASE, .always = true }, - { .index = MSR_LSTAR, .always = true }, - { .index = MSR_CSTAR, .always = true }, - { .index = MSR_SYSCALL_MASK, .always = true }, -#endif - { .index = MSR_IA32_SPEC_CTRL, .always = false }, - { .index = MSR_IA32_PRED_CMD, .always = false }, - { .index = MSR_IA32_FLUSH_CMD, .always = false }, - { .index = MSR_IA32_DEBUGCTLMSR, .always = false }, - { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, - { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, - { .index = MSR_IA32_LASTINTFROMIP, .always = false }, - { .index = MSR_IA32_LASTINTTOIP, .always = false }, - { .index = MSR_IA32_XSS, .always = false }, - { .index = MSR_EFER, .always = false }, - { .index = MSR_IA32_CR_PAT, .always = false }, - { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, - { .index = MSR_TSC_AUX, .always = false }, - { .index = X2APIC_MSR(APIC_ID), .always = false }, - { .index = X2APIC_MSR(APIC_LVR), .always = false }, - { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, - { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, - { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, - { .index = X2APIC_MSR(APIC_EOI), .always = false }, - { .index = X2APIC_MSR(APIC_RRR), .always = false }, - { .index = X2APIC_MSR(APIC_LDR), .always = false }, - { .index = X2APIC_MSR(APIC_DFR), .always = false }, - { .index = X2APIC_MSR(APIC_SPIV), .always = false }, - { .index = X2APIC_MSR(APIC_ISR), .always = false }, - { .index = X2APIC_MSR(APIC_TMR), .always = false }, - { .index = X2APIC_MSR(APIC_IRR), .always = false }, - { .index = X2APIC_MSR(APIC_ESR), .always = false }, - { .index = X2APIC_MSR(APIC_ICR), .always = false }, - { .index = X2APIC_MSR(APIC_ICR2), .always = false }, - - /* - * Note: - * AMD does not virtualize APIC TSC-deadline timer mode, but it is - * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, - * the AVIC hardware would generate GP fault. Therefore, always - * intercept the MSR 0x832, and do not setup direct_access_msr. 
- */ - { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, - { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, - { .index = X2APIC_MSR(APIC_LVT0), .always = false }, - { .index = X2APIC_MSR(APIC_LVT1), .always = false }, - { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, - { .index = X2APIC_MSR(APIC_TMICT), .always = false }, - { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, - { .index = X2APIC_MSR(APIC_TDCR), .always = false }, - { .index = MSR_INVALID, .always = false }, -}; - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -232,6 +164,7 @@ module_param(tsc_scaling, int, 0444); */ static bool avic; module_param(avic, bool, 0444); +module_param(enable_ipiv, bool, 0444); module_param(enable_device_posted_irqs, bool, 0444); @@ -264,33 +197,6 @@ static DEFINE_MUTEX(vmcb_dump_mutex); */ static int tsc_aux_uret_slot __read_mostly = -1; -static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; - -#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) -#define MSRS_RANGE_SIZE 2048 -#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) - -u32 svm_msrpm_offset(u32 msr) -{ - u32 offset; - int i; - - for (i = 0; i < NUM_MSR_MAPS; i++) { - if (msr < msrpm_ranges[i] || - msr >= msrpm_ranges[i] + MSRS_IN_RANGE) - continue; - - offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ - offset += (i * MSRS_RANGE_SIZE); /* add range offset */ - - /* Now we have the u8 offset - but need the u32 offset */ - return offset / 4; - } - - /* MSR not in any range */ - return MSR_INVALID; -} - static int get_npt_level(void) { #ifdef CONFIG_X86_64 @@ -757,50 +663,8 @@ static void clr_dr_intercepts(struct vcpu_svm *svm) recalc_intercepts(svm); } -static int direct_access_msr_slot(u32 msr) -{ - u32 i; - - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) - if (direct_access_msrs[i].index == msr) - return i; - - return -ENOENT; -} - -static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read, - int write) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int slot = direct_access_msr_slot(msr); - - if (slot == -ENOENT) - return; - - /* Set the shadow bitmaps to the desired intercept states */ - if (read) - set_bit(slot, svm->shadow_msr_intercept.read); - else - clear_bit(slot, svm->shadow_msr_intercept.read); - - if (write) - set_bit(slot, svm->shadow_msr_intercept.write); - else - clear_bit(slot, svm->shadow_msr_intercept.write); -} - -static bool valid_msr_intercept(u32 index) -{ - return direct_access_msr_slot(index) != -ENOENT; -} - static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) { - u8 bit_write; - unsigned long tmp; - u32 offset; - u32 *msrpm; - /* * For non-nested case: * If the L01 MSR bitmap does not intercept the MSR, then we need to @@ -810,90 +674,102 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: - to_svm(vcpu)->msrpm; + void *msrpm = is_guest_mode(vcpu) ? 
to_svm(vcpu)->nested.msrpm : + to_svm(vcpu)->msrpm; - offset = svm_msrpm_offset(msr); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - BUG_ON(offset == MSR_INVALID); - - return test_bit(bit_write, &tmp); + return svm_test_msr_bitmap_write(msrpm, msr); } -static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, - u32 msr, int read, int write) +void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) { struct vcpu_svm *svm = to_svm(vcpu); - u8 bit_read, bit_write; - unsigned long tmp; - u32 offset; + void *msrpm = svm->msrpm; - /* - * If this warning triggers extend the direct_access_msrs list at the - * beginning of the file - */ - WARN_ON(!valid_msr_intercept(msr)); - - /* Enforce non allowed MSRs to trap */ - if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) - read = 0; - - if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) - write = 0; - - offset = svm_msrpm_offset(msr); - bit_read = 2 * (msr & 0x0f); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - BUG_ON(offset == MSR_INVALID); - - read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); - write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); + /* Don't disable interception for MSRs userspace wants to handle. */ + if (type & MSR_TYPE_R) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) + svm_clear_msr_bitmap_read(msrpm, msr); + else + svm_set_msr_bitmap_read(msrpm, msr); + } - msrpm[offset] = tmp; + if (type & MSR_TYPE_W) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) + svm_clear_msr_bitmap_write(msrpm, msr); + else + svm_set_msr_bitmap_write(msrpm, msr); + } svm_hv_vmcb_dirty_nested_enlightenments(vcpu); svm->nested.force_msr_bitmap_recalc = true; } -void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, - int read, int write) -{ - set_shadow_msr_intercept(vcpu, msr, read, write); - set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); -} - -u32 *svm_vcpu_alloc_msrpm(void) +void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) { - unsigned int order = get_order(MSRPM_SIZE); - struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order); - u32 *msrpm; + unsigned int order = get_order(size); + struct page *pages = alloc_pages(gfp_mask, order); + void *pm; if (!pages) return NULL; - msrpm = page_address(pages); - memset(msrpm, 0xff, PAGE_SIZE * (1 << order)); + /* + * Set all bits in the permissions map so that all MSR and I/O accesses + * are intercepted by default. 
+ */ + pm = page_address(pages); + memset(pm, 0xff, PAGE_SIZE * (1 << order)); - return msrpm; + return pm; } -void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) +static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) { - int i; + bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - if (!direct_access_msrs[i].always) - continue; - set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); - } + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept); + + if (sev_es_guest(vcpu->kvm)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); } void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) { + static const u32 x2avic_passthrough_msrs[] = { + X2APIC_MSR(APIC_ID), + X2APIC_MSR(APIC_LVR), + X2APIC_MSR(APIC_TASKPRI), + X2APIC_MSR(APIC_ARBPRI), + X2APIC_MSR(APIC_PROCPRI), + X2APIC_MSR(APIC_EOI), + X2APIC_MSR(APIC_RRR), + X2APIC_MSR(APIC_LDR), + X2APIC_MSR(APIC_DFR), + X2APIC_MSR(APIC_SPIV), + X2APIC_MSR(APIC_ISR), + X2APIC_MSR(APIC_TMR), + X2APIC_MSR(APIC_IRR), + X2APIC_MSR(APIC_ESR), + X2APIC_MSR(APIC_ICR), + X2APIC_MSR(APIC_ICR2), + + /* + * Note! Always intercept LVTT, as TSC-deadline timer mode + * isn't virtualized by hardware, and the CPU will generate a + * #GP instead of a #VMEXIT. + */ + X2APIC_MSR(APIC_LVTTHMR), + X2APIC_MSR(APIC_LVTPC), + X2APIC_MSR(APIC_LVT0), + X2APIC_MSR(APIC_LVT1), + X2APIC_MSR(APIC_LVTERR), + X2APIC_MSR(APIC_TMICT), + X2APIC_MSR(APIC_TMCCT), + X2APIC_MSR(APIC_TDCR), + }; int i; if (intercept == svm->x2avic_msrs_intercepted) @@ -902,84 +778,79 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) if (!x2avic_enabled) return; - for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { - int index = direct_access_msrs[i].index; - - if ((index < APIC_BASE_MSR) || - (index > APIC_BASE_MSR + 0xff)) - continue; - set_msr_interception(&svm->vcpu, svm->msrpm, index, - !intercept, !intercept); - } + for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) + svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], + MSR_TYPE_RW, intercept); svm->x2avic_msrs_intercepted = intercept; } -void svm_vcpu_free_msrpm(u32 *msrpm) +void svm_vcpu_free_msrpm(void *msrpm) { __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); } -static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) +static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - u32 i; - /* - * Set intercept permissions for all direct access MSRs again. They - * will automatically get filtered through the MSR filter, so we are - * back in sync after this. - */ - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 msr = direct_access_msrs[i].index; - u32 read = test_bit(i, svm->shadow_msr_intercept.read); - u32 write = test_bit(i, svm->shadow_msr_intercept.write); - - set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); - } -} - -static void add_msr_offset(u32 offset) -{ - int i; - - for (i = 0; i < MSRPM_OFFSETS; ++i) { + svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - /* Offset already in list? 
*/ - if (msrpm_offsets[i] == offset) - return; +#ifdef CONFIG_X86_64 + svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW); +#endif - /* Slot used by another offset? */ - if (msrpm_offsets[i] != MSR_INVALID) - continue; + if (lbrv) + svm_recalc_lbr_msr_intercepts(vcpu); - /* Add offset to list */ - msrpm_offsets[i] = offset; + if (cpu_feature_enabled(X86_FEATURE_IBPB)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, + !guest_has_pred_cmd_msr(vcpu)); - return; - } + if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); /* - * If this BUG triggers the msrpm_offsets table has an overflow. Just - * increase MSRPM_OFFSETS in this case. + * Disable interception of SPEC_CTRL if KVM doesn't need to manually + * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if + * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively + * using SPEC_CTRL. */ - BUG(); -} - -static void init_msrpm_offsets(void) -{ - int i; - - memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); + if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, + !guest_has_spec_ctrl_msr(vcpu)); + else + svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, + !svm->spec_ctrl); - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 offset; + /* + * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU, + * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits. + */ + svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW, + guest_cpuid_is_intel_compatible(vcpu)); + svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW, + guest_cpuid_is_intel_compatible(vcpu)); + + if (kvm_aperfmperf_in_guest(vcpu->kvm)) { + svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); + } - offset = svm_msrpm_offset(direct_access_msrs[i].index); - BUG_ON(offset == MSR_INVALID); + if (sev_es_guest(vcpu->kvm)) + sev_es_recalc_msr_intercepts(vcpu); - add_msr_offset(offset); - } + /* + * x2APIC intercepts are modified on-demand and cannot be filtered by + * userspace. + */ } void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) @@ -998,13 +869,7 @@ void svm_enable_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); - - if (sev_es_guest(vcpu->kvm)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); + svm_recalc_lbr_msr_intercepts(vcpu); /* Move the LBR msrs to the vmcb02 so that the guest can see them. 
*/ if (is_guest_mode(vcpu)) @@ -1016,12 +881,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); - svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + svm_recalc_lbr_msr_intercepts(vcpu); /* * Move the LBR msrs back to the vmcb01 to avoid copying them @@ -1176,9 +1037,10 @@ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) } /* Evaluate instruction intercepts that depend on guest CPUID features. */ -static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, - struct vcpu_svm *svm) +static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + /* * Intercept INVPCID if shadow paging is enabled to sync/free shadow * roots, or if INVPCID is disabled in the guest to inject #UD. @@ -1197,24 +1059,11 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, else svm_set_intercept(svm, INTERCEPT_RDTSCP); } -} - -static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); if (guest_cpuid_is_intel_compatible(vcpu)) { - /* - * We must intercept SYSENTER_EIP and SYSENTER_ESP - * accesses because the processor only stores 32 bits. - * For the same reason we cannot use virtual VMLOAD/VMSAVE. - */ svm_set_intercept(svm, INTERCEPT_VMLOAD); svm_set_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -1225,12 +1074,15 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } - /* No need to intercept these MSRs */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); } } +static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + svm_recalc_instruction_intercepts(vcpu); + svm_recalc_msr_intercepts(vcpu); +} + static void init_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1353,15 +1205,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_PAUSE); } - svm_recalc_instruction_intercepts(vcpu, svm); - - /* - * If the host supports V_SPEC_CTRL then disable the interception - * of MSR_IA32_SPEC_CTRL. 
- */ - if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); - if (kvm_vcpu_apicv_active(vcpu)) avic_init_vmcb(svm, vmcb); @@ -1381,7 +1224,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu) sev_init_vmcb(svm); svm_hv_init_vmcb(vmcb); - init_vmcb_after_set_cpuid(vcpu); + + svm_recalc_intercepts_after_set_cpuid(vcpu); vmcb_mark_all_dirty(vmcb); @@ -1392,8 +1236,6 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm_init_osvw(vcpu); if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) @@ -1490,13 +1332,15 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON_ONCE(!list_empty(&svm->ir_list)); + svm_leave_nested(vcpu); svm_free_nested(svm); sev_free_vcpu(vcpu); __free_page(__sme_pa_to_page(svm->vmcb01.pa)); - __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); + svm_vcpu_free_msrpm(svm->msrpm); } #ifdef CONFIG_CPU_MITIGATIONS @@ -2880,12 +2724,11 @@ static int svm_get_feature_msr(u32 msr, u64 *data) return 0; } -static bool -sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, + struct msr_data *msr_info) { return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected && - svm_msrpm_offset(msr_info->index) != MSR_INVALID && !msr_write_intercepted(vcpu, msr_info->index); } @@ -3116,11 +2959,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * * For nested: * The handling of the MSR bitmap for L2 guests is done in - * nested_svm_vmrun_msrpm. + * nested_svm_merge_msrpm(). * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && @@ -3186,8 +3029,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) /* * TSC_AUX is usually changed only during boot and never read - * directly. Intercept TSC_AUX instead of exposing it to the - * guest via direct_access_msrs, and switch it via user return. + * directly. Intercept TSC_AUX and switch it via user return. */ preempt_disable(); ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); @@ -4389,9 +4231,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_exit_irqoff(); } -static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, - bool force_immediate_exit) +static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) { + bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; struct vcpu_svm *svm = to_svm(vcpu); bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); @@ -4438,10 +4280,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, svm_hv_update_vp_id(svm->vmcb, vcpu); /* - * Run with all-zero DR6 unless needed, so that we can get the exact cause - * of a #DB. + * Run with all-zero DR6 unless the guest can write DR6 freely, so that + * KVM can get the exact cause of a #DB. Note, loading guest DR6 from + * KVM's snapshot is only necessary when DR accesses won't exit. 
*/ - if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6)) + svm_set_dr6(vcpu, vcpu->arch.dr6); + else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); @@ -4621,20 +4466,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (guest_cpuid_is_intel_compatible(vcpu)) guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - svm_recalc_instruction_intercepts(vcpu, svm); - - if (boot_cpu_has(X86_FEATURE_IBPB)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, - !!guest_has_pred_cmd_msr(vcpu)); - - if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, - !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); - if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); - init_vmcb_after_set_cpuid(vcpu); + svm_recalc_intercepts_after_set_cpuid(vcpu); } static bool svm_has_wbinvd_exit(void) @@ -5185,7 +5020,7 @@ static int svm_vm_init(struct kvm *kvm) } if (!pause_filter_count || !pause_filter_thresh) - kvm->arch.pause_in_guest = true; + kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); if (enable_apicv) { int ret = avic_vm_init(kvm); @@ -5252,7 +5087,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, - .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, @@ -5337,7 +5171,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .apic_init_signal_blocked = svm_apic_init_signal_blocked, - .msr_filter_changed = svm_msr_filter_changed, + .recalc_msr_intercepts = svm_recalc_msr_intercepts, .complete_emulated_msr = svm_complete_emulated_msr, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, @@ -5473,11 +5307,8 @@ static __init void svm_set_cpu_caps(void) static __init int svm_hardware_setup(void) { - int cpu; - struct page *iopm_pages; void *iopm_va; - int r; - unsigned int order = get_order(IOPM_SIZE); + int cpu, r; /* * NX is required for shadow paging and for NPT if the NX huge pages @@ -5489,17 +5320,6 @@ static __init int svm_hardware_setup(void) } kvm_enable_efer_bits(EFER_NX); - iopm_pages = alloc_pages(GFP_KERNEL, order); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = __sme_page_pa(iopm_pages); - - init_msrpm_offsets(); - kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); @@ -5533,6 +5353,10 @@ static __init int svm_hardware_setup(void) if (nested) { pr_info("Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + + r = nested_svm_init_msrpm_merge_offsets(); + if (r) + return r; } /* @@ -5564,6 +5388,13 @@ static __init int svm_hardware_setup(void) else pr_info("LBR virtualization supported\n"); } + + iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL); + if (!iopm_va) + return -ENOMEM; + + iopm_base = __sme_set(__pa(iopm_va)); + /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which * may be modified by svm_adjust_mmio_mask()), as well as nrips. 
@@ -5581,6 +5412,7 @@ static __init int svm_hardware_setup(void) enable_apicv = avic = avic && avic_hardware_setup(); if (!enable_apicv) { + enable_ipiv = false; svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; @@ -5662,6 +5494,8 @@ static int __init svm_init(void) { int r; + KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm); + __unused_size_checks(); if (!kvm_is_svm_supported()) |
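For reference, the MSR-permissions-map layout that the deleted svm_msrpm_offset()/set_msr_interception_bitmap() pair spread across a byte offset, a u32 offset and a bit index reduces to simple arithmetic: three 2 KiB ranges (based at 0x0, 0xc0000000 and 0xc0010000), 8192 MSRs per range, two bits per MSR, with the read bit at an even index and the write bit immediately after it. The sketch below is illustrative only; it is not the svm_*_msr_bitmap_*() helpers this patch actually introduces (those live outside this file), and the names msrpm_bit_nr(), MSRPM_BYTES_PER_RANGE and msrpm_clear_write_intercept() are invented for the example.

	#define MSRPM_BYTES_PER_RANGE	2048
	#define MSRPM_MSRS_PER_RANGE	(MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE / 2)

	static const u32 msrpm_range_bases[] = { 0x0, 0xc0000000, 0xc0010000 };

	/* Bit number of the read-intercept bit for @msr; the write bit is "+ 1". */
	static int msrpm_bit_nr(u32 msr)
	{
		int i;

		for (i = 0; i < ARRAY_SIZE(msrpm_range_bases); i++) {
			if (msr - msrpm_range_bases[i] < MSRPM_MSRS_PER_RANGE)
				return i * MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE +
				       (msr - msrpm_range_bases[i]) * 2;
		}
		return -ENOENT;
	}

	static void msrpm_clear_write_intercept(unsigned long *msrpm, u32 msr)
	{
		int bit = msrpm_bit_nr(msr);

		if (bit < 0)
			return;

		__clear_bit(bit + 1, msrpm);	/* cleared bit == don't intercept writes */
	}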
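Under the reworked scheme, passing an MSR through to the guest no longer means adding a direct_access_msrs[] entry plus shadow-bitmap bookkeeping: the intercept state is simply (re)computed in svm_recalc_msr_intercepts(), and userspace MSR-filter changes are honored because svm_set_intercept_for_msr() checks kvm_msr_allowed() before clearing a bit and the filter-changed hook now points at the same recalc path. A hypothetical one-liner, where MSR_FOO and X86_FEATURE_FOO are placeholders rather than real definitions, added inside svm_recalc_msr_intercepts():

	/*
	 * Hypothetical: pass MSR_FOO through whenever the guest has
	 * X86_FEATURE_FOO (neither name is a real definition).
	 */
	svm_set_intercept_for_msr(vcpu, MSR_FOO, MSR_TYPE_RW,
				  !guest_cpu_cap_has(vcpu, X86_FEATURE_FOO));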
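Both permission bitmaps now come from the same allocator, which sets every bit so that all MSR and I/O accesses are intercepted until explicitly opened up. Only the IOPM call is visible in this hunk; assuming the per-vCPU MSRPM caller keeps the old GFP_KERNEL_ACCOUNT policy, usage is roughly:

	void *iopm  = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL);
	void *msrpm = svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT);

	/* ... */

	svm_vcpu_free_msrpm(msrpm);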