diff options
Diffstat (limited to 'arch')
47 files changed, 1451 insertions, 1097 deletions
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 94d33e296e10..7b107fa540fa 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2130,6 +2130,11 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) return NULL; } +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + bool kvm_arch_has_irq_bypass(void) { return true; diff --git a/arch/arm64/kvm/irq.h b/arch/arm64/kvm/irq.h deleted file mode 100644 index 0d257de42c10..000000000000 --- a/arch/arm64/kvm/irq.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * irq.h: in kernel interrupt controller related definitions - * Copyright (c) 2016 Red Hat, Inc. - * - * This header is included by irqchip.c. However, on ARM, interrupt - * controller declarations are located in include/kvm/arm_vgic.h since - * they are mostly shared between arm and arm64. - */ - -#ifndef __IRQ_H -#define __IRQ_H - -#include <kvm/arm_vgic.h> - -#endif diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 60ee3d9f01f8..f154d4a7fae0 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1239,7 +1239,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ smp_rmb(); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, write_fault, &writable, NULL); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index e9744b41a226..4939f57b6f6a 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -598,7 +598,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, write_ok = true; } else { /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, writing, &write_ok, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 5d5e12f3bf86..9d3743ca16d5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -846,7 +846,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long pfn; /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, writing, upgrade_p, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h deleted file mode 100644 index e6463f866abc..000000000000 --- a/arch/powerpc/kvm/irq.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __IRQ_H -#define __IRQ_H - -#include <linux/kvm_host.h> - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - int ret = 0; - -#ifdef CONFIG_KVM_MPIC - ret = ret || (kvm->arch.mpic != NULL); -#endif -#ifdef CONFIG_KVM_XICS - ret = ret || (kvm->arch.xics != NULL); - ret = ret || (kvm->arch.xive != NULL); -#endif - smp_rmb(); - return ret; -} - -#endif diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b850b0efa201..04494a4fb37a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -36,7 +36,6 @@ #include <asm/setup.h> #include "timing.h" -#include "irq.h" #include "../mm/mmu_decl.h" #define CREATE_TRACE_POINTS @@ -2165,10 +2164,25 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) return 0; } +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + int ret = 0; + +#ifdef CONFIG_KVM_MPIC + ret = ret || (kvm->arch.mpic != NULL); +#endif +#ifdef CONFIG_KVM_XICS + ret = ret || (kvm->arch.xics != NULL); + ret = ret || (kvm->arch.xive != NULL); +#endif + smp_rmb(); + return ret; +} + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, bool line_status) { - if (!irqchip_in_kernel(kvm)) + if (!kvm_arch_irqchip_in_kernel(kvm)) return -ENXIO; irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h deleted file mode 100644 index 484608c71dd0..000000000000 --- a/arch/s390/kvm/irq.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * s390 irqchip routines - * - * Copyright IBM Corp. 2014 - * - * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com> - */ -#ifndef __KVM_IRQ_H -#define __KVM_IRQ_H - -#include <linux/kvm_host.h> - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 1; -} - -#endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bc491a73815c..5c7532dbc96b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5567,6 +5567,11 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return true; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 8259d725054d..4dbde69c423b 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1603,10 +1603,8 @@ clear_arch_lbr: * x86_perf_get_lbr - get the LBR records information * * @lbr: the caller's memory to store the LBR records information - * - * Returns: 0 indicates the LBR info has been successfully obtained */ -int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { int lbr_fmt = x86_pmu.intel_cap.lbr_format; @@ -1614,8 +1612,6 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0; - - return 0; } EXPORT_SYMBOL_GPL(x86_perf_get_lbr); diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 82ba4a564e58..ea58e67e9a67 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -110,10 +110,12 @@ KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) +#ifdef CONFIG_KVM_SMM KVM_X86_OP(smi_allowed) KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) +#endif KVM_X86_OP_OPTIONAL(mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f05ebaa26f0f..598eb3b9ae44 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -81,7 +81,9 @@ #define KVM_REQ_NMI KVM_ARCH_REQ(9) #define KVM_REQ_PMU KVM_ARCH_REQ(10) #define KVM_REQ_PMI KVM_ARCH_REQ(11) +#ifdef CONFIG_KVM_SMM #define KVM_REQ_SMI KVM_ARCH_REQ(12) +#endif #define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) #define KVM_REQ_MCLOCK_INPROGRESS \ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) @@ -204,6 +206,7 @@ typedef enum exit_fastpath_completion fastpath_t; struct x86_emulate_ctxt; struct x86_exception; +union kvm_smram; enum x86_intercept; enum x86_intercept_stage; @@ -253,16 +256,16 @@ enum x86_intercept_stage; #define PFERR_GUEST_PAGE_BIT 33 #define PFERR_IMPLICIT_ACCESS_BIT 48 -#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) -#define PFERR_USER_MASK (1U << PFERR_USER_BIT) -#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) -#define PFERR_PK_MASK (1U << PFERR_PK_BIT) -#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT) +#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) +#define PFERR_USER_MASK BIT(PFERR_USER_BIT) +#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) +#define PFERR_PK_MASK BIT(PFERR_PK_BIT) +#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) +#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ @@ -488,17 +491,19 @@ enum pmc_type { struct kvm_pmc { enum pmc_type type; u8 idx; + bool is_paused; + bool intr; u64 counter; + u64 prev_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* + * only for creating or reusing perf_event, * eventsel value for general purpose counters, * ctrl value for fixed counters. */ u64 current_config; - bool is_paused; - bool intr; }; /* More counters may conflict with other existing Architectural MSRs */ @@ -524,7 +529,16 @@ struct kvm_pmu { struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; struct irq_work irq_work; - DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + + /* + * Overlay the bitmap with a 64-bit atomic so that all bits can be + * set in a single access, e.g. to reprogram all counters when the PMU + * filter changes. + */ + union { + DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + atomic64_t __reprogram_pmi; + }; DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); @@ -1156,7 +1170,18 @@ struct kvm_arch { struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; - struct list_head lpage_disallowed_mmu_pages; + /* + * A list of kvm_mmu_page structs that, if zapped, could possibly be + * replaced by an NX huge page. A shadow page is on this list if its + * existence disallows an NX huge page (nx_huge_page_disallowed is set) + * and there are no other conditions that prevent a huge page, e.g. + * the backing host page is huge, dirtly logging is not enabled for its + * memslot, etc... Note, zapping shadow pages on this list doesn't + * guarantee an NX huge page will be created in its stead, e.g. if the + * guest attempts to execute from the region then KVM obviously can't + * create an NX huge page (without hanging the guest). + */ + struct list_head possible_nx_huge_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; /* @@ -1272,7 +1297,7 @@ struct kvm_arch { bool sgx_provisioning_allowed; struct kvm_pmu_event_filter __rcu *pmu_event_filter; - struct task_struct *nx_lpage_recovery_thread; + struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 /* @@ -1284,6 +1309,9 @@ struct kvm_arch { */ bool tdp_mmu_enabled; + /* The number of TDP MMU pages across all roots. */ + atomic64_t tdp_mmu_pages; + /* * List of kvm_mmu_page structs being used as roots. * All kvm_mmu_page structs in the list should have @@ -1305,20 +1333,12 @@ struct kvm_arch { struct list_head tdp_mmu_roots; /* - * List of kvm_mmu_page structs not being used as roots. - * All kvm_mmu_page structs in the list should have - * tdp_mmu_page set and a tdp_mmu_root_count of 0. - */ - struct list_head tdp_mmu_pages; - - /* * Protects accesses to the following fields when the MMU lock * is held in read mode: * - tdp_mmu_roots (above) - * - tdp_mmu_pages (above) * - the link field of kvm_mmu_page structs used by the TDP MMU - * - lpage_disallowed_mmu_pages - * - the lpage_disallowed_link field of kvm_mmu_page structs used + * - possible_nx_huge_pages; + * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU * It is acceptable, but not necessary, to acquire this lock when * the thread holds the MMU lock in write mode. @@ -1612,10 +1632,12 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_SMM int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); - int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); + int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram); + int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram); void (*enable_smi_window)(struct kvm_vcpu *vcpu); +#endif int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); @@ -1844,6 +1866,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); @@ -1909,8 +1932,6 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); -gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, @@ -1994,14 +2015,18 @@ enum { #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ + +#ifdef CONFIG_KVM_SMM #define HF_SMM_MASK (1 << 6) #define HF_SMM_INSIDE_NMI_MASK (1 << 7) -#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE -#define KVM_ADDRESS_SPACE_NUM 2 - -#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) -#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +# define KVM_ADDRESS_SPACE_NUM 2 +# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) +# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +#else +# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) +#endif #define KVM_ARCH_WANT_MMU_NOTIFIER @@ -2089,12 +2114,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #endif } -#define put_smstate(type, buf, offset, val) \ - *(type *)((buf) + (offset) - 0x7e00) = val - -#define GET_SMSTATE(type, buf, offset) \ - (*(type *)((buf) + (offset) - 0x7e00)) - int kvm_cpu_dirty_log_size(void); int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 9ac46dbe57d4..5d0f6891ae61 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -543,12 +543,12 @@ static inline void perf_check_microcode(void) { } #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); -extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); +extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); -static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +static inline void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { - return -1; + memset(lbr, 0, sizeof(*lbr)); } #endif diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 0361626841bc..4352b46dd20c 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -293,12 +293,13 @@ struct vmcb_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u8 reserved_1[42]; + /* Reserved fields are named following their struct offset */ + u8 reserved_0xa0[42]; u8 vmpl; u8 cpl; - u8 reserved_2[4]; + u8 reserved_0xcc[4]; u64 efer; - u8 reserved_3[112]; + u8 reserved_0xd8[112]; u64 cr4; u64 cr3; u64 cr0; @@ -306,7 +307,7 @@ struct vmcb_save_area { u64 dr6; u64 rflags; u64 rip; - u8 reserved_4[88]; + u8 reserved_0x180[88]; u64 rsp; u64 s_cet; u64 ssp; @@ -321,14 +322,14 @@ struct vmcb_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_5[32]; + u8 reserved_0x248[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - u8 reserved_6[72]; + u8 reserved_0x298[72]; u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ } __packed; @@ -349,12 +350,12 @@ struct sev_es_save_area { u64 vmpl2_ssp; u64 vmpl3_ssp; u64 u_cet; - u8 reserved_1[2]; + u8 reserved_0xc8[2]; u8 vmpl; u8 cpl; - u8 reserved_2[4]; + u8 reserved_0xcc[4]; u64 efer; - u8 reserved_3[104]; + u8 reserved_0xd8[104]; u64 xss; u64 cr4; u64 cr3; @@ -371,7 +372,7 @@ struct sev_es_save_area { u64 dr1_addr_mask; u64 dr2_addr_mask; u64 dr3_addr_mask; - u8 reserved_4[24]; + u8 reserved_0x1c0[24]; u64 rsp; u64 s_cet; u64 ssp; @@ -386,21 +387,21 @@ struct sev_es_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_5[32]; + u8 reserved_0x248[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - u8 reserved_7[80]; + u8 reserved_0x298[80]; u32 pkru; - u8 reserved_8[20]; - u64 reserved_9; /* rax already available at 0x01f8 */ + u32 tsc_aux; + u8 reserved_0x2f0[24]; u64 rcx; u64 rdx; u64 rbx; - u64 reserved_10; /* rsp already available at 0x01d8 */ + u64 reserved_0x320; /* rsp already available at 0x01d8 */ u64 rbp; u64 rsi; u64 rdi; @@ -412,7 +413,7 @@ struct sev_es_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_11[16]; + u8 reserved_0x380[16]; u64 guest_exit_info_1; u64 guest_exit_info_2; u64 guest_exit_int_info; @@ -425,7 +426,7 @@ struct sev_es_save_area { u64 pcpu_id; u64 event_inj; u64 xcr0; - u8 reserved_12[16]; + u8 reserved_0x3f0[16]; /* Floating point area */ u64 x87_dp; @@ -443,23 +444,23 @@ struct sev_es_save_area { } __packed; struct ghcb_save_area { - u8 reserved_1[203]; + u8 reserved_0x0[203]; u8 cpl; - u8 reserved_2[116]; + u8 reserved_0xcc[116]; u64 xss; - u8 reserved_3[24]; + u8 reserved_0x148[24]; u64 dr7; - u8 reserved_4[16]; + u8 reserved_0x168[16]; u64 rip; - u8 reserved_5[88]; + u8 reserved_0x180[88]; u64 rsp; - u8 reserved_6[24]; + u8 reserved_0x1e0[24]; u64 rax; - u8 reserved_7[264]; + u8 reserved_0x200[264]; u64 rcx; u64 rdx; u64 rbx; - u8 reserved_8[8]; + u8 reserved_0x320[8]; u64 rbp; u64 rsi; u64 rdi; @@ -471,12 +472,12 @@ struct ghcb_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_9[16]; + u8 reserved_0x380[16]; u64 sw_exit_code; u64 sw_exit_info_1; u64 sw_exit_info_2; u64 sw_scratch; - u8 reserved_10[56]; + u8 reserved_0x3b0[56]; u64 xcr0; u8 valid_bitmap[16]; u64 x87_state_gpa; @@ -490,7 +491,7 @@ struct ghcb { u8 shared_buffer[GHCB_SHARED_BUF_SIZE]; - u8 reserved_1[10]; + u8 reserved_0xff0[10]; u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ u32 ghcb_usage; } __packed; @@ -502,6 +503,9 @@ struct ghcb { #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE +#define BUILD_BUG_RESERVED_OFFSET(x, y) \ + ASSERT_STRUCT_OFFSET(struct x, reserved ## _ ## y, y) + static inline void __unused_size_checks(void) { BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); @@ -509,6 +513,39 @@ static inline void __unused_size_checks(void) BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); + + /* Check offsets of reserved fields */ + + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xa0); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xd8); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x180); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x248); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x298); + + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xc8); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xd8); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x2f0); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0); + + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x0); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x148); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x168); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x180); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x1e0); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x200); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x320); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x380); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x3b0); + + BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0); } struct vmcb { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 46de10a809ec..c6df6b16a088 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -214,6 +214,8 @@ struct kvm_msr_list { struct kvm_msr_filter_range { #define KVM_MSR_FILTER_READ (1 << 0) #define KVM_MSR_FILTER_WRITE (1 << 1) +#define KVM_MSR_FILTER_RANGE_VALID_MASK (KVM_MSR_FILTER_READ | \ + KVM_MSR_FILTER_WRITE) __u32 flags; __u32 nmsrs; /* number of msrs in bitmap */ __u32 base; /* MSR index the bitmap starts at */ @@ -222,8 +224,11 @@ struct kvm_msr_filter_range { #define KVM_MSR_FILTER_MAX_RANGES 16 struct kvm_msr_filter { +#ifndef __KERNEL__ #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#endif #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) +#define KVM_MSR_FILTER_VALID_MASK (KVM_MSR_FILTER_DEFAULT_DENY) __u32 flags; struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; }; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d4e48b4a438b..cf886f86038a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -349,7 +349,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { - u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + u64 pa; WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 67be7f217e37..fbeaa9ddef59 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -118,6 +118,17 @@ config KVM_AMD_SEV Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. +config KVM_SMM + bool "System Management Mode emulation" + default y + depends on KVM + help + Provides support for KVM to emulate System Management Mode (SMM) + in virtual machines. This can be used by the virtual machine + firmware to implement UEFI secure boot. + + If unsure, say Y. + config KVM_XEN bool "Support for Xen hypercall interface" depends on KVM diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f453a0f96e24..b8a494b6a5ec 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,6 +20,7 @@ endif kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-$(CONFIG_KVM_XEN) += xen.o +kvm-$(CONFIG_KVM_SMM) += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 62bc7a01cecc..6b5912578edd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -62,7 +62,7 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) * This one is tied to SSB in the user API, and not * visible in /proc/cpuinfo. */ -#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ +#define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) @@ -694,7 +694,7 @@ void kvm_set_cpu_caps(void) F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | - __feature_bit(KVM_X86_FEATURE_PSFD) + __feature_bit(KVM_X86_FEATURE_AMD_PSFD) ); /* diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4a43261d25a2..5cc3efa0e21c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -242,37 +242,6 @@ enum x86_transfer_type { X86_TRANSFER_TASK_SWITCH, }; -static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) -{ - if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) - nr &= NR_EMULATOR_GPRS - 1; - - if (!(ctxt->regs_valid & (1 << nr))) { - ctxt->regs_valid |= 1 << nr; - ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); - } - return ctxt->_regs[nr]; -} - -static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) -{ - if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) - nr &= NR_EMULATOR_GPRS - 1; - - BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); - BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); - - ctxt->regs_valid |= 1 << nr; - ctxt->regs_dirty |= 1 << nr; - return &ctxt->_regs[nr]; -} - -static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) -{ - reg_read(ctxt, nr); - return reg_write(ctxt, nr); -} - static void writeback_registers(struct x86_emulate_ctxt *ctxt) { unsigned long dirty = ctxt->regs_dirty; @@ -2338,335 +2307,15 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) return rc; } -static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) -{ -#ifdef CONFIG_X86_64 - return ctxt->ops->guest_has_long_mode(ctxt); -#else - return false; -#endif -} - -static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) -{ - desc->g = (flags >> 23) & 1; - desc->d = (flags >> 22) & 1; - desc->l = (flags >> 21) & 1; - desc->avl = (flags >> 20) & 1; - desc->p = (flags >> 15) & 1; - desc->dpl = (flags >> 13) & 3; - desc->s = (flags >> 12) & 1; - desc->type = (flags >> 8) & 15; -} - -static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate, - int n) -{ - struct desc_struct desc; - int offset; - u16 selector; - - selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4); - - if (n < 3) - offset = 0x7f84 + n * 12; - else - offset = 0x7f2c + (n - 3) * 12; - - set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); - return X86EMUL_CONTINUE; -} - -#ifdef CONFIG_X86_64 -static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate, - int n) -{ - struct desc_struct desc; - int offset; - u16 selector; - u32 base3; - - offset = 0x7e00 + n * 16; - - selector = GET_SMSTATE(u16, smstate, offset); - rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); - base3 = GET_SMSTATE(u32, smstate, offset + 12); - - ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); - return X86EMUL_CONTINUE; -} -#endif - -static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, - u64 cr0, u64 cr3, u64 cr4) -{ - int bad; - u64 pcid; - - /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ - pcid = 0; - if (cr4 & X86_CR4_PCIDE) { - pcid = cr3 & 0xfff; - cr3 &= ~0xfff; - } - - bad = ctxt->ops->set_cr(ctxt, 3, cr3); - if (bad) - return X86EMUL_UNHANDLEABLE; - - /* - * First enable PAE, long mode needs it before CR0.PG = 1 is set. - * Then enable protected mode. However, PCID cannot be enabled - * if EFER.LMA=0, so set it separately. - */ - bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - if (bad) - return X86EMUL_UNHANDLEABLE; - - bad = ctxt->ops->set_cr(ctxt, 0, cr0); - if (bad) - return X86EMUL_UNHANDLEABLE; - - if (cr4 & X86_CR4_PCIDE) { - bad = ctxt->ops->set_cr(ctxt, 4, cr4); - if (bad) - return X86EMUL_UNHANDLEABLE; - if (pcid) { - bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); - if (bad) - return X86EMUL_UNHANDLEABLE; - } - - } - - return X86EMUL_CONTINUE; -} - -static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, - const char *smstate) -{ - struct desc_struct desc; - struct desc_ptr dt; - u16 selector; - u32 val, cr0, cr3, cr4; - int i; - - cr0 = GET_SMSTATE(u32, smstate, 0x7ffc); - cr3 = GET_SMSTATE(u32, smstate, 0x7ff8); - ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; - ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - - for (i = 0; i < 8; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); - - val = GET_SMSTATE(u32, smstate, 0x7fcc); - - if (ctxt->ops->set_dr(ctxt, 6, val)) - return X86EMUL_UNHANDLEABLE; - - val = GET_SMSTATE(u32, smstate, 0x7fc8); - - if (ctxt->ops->set_dr(ctxt, 7, val)) - return X86EMUL_UNHANDLEABLE; - - selector = GET_SMSTATE(u32, smstate, 0x7fc4); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); - - selector = GET_SMSTATE(u32, smstate, 0x7fc0); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80)); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78)); - ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); - - dt.address = GET_SMSTATE(u32, smstate, 0x7f74); - dt.size = GET_SMSTATE(u32, smstate, 0x7f70); - ctxt->ops->set_gdt(ctxt, &dt); - - dt.address = GET_SMSTATE(u32, smstate, 0x7f58); - dt.size = GET_SMSTATE(u32, smstate, 0x7f54); - ctxt->ops->set_idt(ctxt, &dt); - - for (i = 0; i < 6; i++) { - int r = rsm_load_seg_32(ctxt, smstate, i); - if (r != X86EMUL_CONTINUE) - return r; - } - - cr4 = GET_SMSTATE(u32, smstate, 0x7f14); - - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8)); - - return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); -} - -#ifdef CONFIG_X86_64 -static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, - const char *smstate) -{ - struct desc_struct desc; - struct desc_ptr dt; - u64 val, cr0, cr3, cr4; - u32 base3; - u16 selector; - int i, r; - - for (i = 0; i < 16; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); - - ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); - ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; - - val = GET_SMSTATE(u64, smstate, 0x7f68); - - if (ctxt->ops->set_dr(ctxt, 6, val)) - return X86EMUL_UNHANDLEABLE; - - val = GET_SMSTATE(u64, smstate, 0x7f60); - - if (ctxt->ops->set_dr(ctxt, 7, val)) - return X86EMUL_UNHANDLEABLE; - - cr0 = GET_SMSTATE(u64, smstate, 0x7f58); - cr3 = GET_SMSTATE(u64, smstate, 0x7f50); - cr4 = GET_SMSTATE(u64, smstate, 0x7f48); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); - val = GET_SMSTATE(u64, smstate, 0x7ed0); - - if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA)) - return X86EMUL_UNHANDLEABLE; - - selector = GET_SMSTATE(u32, smstate, 0x7e90); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98)); - base3 = GET_SMSTATE(u32, smstate, 0x7e9c); - ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); - - dt.size = GET_SMSTATE(u32, smstate, 0x7e84); - dt.address = GET_SMSTATE(u64, smstate, 0x7e88); - ctxt->ops->set_idt(ctxt, &dt); - - selector = GET_SMSTATE(u32, smstate, 0x7e70); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74)); - set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78)); - base3 = GET_SMSTATE(u32, smstate, 0x7e7c); - ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); - - dt.size = GET_SMSTATE(u32, smstate, 0x7e64); - dt.address = GET_SMSTATE(u64, smstate, 0x7e68); - ctxt->ops->set_gdt(ctxt, &dt); - - r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); - if (r != X86EMUL_CONTINUE) - return r; - - for (i = 0; i < 6; i++) { - r = rsm_load_seg_64(ctxt, smstate, i); - if (r != X86EMUL_CONTINUE) - return r; - } - - return X86EMUL_CONTINUE; -} -#endif - static int em_rsm(struct x86_emulate_ctxt *ctxt) { - unsigned long cr0, cr4, efer; - char buf[512]; - u64 smbase; - int ret; - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); - smbase = ctxt->ops->get_smbase(ctxt); - - ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf)); - if (ret != X86EMUL_CONTINUE) - return X86EMUL_UNHANDLEABLE; - - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) - ctxt->ops->set_nmi_mask(ctxt, false); - - ctxt->ops->exiting_smm(ctxt); - - /* - * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU - * supports long mode. - */ - if (emulator_has_longmode(ctxt)) { - struct desc_struct cs_desc; - - /* Zero CR4.PCIDE before CR0.PG. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); - if (cr4 & X86_CR4_PCIDE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - - /* A 32-bit code segment is required to clear EFER.LMA. */ - memset(&cs_desc, 0, sizeof(cs_desc)); - cs_desc.type = 0xb; - cs_desc.s = cs_desc.g = cs_desc.p = 1; - ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); - } - - /* For the 64-bit case, this will clear EFER.LMA. */ - cr0 = ctxt->ops->get_cr(ctxt, 0); - if (cr0 & X86_CR0_PE) - ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - - if (emulator_has_longmode(ctxt)) { - /* Clear CR4.PAE before clearing EFER.LME. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); - if (cr4 & X86_CR4_PAE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); - - /* And finally go back to 32-bit mode. */ - efer = 0; - ctxt->ops->set_msr(ctxt, MSR_EFER, efer); - } - - /* - * Give leave_smm() a chance to make ISA-specific changes to the vCPU - * state (e.g. enter guest mode) before loading state from the SMM - * state-save area. - */ - if (ctxt->ops->leave_smm(ctxt, buf)) - goto emulate_shutdown; - -#ifdef CONFIG_X86_64 - if (emulator_has_longmode(ctxt)) - ret = rsm_load_state_64(ctxt, buf); - else -#endif - ret = rsm_load_state_32(ctxt, buf); - - if (ret != X86EMUL_CONTINUE) - goto emulate_shutdown; + if (ctxt->ops->leave_smm(ctxt)) + ctxt->ops->triple_fault(ctxt); - /* - * Note, the ctxt->ops callbacks are responsible for handling side - * effects when writing MSRs and CRs, e.g. MMU context resets, CPUID - * runtime updates, etc... If that changes, e.g. this flow is moved - * out of the emulator to make it look more like enter_smm(), then - * those side effects need to be explicitly handled for both success - * and shutdown. - */ return emulator_recalc_and_set_mode(ctxt); - -emulate_shutdown: - ctxt->ops->triple_fault(ctxt); - return X86EMUL_CONTINUE; } static void diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index f371f1292ca3..d8d50558f165 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -165,3 +165,8 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm); } + +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 3febc342360c..c09174f73a34 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -200,9 +200,4 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu) return vcpu->arch.hflags & HF_GUEST_MASK; } -static inline bool is_smm(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hflags & HF_SMM_MASK; -} - #endif diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 89246446d6aa..2d9662be8333 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -117,16 +117,6 @@ struct x86_emulate_ops { struct x86_exception *fault, bool system); /* - * read_phys: Read bytes of standard (non-emulated/special) memory. - * Used for descriptor reading. - * @addr: [IN ] Physical address from which to read. - * @val: [OUT] Value read from memory. - * @bytes: [IN ] Number of bytes to read from memory. - */ - int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, - void *val, unsigned int bytes); - - /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. * @addr: [IN ] Linear address to which to write. @@ -209,11 +199,8 @@ struct x86_emulate_ops { int (*cpl)(struct x86_emulate_ctxt *ctxt); void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); - u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); - void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); - int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); @@ -234,8 +221,7 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); - void (*exiting_smm)(struct x86_emulate_ctxt *ctxt); - int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); + int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; @@ -292,7 +278,6 @@ enum x86emul_mode { /* These match some of the HF_* flags defined in kvm_host.h */ #define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ #define X86EMUL_SMM_MASK (1 << 6) -#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7) /* * fastop functions are declared as taking a never-defined fastop parameter, @@ -526,4 +511,35 @@ void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); +static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + + if (!(ctxt->regs_valid & (1 << nr))) { + ctxt->regs_valid |= 1 << nr; + ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); + } + return ctxt->_regs[nr]; +} + +static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + + BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + + ctxt->regs_valid |= 1 << nr; + ctxt->regs_dirty |= 1 << nr; + return &ctxt->_regs[nr]; +} + +static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + reg_read(ctxt, nr); + return reg_write(ctxt, nr); +} + #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d7639d126e6c..1bb63746e991 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -42,6 +42,7 @@ #include "x86.h" #include "cpuid.h" #include "hyperv.h" +#include "smm.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -1170,9 +1171,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_SMI: - result = 1; - kvm_make_request(KVM_REQ_SMI, vcpu); - kvm_vcpu_kick(vcpu); + if (!kvm_inject_smi(vcpu)) { + kvm_vcpu_kick(vcpu); + result = 1; + } break; case APIC_DM_NMI: diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a5ac4a5a5179..28e3769066e2 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -7,7 +7,7 @@ #include <linux/kvm_host.h> #include "hyperv.h" -#include "kvm_cache_regs.h" +#include "smm.h" #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6f81539061d6..cfff74685a25 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -22,6 +22,7 @@ #include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "smm.h" #include "kvm_emulate.h" #include "cpuid.h" #include "spte.h" @@ -802,15 +803,31 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - if (sp->lpage_disallowed) + /* + * If it's possible to replace the shadow page with an NX huge page, + * i.e. if the shadow page is the only thing currently preventing KVM + * from using a huge page, add the shadow page to the list of "to be + * zapped for NX recovery" pages. Note, the shadow page can already be + * on the list if KVM is reusing an existing shadow page, i.e. if KVM + * links a shadow page at multiple points. + */ + if (!list_empty(&sp->possible_nx_huge_page_link)) return; ++kvm->stat.nx_lpage_splits; - list_add_tail(&sp->lpage_disallowed_link, - &kvm->arch.lpage_disallowed_mmu_pages); - sp->lpage_disallowed = true; + list_add_tail(&sp->possible_nx_huge_page_link, + &kvm->arch.possible_nx_huge_pages); +} + +static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool nx_huge_page_possible) +{ + sp->nx_huge_page_disallowed = true; + + if (nx_huge_page_possible) + track_possible_nx_huge_page(kvm, sp); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -830,11 +847,20 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { + if (list_empty(&sp->possible_nx_huge_page_link)) + return; + --kvm->stat.nx_lpage_splits; - sp->lpage_disallowed = false; - list_del(&sp->lpage_disallowed_link); + list_del_init(&sp->possible_nx_huge_page_link); +} + +static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + sp->nx_huge_page_disallowed = false; + + untrack_possible_nx_huge_page(kvm, sp); } static struct kvm_memory_slot * @@ -1645,7 +1671,7 @@ static int is_empty_shadow_page(u64 *spt) u64 *pos; u64 *end; - for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) + for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++) if (is_shadow_present_pte(*pos)) { printk(KERN_ERR "%s: %p %llx\n", __func__, pos, *pos); @@ -1793,7 +1819,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(ent); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -1894,7 +1920,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->role.invalid) return true; - /* TDP MMU pages due not use the MMU generation. */ + /* TDP MMU pages do not use the MMU generation. */ return !sp->tdp_mmu_page && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } @@ -2129,6 +2155,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); + /* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() * depends on valid pages being added to the head of the list. See @@ -2350,7 +2378,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(*sptep); if (child->role.access == direct_access) return; @@ -2371,7 +2399,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { - child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(pte); drop_parent_pte(child, spte); /* @@ -2486,8 +2514,8 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, zapped_root = !is_obsolete_sp(kvm, sp); } - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + if (sp->nx_huge_page_disallowed) + unaccount_nx_huge_page(kvm, sp); sp->role.invalid = 1; @@ -2810,7 +2838,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, struct kvm_mmu_page *child; u64 pte = *sptep; - child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); + child = spte_to_child_sp(pte); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -3084,7 +3112,8 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ if (cur_level > PG_LEVEL_4K && cur_level == fault->goal_level && is_shadow_present_pte(spte) && - !is_large_pte(spte)) { + !is_large_pte(spte) && + spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* * A small SPTE exists for this pfn, but FNAME(fetch) * and __direct_map would like to create a large PTE @@ -3126,9 +3155,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) continue; link_shadow_page(vcpu, it.sptep, sp); - if (fault->is_tdp && fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); + if (fault->huge_page_disallowed) + account_nx_huge_page(vcpu->kvm, sp, + fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) @@ -3148,8 +3177,13 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); } -static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) { + if (is_sigpending_pfn(pfn)) { + kvm_handle_signal_exit(vcpu); + return -EINTR; + } + /* * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can @@ -3171,7 +3205,7 @@ static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fau { /* The pfn is invalid, report the error! */ if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); + return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); if (unlikely(!fault->slot)) { gva_t gva = fault->is_tdp ? 0 : fault->addr; @@ -3422,7 +3456,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK); + /* + * The "root" may be a special root, e.g. a PAE entry, treat it as a + * SPTE to ensure any non-PA bits are dropped. + */ + sp = spte_to_child_sp(*root_hpa); if (WARN_ON(!sp)) return; @@ -3907,8 +3945,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { - root &= SPTE_BASE_ADDR_MASK; - sp = to_shadow_page(root); + sp = spte_to_child_sp(root); mmu_sync_children(vcpu, sp, true); } } @@ -4169,7 +4206,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } async = false; - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, &fault->hva); if (!async) @@ -4186,7 +4223,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } } - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, + /* + * Allow gup to bail on pending non-fatal signals when it's also allowed + * to wait for IO. Note, gup always bails if it is unable to quickly + * get a page and a fatal signal, i.e. SIGKILL, is pending. + */ + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL, fault->write, &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; @@ -5971,7 +6013,7 @@ int kvm_mmu_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); r = kvm_mmu_init_tdp_mmu(kvm); @@ -6656,7 +6698,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - wake_up_process(kvm->arch.nx_lpage_recovery_thread); + wake_up_process(kvm->arch.nx_huge_page_recovery_thread); } mutex_unlock(&kvm_lock); } @@ -6788,7 +6830,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - wake_up_process(kvm->arch.nx_lpage_recovery_thread); + wake_up_process(kvm->arch.nx_huge_page_recovery_thread); mutex_unlock(&kvm_lock); } @@ -6796,9 +6838,10 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel return err; } -static void kvm_recover_nx_lpages(struct kvm *kvm) +static void kvm_recover_nx_huge_pages(struct kvm *kvm) { unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; + struct kvm_memory_slot *slot; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; @@ -6819,24 +6862,39 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { - if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) + if (list_empty(&kvm->arch.possible_nx_huge_pages)) break; /* * We use a separate list instead of just using active_mmu_pages - * because the number of lpage_disallowed pages is expected to - * be relatively small compared to the total. + * because the number of shadow pages that be replaced with an + * NX huge page is expected to be relatively small compared to + * the total number of shadow pages. And because the TDP MMU + * doesn't use active_mmu_pages. */ - sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, struct kvm_mmu_page, - lpage_disallowed_link); - WARN_ON_ONCE(!sp->lpage_disallowed); - if (is_tdp_mmu_page(sp)) { + possible_nx_huge_page_link); + WARN_ON_ONCE(!sp->nx_huge_page_disallowed); + WARN_ON_ONCE(!sp->role.direct); + + slot = gfn_to_memslot(kvm, sp->gfn); + WARN_ON_ONCE(!slot); + + /* + * Unaccount and do not attempt to recover any NX Huge Pages + * that are being dirty tracked, as they would just be faulted + * back in as 4KiB pages. The NX Huge Pages in this slot will be + * recovered, along with all the other huge pages in the slot, + * when dirty logging is disabled. + */ + if (slot && kvm_slot_dirty_track_enabled(slot)) + unaccount_nx_huge_page(kvm, sp); + else if (is_tdp_mmu_page(sp)) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); - } else { + else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->lpage_disallowed); - } + WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); @@ -6856,7 +6914,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, rcu_idx); } -static long get_nx_lpage_recovery_timeout(u64 start_time) +static long get_nx_huge_page_recovery_timeout(u64 start_time) { bool enabled; uint period; @@ -6867,19 +6925,19 @@ static long get_nx_lpage_recovery_timeout(u64 start_time) : MAX_SCHEDULE_TIMEOUT; } -static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) { u64 start_time; long remaining_time; while (true) { start_time = get_jiffies_64(); - remaining_time = get_nx_lpage_recovery_timeout(start_time); + remaining_time = get_nx_huge_page_recovery_timeout(start_time); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop() && remaining_time > 0) { schedule_timeout(remaining_time); - remaining_time = get_nx_lpage_recovery_timeout(start_time); + remaining_time = get_nx_huge_page_recovery_timeout(start_time); set_current_state(TASK_INTERRUPTIBLE); } @@ -6888,7 +6946,7 @@ static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) if (kthread_should_stop()) return 0; - kvm_recover_nx_lpages(kvm); + kvm_recover_nx_huge_pages(kvm); } } @@ -6896,17 +6954,17 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) { int err; - err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, "kvm-nx-lpage-recovery", - &kvm->arch.nx_lpage_recovery_thread); + &kvm->arch.nx_huge_page_recovery_thread); if (!err) - kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + kthread_unpark(kvm->arch.nx_huge_page_recovery_thread); return err; } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { - if (kvm->arch.nx_lpage_recovery_thread) - kthread_stop(kvm->arch.nx_lpage_recovery_thread); + if (kvm->arch.nx_huge_page_recovery_thread) + kthread_stop(kvm->arch.nx_huge_page_recovery_thread); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 582def531d4d..dbaf6755c5a7 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -57,7 +57,13 @@ struct kvm_mmu_page { bool tdp_mmu_page; bool unsync; u8 mmu_valid_gen; - bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + + /* + * The shadow page can't be replaced by an equivalent huge page + * because it is being used to map an executable page in the guest + * and the NX huge page mitigation is enabled. + */ + bool nx_huge_page_disallowed; /* * The following two entries are used to key the shadow page in the @@ -100,7 +106,14 @@ struct kvm_mmu_page { }; }; - struct list_head lpage_disallowed_link; + /* + * Tracks shadow pages that, if zapped, would allow KVM to create an NX + * huge page. A shadow page will have nx_huge_page_disallowed set but + * not be on the list if a huge page is disallowed for other reasons, + * e.g. because KVM is shadowing a PTE at the same gfn, the memslot + * isn't properly aligned, etc... + */ + struct list_head possible_nx_huge_page_link; #ifdef CONFIG_X86_32 /* * Used out of the mmu-lock to avoid reading spte values while an @@ -120,18 +133,6 @@ struct kvm_mmu_page { extern struct kmem_cache *mmu_page_header_cache; -static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) -{ - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - - return (struct kvm_mmu_page *)page_private(page); -} - -static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) -{ - return to_shadow_page(__pa(sptep)); -} - static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role) { return role.smm ? 1 : 0; @@ -315,7 +316,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); -void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 5ab5f94dcb6f..0f6455072055 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -713,9 +713,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, continue; link_shadow_page(vcpu, it.sptep, sp); - if (fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); + if (fault->huge_page_disallowed) + account_nx_huge_page(vcpu->kvm, sp, + fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 2e08b2a45361..c0fd7e049b4e 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -161,6 +161,18 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (!prefetch) spte |= spte_shadow_accessed_mask(spte); + /* + * For simplicity, enforce the NX huge page mitigation even if not + * strictly necessary. KVM could ignore the mitigation if paging is + * disabled in the guest, as the guest doesn't have an page tables to + * abuse. But to safely ignore the mitigation, KVM would have to + * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG + * is toggled on, and that's a net negative for performance when TDP is + * enabled. When TDP is disabled, KVM will always switch to a new MMU + * when CR0.PG is toggled, but leveraging that to ignore the mitigation + * would tie make_spte() further to vCPU/MMU state, and add complexity + * just to optimize a mode that is anything but performance critical. + */ if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 7670c13ce251..1f03701b943a 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -188,7 +188,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * should not modify the SPTE. * * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on - * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF + * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF * vulnerability. Use only low bits to avoid 64-bit immediates. * * Only used by the TDP MMU. @@ -219,6 +219,23 @@ static inline int spte_index(u64 *sptep) */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; +static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) +{ + struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page_private(page); +} + +static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte) +{ + return to_shadow_page(spte & SPTE_BASE_ADDR_MASK); +} + +static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) +{ + return to_shadow_page(__pa(sptep)); +} + static inline bool is_mmio_spte(u64 spte) { return (spte & shadow_mmio_mask) == shadow_mmio_value && diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 672f0432d777..771210ce5181 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -29,7 +29,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm) kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); - INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); kvm->arch.tdp_mmu_zap_wq = wq; return 1; } @@ -54,7 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) /* Also waits for any queued work items. */ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); - WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); + WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -284,6 +283,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, gfn_t gfn, union kvm_mmu_page_role role) { + INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); sp->role = role; @@ -375,11 +376,13 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); + atomic64_inc(&kvm->arch.tdp_mmu_pages); } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); + atomic64_dec(&kvm->arch.tdp_mmu_pages); } /** @@ -395,14 +398,17 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, bool shared) { tdp_unaccount_mmu_page(kvm, sp); + + if (!sp->nx_huge_page_disallowed) + return; + if (shared) spin_lock(&kvm->arch.tdp_mmu_pages_lock); else lockdep_assert_held_write(&kvm->mmu_lock); - list_del(&sp->link); - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + sp->nx_huge_page_disallowed = false; + untrack_possible_nx_huge_page(kvm, sp); if (shared) spin_unlock(&kvm->arch.tdp_mmu_pages_lock); @@ -1116,16 +1122,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set * @sp: The new TDP page table to install. - * @account_nx: True if this page table is being installed to split a - * non-executable huge page. * @shared: This operation is running under the MMU lock in read mode. * * Returns: 0 if the new page table was installed. Non-0 if the page table * could not be installed (e.g. the atomic compare-exchange failed). */ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_mmu_page *sp, bool account_nx, - bool shared) + struct kvm_mmu_page *sp, bool shared) { u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); int ret = 0; @@ -1138,16 +1141,14 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, tdp_mmu_set_spte(kvm, iter, spte); } - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - list_add(&sp->link, &kvm->arch.tdp_mmu_pages); - if (account_nx) - account_huge_nx_page(kvm, sp); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); tdp_account_mmu_page(kvm, sp); return 0; } +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *sp, bool shared); + /* * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. @@ -1155,9 +1156,10 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu *mmu = vcpu->arch.mmu; + struct kvm *kvm = vcpu->kvm; struct tdp_iter iter; struct kvm_mmu_page *sp; - int ret; + int ret = RET_PF_RETRY; kvm_mmu_hugepage_adjust(vcpu, fault); @@ -1166,6 +1168,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) rcu_read_lock(); tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { + int r; + if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); @@ -1173,57 +1177,52 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) break; /* - * If there is an SPTE mapping a large page at a higher level - * than the target, that SPTE must be cleared and replaced - * with a non-leaf SPTE. + * If SPTE has been frozen by another thread, just give up and + * retry, avoiding unnecessary page table allocation and free. */ - if (is_shadow_present_pte(iter.old_spte) && - is_large_pte(iter.old_spte)) { - if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) - break; + if (is_removed_spte(iter.old_spte)) + goto retry; - /* - * The iter must explicitly re-read the spte here - * because the new value informs the !present - * path below. - */ - iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep); - } + /* Step down into the lower level page table if it exists. */ + if (is_shadow_present_pte(iter.old_spte) && + !is_large_pte(iter.old_spte)) + continue; - if (!is_shadow_present_pte(iter.old_spte)) { - bool account_nx = fault->huge_page_disallowed && - fault->req_level >= iter.level; + /* + * The SPTE is either non-present or points to a huge page that + * needs to be split. + */ + sp = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_child_sp(sp, &iter); - /* - * If SPTE has been frozen by another thread, just - * give up and retry, avoiding unnecessary page table - * allocation and free. - */ - if (is_removed_spte(iter.old_spte)) - break; + sp->nx_huge_page_disallowed = fault->huge_page_disallowed; - sp = tdp_mmu_alloc_sp(vcpu); - tdp_mmu_init_child_sp(sp, &iter); + if (is_shadow_present_pte(iter.old_spte)) + r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); + else + r = tdp_mmu_link_sp(kvm, &iter, sp, true); - if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { - tdp_mmu_free_sp(sp); - break; - } + /* + * Also force the guest to retry the access if the upper level SPTEs + * aren't in place. + */ + if (r) { + tdp_mmu_free_sp(sp); + goto retry; } - } - /* - * Force the guest to retry the access if the upper level SPTEs aren't - * in place, or if the target leaf SPTE is frozen by another CPU. - */ - if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) { - rcu_read_unlock(); - return RET_PF_RETRY; + if (fault->huge_page_disallowed && + fault->req_level >= iter.level) { + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + track_possible_nx_huge_page(kvm, sp); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + } } ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); - rcu_read_unlock(); +retry: + rcu_read_unlock(); return ret; } @@ -1472,6 +1471,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, return sp; } +/* Note, the caller is responsible for initializing @sp. */ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) { @@ -1479,8 +1479,6 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, const int level = iter->level; int ret, i; - tdp_mmu_init_child_sp(sp, iter); - /* * No need for atomics when writing to sp->spt since the page table has * not been linked in yet and thus is not reachable from any other CPU. @@ -1496,7 +1494,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * correctness standpoint since the translation will be the same either * way. */ - ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); + ret = tdp_mmu_link_sp(kvm, iter, sp, shared); if (ret) goto out; @@ -1556,6 +1554,8 @@ retry: continue; } + tdp_mmu_init_child_sp(sp, &iter); + if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) goto retry; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index c163f7cc23ca..d3714200b932 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -5,6 +5,8 @@ #include <linux/kvm_host.h> +#include "spte.h" + hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index de1fd7369736..684393c22105 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -101,10 +101,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) struct kvm_pmu *pmu = pmc_to_pmu(pmc); bool skip_pmi = false; - /* Ignore counters that have been reprogrammed already. */ - if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) - return; - if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { if (!in_pmi) { /* @@ -122,7 +118,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) } else { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); if (!pmc->intr || skip_pmi) return; @@ -147,12 +142,22 @@ static void kvm_perf_overflow(struct perf_event *perf_event, { struct kvm_pmc *pmc = perf_event->overflow_handler_context; + /* + * Ignore overflow events for counters that are scheduled to be + * reprogrammed, e.g. if a PMI for the previous event races with KVM's + * handling of a related guest WRMSR. + */ + if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi)) + return; + __kvm_perf_overflow(pmc, true); + + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } -static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, - u64 config, bool exclude_user, - bool exclude_kernel, bool intr) +static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, + bool exclude_user, bool exclude_kernel, + bool intr) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; @@ -204,14 +209,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, if (IS_ERR(event)) { pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n", PTR_ERR(event), pmc->idx); - return; + return PTR_ERR(event); } pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; - clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; pmc->intr = intr || pebs; + return 0; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -245,7 +250,6 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) perf_event_enable(pmc->perf_event); pmc->is_paused = false; - clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; } @@ -293,7 +297,7 @@ out: return allow_event; } -void reprogram_counter(struct kvm_pmc *pmc) +static void reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 eventsel = pmc->eventsel; @@ -303,10 +307,13 @@ void reprogram_counter(struct kvm_pmc *pmc) pmc_pause_counter(pmc); if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) - return; + goto reprogram_complete; if (!check_pmu_event_filter(pmc)) - return; + goto reprogram_complete; + + if (pmc->counter < pmc->prev_counter) + __kvm_perf_overflow(pmc, false); if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) printk_once("kvm pmu: pin control bit is ignored\n"); @@ -324,18 +331,29 @@ void reprogram_counter(struct kvm_pmc *pmc) } if (pmc->current_config == new_config && pmc_resume_counter(pmc)) - return; + goto reprogram_complete; pmc_release_perf_event(pmc); pmc->current_config = new_config; - pmc_reprogram_counter(pmc, PERF_TYPE_RAW, - (eventsel & pmu->raw_event_mask), - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); + + /* + * If reprogramming fails, e.g. due to contention, leave the counter's + * regprogram bit set, i.e. opportunistically try again on the next PMU + * refresh. Don't make a new request as doing so can stall the guest + * if reprogramming repeatedly fails. + */ + if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT)) + return; + +reprogram_complete: + clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); + pmc->prev_counter = 0; } -EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { @@ -345,10 +363,11 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit); - if (unlikely(!pmc || !pmc->perf_event)) { + if (unlikely(!pmc)) { clear_bit(bit, pmu->reprogram_pmi); continue; } + reprogram_counter(pmc); } @@ -522,14 +541,9 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - u64 prev_count; - - prev_count = pmc->counter; + pmc->prev_counter = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - - reprogram_counter(pmc); - if (pmc->counter < prev_count) - __kvm_perf_overflow(pmc, false); + kvm_pmu_request_counter_reprogam(pmc); } static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, @@ -542,12 +556,15 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, static inline bool cpl_is_matched(struct kvm_pmc *pmc) { bool select_os, select_user; - u64 config = pmc->current_config; + u64 config; if (pmc_is_gp(pmc)) { + config = pmc->eventsel; select_os = config & ARCH_PERFMON_EVENTSEL_OS; select_user = config & ARCH_PERFMON_EVENTSEL_USR; } else { + config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); select_os = config & 0x1; select_user = config & 0x2; } @@ -577,6 +594,8 @@ EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { struct kvm_pmu_event_filter tmp, *filter; + struct kvm_vcpu *vcpu; + unsigned long i; size_t size; int r; @@ -613,9 +632,18 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); + synchronize_srcu_expedited(&kvm->srcu); + + BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) > + sizeof(((struct kvm_pmu *)0)->__reprogram_pmi)); + + kvm_for_each_vcpu(i, vcpu, kvm) + atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull); + + kvm_make_all_cpus_request(kvm, KVM_REQ_PMU); + mutex_unlock(&kvm->lock); - synchronize_srcu_expedited(&kvm->srcu); r = 0; cleanup: kfree(filter); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 5cc5721f260b..85ff3c0588ba 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -183,7 +183,11 @@ static inline void kvm_init_pmu_capability(void) KVM_PMC_MAX_FIXED); } -void reprogram_counter(struct kvm_pmc *pmc); +static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc) +{ + set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); +} void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c new file mode 100644 index 000000000000..a9c1c2af8d94 --- /dev/null +++ b/arch/x86/kvm/smm.c @@ -0,0 +1,649 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/kvm_host.h> +#include "x86.h" +#include "kvm_cache_regs.h" +#include "kvm_emulate.h" +#include "smm.h" +#include "cpuid.h" +#include "trace.h" + +#define CHECK_SMRAM32_OFFSET(field, offset) \ + ASSERT_STRUCT_OFFSET(struct kvm_smram_state_32, field, offset - 0xFE00) + +#define CHECK_SMRAM64_OFFSET(field, offset) \ + ASSERT_STRUCT_OFFSET(struct kvm_smram_state_64, field, offset - 0xFE00) + +static void check_smram_offsets(void) +{ + /* 32 bit SMRAM image */ + CHECK_SMRAM32_OFFSET(reserved1, 0xFE00); + CHECK_SMRAM32_OFFSET(smbase, 0xFEF8); + CHECK_SMRAM32_OFFSET(smm_revision, 0xFEFC); + CHECK_SMRAM32_OFFSET(io_inst_restart, 0xFF00); + CHECK_SMRAM32_OFFSET(auto_hlt_restart, 0xFF02); + CHECK_SMRAM32_OFFSET(io_restart_rdi, 0xFF04); + CHECK_SMRAM32_OFFSET(io_restart_rcx, 0xFF08); + CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C); + CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10); + CHECK_SMRAM32_OFFSET(cr4, 0xFF14); + CHECK_SMRAM32_OFFSET(reserved2, 0xFF18); + CHECK_SMRAM32_OFFSET(int_shadow, 0xFF1A); + CHECK_SMRAM32_OFFSET(reserved3, 0xFF1B); + CHECK_SMRAM32_OFFSET(ds, 0xFF2C); + CHECK_SMRAM32_OFFSET(fs, 0xFF38); + CHECK_SMRAM32_OFFSET(gs, 0xFF44); + CHECK_SMRAM32_OFFSET(idtr, 0xFF50); + CHECK_SMRAM32_OFFSET(tr, 0xFF5C); + CHECK_SMRAM32_OFFSET(gdtr, 0xFF6C); + CHECK_SMRAM32_OFFSET(ldtr, 0xFF78); + CHECK_SMRAM32_OFFSET(es, 0xFF84); + CHECK_SMRAM32_OFFSET(cs, 0xFF90); + CHECK_SMRAM32_OFFSET(ss, 0xFF9C); + CHECK_SMRAM32_OFFSET(es_sel, 0xFFA8); + CHECK_SMRAM32_OFFSET(cs_sel, 0xFFAC); + CHECK_SMRAM32_OFFSET(ss_sel, 0xFFB0); + CHECK_SMRAM32_OFFSET(ds_sel, 0xFFB4); + CHECK_SMRAM32_OFFSET(fs_sel, 0xFFB8); + CHECK_SMRAM32_OFFSET(gs_sel, 0xFFBC); + CHECK_SMRAM32_OFFSET(ldtr_sel, 0xFFC0); + CHECK_SMRAM32_OFFSET(tr_sel, 0xFFC4); + CHECK_SMRAM32_OFFSET(dr7, 0xFFC8); + CHECK_SMRAM32_OFFSET(dr6, 0xFFCC); + CHECK_SMRAM32_OFFSET(gprs, 0xFFD0); + CHECK_SMRAM32_OFFSET(eip, 0xFFF0); + CHECK_SMRAM32_OFFSET(eflags, 0xFFF4); + CHECK_SMRAM32_OFFSET(cr3, 0xFFF8); + CHECK_SMRAM32_OFFSET(cr0, 0xFFFC); + + /* 64 bit SMRAM image */ + CHECK_SMRAM64_OFFSET(es, 0xFE00); + CHECK_SMRAM64_OFFSET(cs, 0xFE10); + CHECK_SMRAM64_OFFSET(ss, 0xFE20); + CHECK_SMRAM64_OFFSET(ds, 0xFE30); + CHECK_SMRAM64_OFFSET(fs, 0xFE40); + CHECK_SMRAM64_OFFSET(gs, 0xFE50); + CHECK_SMRAM64_OFFSET(gdtr, 0xFE60); + CHECK_SMRAM64_OFFSET(ldtr, 0xFE70); + CHECK_SMRAM64_OFFSET(idtr, 0xFE80); + CHECK_SMRAM64_OFFSET(tr, 0xFE90); + CHECK_SMRAM64_OFFSET(io_restart_rip, 0xFEA0); + CHECK_SMRAM64_OFFSET(io_restart_rcx, 0xFEA8); + CHECK_SMRAM64_OFFSET(io_restart_rsi, 0xFEB0); + CHECK_SMRAM64_OFFSET(io_restart_rdi, 0xFEB8); + CHECK_SMRAM64_OFFSET(io_restart_dword, 0xFEC0); + CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4); + CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8); + CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9); + CHECK_SMRAM64_OFFSET(amd_nmi_mask, 0xFECA); + CHECK_SMRAM64_OFFSET(int_shadow, 0xFECB); + CHECK_SMRAM64_OFFSET(reserved2, 0xFECC); + CHECK_SMRAM64_OFFSET(efer, 0xFED0); + CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8); + CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0); + CHECK_SMRAM64_OFFSET(svm_guest_virtual_int, 0xFEE8); + CHECK_SMRAM64_OFFSET(reserved3, 0xFEF0); + CHECK_SMRAM64_OFFSET(smm_revison, 0xFEFC); + CHECK_SMRAM64_OFFSET(smbase, 0xFF00); + CHECK_SMRAM64_OFFSET(reserved4, 0xFF04); + CHECK_SMRAM64_OFFSET(ssp, 0xFF18); + CHECK_SMRAM64_OFFSET(svm_guest_pat, 0xFF20); + CHECK_SMRAM64_OFFSET(svm_host_efer, 0xFF28); + CHECK_SMRAM64_OFFSET(svm_host_cr4, 0xFF30); + CHECK_SMRAM64_OFFSET(svm_host_cr3, 0xFF38); + CHECK_SMRAM64_OFFSET(svm_host_cr0, 0xFF40); + CHECK_SMRAM64_OFFSET(cr4, 0xFF48); + CHECK_SMRAM64_OFFSET(cr3, 0xFF50); + CHECK_SMRAM64_OFFSET(cr0, 0xFF58); + CHECK_SMRAM64_OFFSET(dr7, 0xFF60); + CHECK_SMRAM64_OFFSET(dr6, 0xFF68); + CHECK_SMRAM64_OFFSET(rflags, 0xFF70); + CHECK_SMRAM64_OFFSET(rip, 0xFF78); + CHECK_SMRAM64_OFFSET(gprs, 0xFF80); + + BUILD_BUG_ON(sizeof(union kvm_smram) != 512); +} + +#undef CHECK_SMRAM64_OFFSET +#undef CHECK_SMRAM32_OFFSET + + +void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) +{ + BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); + + trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); + + if (entering_smm) { + vcpu->arch.hflags |= HF_SMM_MASK; + } else { + vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); + + /* Process a latched INIT or SMI, if any. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, + * on SMM exit we still need to reload them from + * guest memory + */ + vcpu->arch.pdptrs_from_userspace = false; + } + + kvm_mmu_reset_context(vcpu); +} + +void process_smi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.smi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + +static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) +{ + u32 flags = 0; + flags |= seg->g << 23; + flags |= seg->db << 22; + flags |= seg->l << 21; + flags |= seg->avl << 20; + flags |= seg->present << 15; + flags |= seg->dpl << 13; + flags |= seg->s << 12; + flags |= seg->type << 8; + return flags; +} + +static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, + struct kvm_smm_seg_state_32 *state, + u32 *selector, int n) +{ + struct kvm_segment seg; + + kvm_get_segment(vcpu, &seg, n); + *selector = seg.selector; + state->base = seg.base; + state->limit = seg.limit; + state->flags = enter_smm_get_segment_flags(&seg); +} + +#ifdef CONFIG_X86_64 +static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, + struct kvm_smm_seg_state_64 *state, + int n) +{ + struct kvm_segment seg; + + kvm_get_segment(vcpu, &seg, n); + state->selector = seg.selector; + state->attributes = enter_smm_get_segment_flags(&seg) >> 8; + state->limit = seg.limit; + state->base = seg.base; +} +#endif + +static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, + struct kvm_smram_state_32 *smram) +{ + struct desc_ptr dt; + unsigned long val; + int i; + + smram->cr0 = kvm_read_cr0(vcpu); + smram->cr3 = kvm_read_cr3(vcpu); + smram->eflags = kvm_get_rflags(vcpu); + smram->eip = kvm_rip_read(vcpu); + + for (i = 0; i < 8; i++) + smram->gprs[i] = kvm_register_read_raw(vcpu, i); + + kvm_get_dr(vcpu, 6, &val); + smram->dr6 = (u32)val; + kvm_get_dr(vcpu, 7, &val); + smram->dr7 = (u32)val; + + enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR); + enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR); + + static_call(kvm_x86_get_gdt)(vcpu, &dt); + smram->gdtr.base = dt.address; + smram->gdtr.limit = dt.size; + + static_call(kvm_x86_get_idt)(vcpu, &dt); + smram->idtr.base = dt.address; + smram->idtr.limit = dt.size; + + enter_smm_save_seg_32(vcpu, &smram->es, &smram->es_sel, VCPU_SREG_ES); + enter_smm_save_seg_32(vcpu, &smram->cs, &smram->cs_sel, VCPU_SREG_CS); + enter_smm_save_seg_32(vcpu, &smram->ss, &smram->ss_sel, VCPU_SREG_SS); + + enter_smm_save_seg_32(vcpu, &smram->ds, &smram->ds_sel, VCPU_SREG_DS); + enter_smm_save_seg_32(vcpu, &smram->fs, &smram->fs_sel, VCPU_SREG_FS); + enter_smm_save_seg_32(vcpu, &smram->gs, &smram->gs_sel, VCPU_SREG_GS); + + smram->cr4 = kvm_read_cr4(vcpu); + smram->smm_revision = 0x00020000; + smram->smbase = vcpu->arch.smbase; + + smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); +} + +#ifdef CONFIG_X86_64 +static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, + struct kvm_smram_state_64 *smram) +{ + struct desc_ptr dt; + unsigned long val; + int i; + + for (i = 0; i < 16; i++) + smram->gprs[15 - i] = kvm_register_read_raw(vcpu, i); + + smram->rip = kvm_rip_read(vcpu); + smram->rflags = kvm_get_rflags(vcpu); + + + kvm_get_dr(vcpu, 6, &val); + smram->dr6 = val; + kvm_get_dr(vcpu, 7, &val); + smram->dr7 = val; + + smram->cr0 = kvm_read_cr0(vcpu); + smram->cr3 = kvm_read_cr3(vcpu); + smram->cr4 = kvm_read_cr4(vcpu); + + smram->smbase = vcpu->arch.smbase; + smram->smm_revison = 0x00020064; + + smram->efer = vcpu->arch.efer; + + enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR); + + static_call(kvm_x86_get_idt)(vcpu, &dt); + smram->idtr.limit = dt.size; + smram->idtr.base = dt.address; + + enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR); + + static_call(kvm_x86_get_gdt)(vcpu, &dt); + smram->gdtr.limit = dt.size; + smram->gdtr.base = dt.address; + + enter_smm_save_seg_64(vcpu, &smram->es, VCPU_SREG_ES); + enter_smm_save_seg_64(vcpu, &smram->cs, VCPU_SREG_CS); + enter_smm_save_seg_64(vcpu, &smram->ss, VCPU_SREG_SS); + enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS); + enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS); + enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS); + + smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); +} +#endif + +void enter_smm(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ds; + struct desc_ptr dt; + unsigned long cr0; + union kvm_smram smram; + + check_smram_offsets(); + + memset(smram.bytes, 0, sizeof(smram.bytes)); + +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + enter_smm_save_state_64(vcpu, &smram.smram64); + else +#endif + enter_smm_save_state_32(vcpu, &smram.smram32); + + /* + * Give enter_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. leave guest mode) after we've saved the state into the + * SMM state-save area. + * + * Kill the VM in the unlikely case of failure, because the VM + * can be in undefined state in this case. + */ + if (static_call(kvm_x86_enter_smm)(vcpu, &smram)) + goto error; + + kvm_smm_changed(vcpu, true); + + if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram))) + goto error; + + if (static_call(kvm_x86_get_nmi_mask)(vcpu)) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + static_call(kvm_x86_set_nmi_mask)(vcpu, true); + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0x8000); + + static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + + cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); + static_call(kvm_x86_set_cr0)(vcpu, cr0); + vcpu->arch.cr0 = cr0; + + static_call(kvm_x86_set_cr4)(vcpu, 0); + + /* Undocumented: IDT limit is set to zero on entry to SMM. */ + dt.address = dt.size = 0; + static_call(kvm_x86_set_idt)(vcpu, &dt); + + if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1))) + goto error; + + cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; + cs.base = vcpu->arch.smbase; + + ds.selector = 0; + ds.base = 0; + + cs.limit = ds.limit = 0xffffffff; + cs.type = ds.type = 0x3; + cs.dpl = ds.dpl = 0; + cs.db = ds.db = 0; + cs.s = ds.s = 1; + cs.l = ds.l = 0; + cs.g = ds.g = 1; + cs.avl = ds.avl = 0; + cs.present = ds.present = 1; + cs.unusable = ds.unusable = 0; + cs.padding = ds.padding = 0; + + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); + kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); + +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (static_call(kvm_x86_set_efer)(vcpu, 0)) + goto error; +#endif + + kvm_update_cpuid_runtime(vcpu); + kvm_mmu_reset_context(vcpu); + return; +error: + kvm_vm_dead(vcpu->kvm); +} + +static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags) +{ + desc->g = (flags >> 23) & 1; + desc->db = (flags >> 22) & 1; + desc->l = (flags >> 21) & 1; + desc->avl = (flags >> 20) & 1; + desc->present = (flags >> 15) & 1; + desc->dpl = (flags >> 13) & 3; + desc->s = (flags >> 12) & 1; + desc->type = (flags >> 8) & 15; + + desc->unusable = !desc->present; + desc->padding = 0; +} + +static int rsm_load_seg_32(struct kvm_vcpu *vcpu, + const struct kvm_smm_seg_state_32 *state, + u16 selector, int n) +{ + struct kvm_segment desc; + + desc.selector = selector; + desc.base = state->base; + desc.limit = state->limit; + rsm_set_desc_flags(&desc, state->flags); + kvm_set_segment(vcpu, &desc, n); + return X86EMUL_CONTINUE; +} + +#ifdef CONFIG_X86_64 + +static int rsm_load_seg_64(struct kvm_vcpu *vcpu, + const struct kvm_smm_seg_state_64 *state, + int n) +{ + struct kvm_segment desc; + + desc.selector = state->selector; + rsm_set_desc_flags(&desc, state->attributes << 8); + desc.limit = state->limit; + desc.base = state->base; + kvm_set_segment(vcpu, &desc, n); + return X86EMUL_CONTINUE; +} +#endif + +static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu, + u64 cr0, u64 cr3, u64 cr4) +{ + int bad; + u64 pcid; + + /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ + pcid = 0; + if (cr4 & X86_CR4_PCIDE) { + pcid = cr3 & 0xfff; + cr3 &= ~0xfff; + } + + bad = kvm_set_cr3(vcpu, cr3); + if (bad) + return X86EMUL_UNHANDLEABLE; + + /* + * First enable PAE, long mode needs it before CR0.PG = 1 is set. + * Then enable protected mode. However, PCID cannot be enabled + * if EFER.LMA=0, so set it separately. + */ + bad = kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE); + if (bad) + return X86EMUL_UNHANDLEABLE; + + bad = kvm_set_cr0(vcpu, cr0); + if (bad) + return X86EMUL_UNHANDLEABLE; + + if (cr4 & X86_CR4_PCIDE) { + bad = kvm_set_cr4(vcpu, cr4); + if (bad) + return X86EMUL_UNHANDLEABLE; + if (pcid) { + bad = kvm_set_cr3(vcpu, cr3 | pcid); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + + } + + return X86EMUL_CONTINUE; +} + +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, + const struct kvm_smram_state_32 *smstate) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct desc_ptr dt; + int i, r; + + ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED; + ctxt->_eip = smstate->eip; + + for (i = 0; i < 8; i++) + *reg_write(ctxt, i) = smstate->gprs[i]; + + if (kvm_set_dr(vcpu, 6, smstate->dr6)) + return X86EMUL_UNHANDLEABLE; + if (kvm_set_dr(vcpu, 7, smstate->dr7)) + return X86EMUL_UNHANDLEABLE; + + rsm_load_seg_32(vcpu, &smstate->tr, smstate->tr_sel, VCPU_SREG_TR); + rsm_load_seg_32(vcpu, &smstate->ldtr, smstate->ldtr_sel, VCPU_SREG_LDTR); + + dt.address = smstate->gdtr.base; + dt.size = smstate->gdtr.limit; + static_call(kvm_x86_set_gdt)(vcpu, &dt); + + dt.address = smstate->idtr.base; + dt.size = smstate->idtr.limit; + static_call(kvm_x86_set_idt)(vcpu, &dt); + + rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES); + rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS); + rsm_load_seg_32(vcpu, &smstate->ss, smstate->ss_sel, VCPU_SREG_SS); + + rsm_load_seg_32(vcpu, &smstate->ds, smstate->ds_sel, VCPU_SREG_DS); + rsm_load_seg_32(vcpu, &smstate->fs, smstate->fs_sel, VCPU_SREG_FS); + rsm_load_seg_32(vcpu, &smstate->gs, smstate->gs_sel, VCPU_SREG_GS); + + vcpu->arch.smbase = smstate->smbase; + + r = rsm_enter_protected_mode(vcpu, smstate->cr0, + smstate->cr3, smstate->cr4); + + if (r != X86EMUL_CONTINUE) + return r; + + static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + ctxt->interruptibility = (u8)smstate->int_shadow; + + return r; +} + +#ifdef CONFIG_X86_64 +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, + const struct kvm_smram_state_64 *smstate) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct desc_ptr dt; + int i, r; + + for (i = 0; i < 16; i++) + *reg_write(ctxt, i) = smstate->gprs[15 - i]; + + ctxt->_eip = smstate->rip; + ctxt->eflags = smstate->rflags | X86_EFLAGS_FIXED; + + if (kvm_set_dr(vcpu, 6, smstate->dr6)) + return X86EMUL_UNHANDLEABLE; + if (kvm_set_dr(vcpu, 7, smstate->dr7)) + return X86EMUL_UNHANDLEABLE; + + vcpu->arch.smbase = smstate->smbase; + + if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) + return X86EMUL_UNHANDLEABLE; + + rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR); + + dt.size = smstate->idtr.limit; + dt.address = smstate->idtr.base; + static_call(kvm_x86_set_idt)(vcpu, &dt); + + rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR); + + dt.size = smstate->gdtr.limit; + dt.address = smstate->gdtr.base; + static_call(kvm_x86_set_gdt)(vcpu, &dt); + + r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4); + if (r != X86EMUL_CONTINUE) + return r; + + rsm_load_seg_64(vcpu, &smstate->es, VCPU_SREG_ES); + rsm_load_seg_64(vcpu, &smstate->cs, VCPU_SREG_CS); + rsm_load_seg_64(vcpu, &smstate->ss, VCPU_SREG_SS); + rsm_load_seg_64(vcpu, &smstate->ds, VCPU_SREG_DS); + rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS); + rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS); + + static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + ctxt->interruptibility = (u8)smstate->int_shadow; + + return X86EMUL_CONTINUE; +} +#endif + +int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + unsigned long cr0; + union kvm_smram smram; + u64 smbase; + int ret; + + smbase = vcpu->arch.smbase; + + ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, smram.bytes, sizeof(smram)); + if (ret < 0) + return X86EMUL_UNHANDLEABLE; + + if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0) + static_call(kvm_x86_set_nmi_mask)(vcpu, false); + + kvm_smm_changed(vcpu, false); + + /* + * Get back to real mode, to prepare a safe state in which to load + * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU + * supports long mode. + */ +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + struct kvm_segment cs_desc; + unsigned long cr4; + + /* Zero CR4.PCIDE before CR0.PG. */ + cr4 = kvm_read_cr4(vcpu); + if (cr4 & X86_CR4_PCIDE) + kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE); + + /* A 32-bit code segment is required to clear EFER.LMA. */ + memset(&cs_desc, 0, sizeof(cs_desc)); + cs_desc.type = 0xb; + cs_desc.s = cs_desc.g = cs_desc.present = 1; + kvm_set_segment(vcpu, &cs_desc, VCPU_SREG_CS); + } +#endif + + /* For the 64-bit case, this will clear EFER.LMA. */ + cr0 = kvm_read_cr0(vcpu); + if (cr0 & X86_CR0_PE) + kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); + +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + unsigned long cr4, efer; + + /* Clear CR4.PAE before clearing EFER.LME. */ + cr4 = kvm_read_cr4(vcpu); + if (cr4 & X86_CR4_PAE) + kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PAE); + + /* And finally go back to 32-bit mode. */ + efer = 0; + kvm_set_msr(vcpu, MSR_EFER, efer); + } +#endif + + /* + * Give leave_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. enter guest mode) before loading state from the SMM + * state-save area. + */ + if (static_call(kvm_x86_leave_smm)(vcpu, &smram)) + return X86EMUL_UNHANDLEABLE; + +#ifdef CONFIG_X86_64 + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return rsm_load_state_64(ctxt, &smram.smram64); + else +#endif + return rsm_load_state_32(ctxt, &smram.smram32); +} diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h new file mode 100644 index 000000000000..a1cf2ac5bd78 --- /dev/null +++ b/arch/x86/kvm/smm.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_KVM_SMM_H +#define ASM_KVM_SMM_H + +#include <linux/build_bug.h> + +#ifdef CONFIG_KVM_SMM + + +/* + * 32 bit KVM's emulated SMM layout. Based on Intel P6 layout + * (https://www.sandpile.org/x86/smm.htm). + */ + +struct kvm_smm_seg_state_32 { + u32 flags; + u32 limit; + u32 base; +} __packed; + +struct kvm_smram_state_32 { + u32 reserved1[62]; + u32 smbase; + u32 smm_revision; + u16 io_inst_restart; + u16 auto_hlt_restart; + u32 io_restart_rdi; + u32 io_restart_rcx; + u32 io_restart_rsi; + u32 io_restart_rip; + u32 cr4; + + /* A20M#, CPL, shutdown and other reserved/undocumented fields */ + u16 reserved2; + u8 int_shadow; /* KVM extension */ + u8 reserved3[17]; + + struct kvm_smm_seg_state_32 ds; + struct kvm_smm_seg_state_32 fs; + struct kvm_smm_seg_state_32 gs; + struct kvm_smm_seg_state_32 idtr; /* IDTR has only base and limit */ + struct kvm_smm_seg_state_32 tr; + u32 reserved; + struct kvm_smm_seg_state_32 gdtr; /* GDTR has only base and limit */ + struct kvm_smm_seg_state_32 ldtr; + struct kvm_smm_seg_state_32 es; + struct kvm_smm_seg_state_32 cs; + struct kvm_smm_seg_state_32 ss; + + u32 es_sel; + u32 cs_sel; + u32 ss_sel; + u32 ds_sel; + u32 fs_sel; + u32 gs_sel; + u32 ldtr_sel; + u32 tr_sel; + + u32 dr7; + u32 dr6; + u32 gprs[8]; /* GPRS in the "natural" X86 order (EAX/ECX/EDX.../EDI) */ + u32 eip; + u32 eflags; + u32 cr3; + u32 cr0; +} __packed; + + +/* 64 bit KVM's emulated SMM layout. Based on AMD64 layout */ + +struct kvm_smm_seg_state_64 { + u16 selector; + u16 attributes; + u32 limit; + u64 base; +}; + +struct kvm_smram_state_64 { + + struct kvm_smm_seg_state_64 es; + struct kvm_smm_seg_state_64 cs; + struct kvm_smm_seg_state_64 ss; + struct kvm_smm_seg_state_64 ds; + struct kvm_smm_seg_state_64 fs; + struct kvm_smm_seg_state_64 gs; + struct kvm_smm_seg_state_64 gdtr; /* GDTR has only base and limit*/ + struct kvm_smm_seg_state_64 ldtr; + struct kvm_smm_seg_state_64 idtr; /* IDTR has only base and limit*/ + struct kvm_smm_seg_state_64 tr; + + /* I/O restart and auto halt restart are not implemented by KVM */ + u64 io_restart_rip; + u64 io_restart_rcx; + u64 io_restart_rsi; + u64 io_restart_rdi; + u32 io_restart_dword; + u32 reserved1; + u8 io_inst_restart; + u8 auto_hlt_restart; + u8 amd_nmi_mask; /* Documented in AMD BKDG as NMI mask, not used by KVM */ + u8 int_shadow; + u32 reserved2; + + u64 efer; + + /* + * Two fields below are implemented on AMD only, to store + * SVM guest vmcb address if the #SMI was received while in the guest mode. + */ + u64 svm_guest_flag; + u64 svm_guest_vmcb_gpa; + u64 svm_guest_virtual_int; /* unknown purpose, not implemented */ + + u32 reserved3[3]; + u32 smm_revison; + u32 smbase; + u32 reserved4[5]; + + /* ssp and svm_* fields below are not implemented by KVM */ + u64 ssp; + u64 svm_guest_pat; + u64 svm_host_efer; + u64 svm_host_cr4; + u64 svm_host_cr3; + u64 svm_host_cr0; + + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u64 gprs[16]; /* GPRS in a reversed "natural" X86 order (R15/R14/../RCX/RAX.) */ +}; + +union kvm_smram { + struct kvm_smram_state_64 smram64; + struct kvm_smram_state_32 smram32; + u8 bytes[512]; +}; + +static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_SMI, vcpu); + return 0; +} + +static inline bool is_smm(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hflags & HF_SMM_MASK; +} + +void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm); +void enter_smm(struct kvm_vcpu *vcpu); +int emulator_leave_smm(struct x86_emulate_ctxt *ctxt); +void process_smi(struct kvm_vcpu *vcpu); +#else +static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; } +static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; } + +/* + * emulator_leave_smm is used as a function pointer, so the + * stub is defined in x86.c. + */ +#endif + +#endif diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 995bc0f90759..3aa9184d1e4e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -25,6 +25,7 @@ #include "trace.h" #include "mmu.h" #include "x86.h" +#include "smm.h" #include "cpuid.h" #include "lapic.h" #include "svm.h" @@ -1383,6 +1384,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return 0; } +#ifdef CONFIG_KVM_SMM if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) { if (block_nested_events) return -EBUSY; @@ -1391,6 +1393,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) nested_svm_simple_vmexit(svm, SVM_EXIT_SMI); return 0; } +#endif if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) { if (block_nested_events) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 9d65cd095691..0e313fbae055 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -159,7 +159,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } return 0; } @@ -212,7 +212,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmc *pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; + pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index efaaef2b7ae1..69dbf17f0d6a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2648,7 +2648,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) ghcb_scratch_beg = control->ghcb_gpa + offsetof(struct ghcb, shared_buffer); ghcb_scratch_end = control->ghcb_gpa + - offsetof(struct ghcb, reserved_1); + offsetof(struct ghcb, reserved_0xff0); /* * If the scratch area begins within the GHCB, it must be diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c0950ae86b2b..7efc4fdaa446 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -6,6 +6,7 @@ #include "mmu.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "smm.h" #include "cpuid.h" #include "pmu.h" @@ -2708,8 +2709,6 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; break; - case MSR_IA32_PERF_CAPABILITIES: - return 0; default: return KVM_MSR_RET_INVALID; } @@ -4102,6 +4101,8 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return false; case MSR_IA32_SMBASE: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + return false; /* SEV-ES guests do not support SMM, so report false */ if (kvm && sev_es_guest(kvm)) return false; @@ -4358,6 +4359,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } +#ifdef CONFIG_KVM_SMM bool svm_smi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4385,7 +4387,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return 1; } -static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map_save; @@ -4394,10 +4396,16 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) if (!is_guest_mode(vcpu)) return 0; - /* FED8h - SVM Guest */ - put_smstate(u64, smstate, 0x7ed8, 1); - /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); + /* + * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is + * responsible for ensuring nested SVM and SMIs are mutually exclusive. + */ + + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return 1; + + smram->smram64.svm_guest_flag = 1; + smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa; svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -4419,8 +4427,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) * that, see svm_prepare_switch_to_guest()) which must be * preserved. */ - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), - &map_save) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) return 1; BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); @@ -4432,34 +4439,33 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map, map_save; - u64 saved_efer, vmcb12_gpa; struct vmcb *vmcb12; int ret; + const struct kvm_smram_state_64 *smram64 = &smram->smram64; + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) return 0; /* Non-zero if SMI arrived while vCPU was in guest mode. */ - if (!GET_SMSTATE(u64, smstate, 0x7ed8)) + if (!smram64->svm_guest_flag) return 0; if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) return 1; - saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); - if (!(saved_efer & EFER_SVME)) + if (!(smram64->efer & EFER_SVME)) return 1; - vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map)) return 1; ret = 1; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) goto unmap_map; if (svm_allocate_nested(svm)) @@ -4481,7 +4487,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) vmcb12 = map.hva; nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false); if (ret) goto unmap_save; @@ -4507,6 +4513,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) /* We must be in SMM; RSM will cause a vmexit anyway. */ } } +#endif static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) @@ -4782,10 +4789,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .pi_update_irte = avic_pi_update_irte, .setup_mce = svm_setup_mce, +#ifdef CONFIG_KVM_SMM .smi_allowed = svm_smi_allowed, .enter_smm = svm_enter_smm, .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, +#endif .mem_enc_ioctl = sev_mem_enc_ioctl, .mem_enc_register_region = sev_mem_enc_register_region, @@ -4851,6 +4860,7 @@ static __init void svm_set_cpu_caps(void) { kvm_set_cpu_caps(); + kvm_caps.supported_perf_cap = 0; kvm_caps.supported_xss = 0; /* CPUID 0x80000001 and 0x8000000A (SVM features) */ diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 07254314f3dd..cd2ac9536c99 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -395,30 +395,6 @@ static inline bool vmx_pebs_supported(void) return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; } -static inline u64 vmx_get_perf_capabilities(void) -{ - u64 perf_cap = PMU_CAP_FW_WRITES; - struct x86_pmu_lbr lbr; - u64 host_perf_cap = 0; - - if (!enable_pmu) - return 0; - - if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - - if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr) - perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; - - if (vmx_pebs_supported()) { - perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; - if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) - perf_cap &= ~PERF_CAP_PEBS_BASELINE; - } - - return perf_cap; -} - static inline bool cpu_has_notify_vmexit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5b0d4859e4b7..7924dea93678 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -16,6 +16,7 @@ #include "trace.h" #include "vmx.h" #include "x86.h" +#include "smm.h" static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 10b33da9bd05..e5cec07ca8d9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } } @@ -76,7 +76,7 @@ static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { pmc = intel_pmc_idx_to_pmc(pmu, bit); if (pmc) - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); } } @@ -477,7 +477,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { pmc->eventsel = data; - reprogram_counter(pmc); + kvm_pmu_request_counter_reprogam(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) @@ -631,7 +631,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->fixed_counters[i].current_config = 0; } - vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); lbr_desc->records.nr = 0; lbr_desc->event = NULL; lbr_desc->msr_passthrough = false; @@ -647,14 +646,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; + pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmc = &pmu->fixed_counters[i]; pmc_stop_counter(pmc); - pmc->counter = 0; + pmc->counter = pmc->prev_counter = 0; } pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 746129ddd5ae..01936013428b 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -208,9 +208,8 @@ struct __packed vmcs12 { /* * For save/restore compatibility, the vmcs12 field offsets must not change. */ -#define CHECK_OFFSET(field, loc) \ - BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ - "Offset of " #field " in struct vmcs12 has changed.") +#define CHECK_OFFSET(field, loc) \ + ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc) static inline void vmx_check_vmcs12_offsets(void) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 63247c57c72c..aca88524fd1e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -66,6 +66,7 @@ #include "vmcs12.h" #include "vmx.h" #include "x86.h" +#include "smm.h" MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -1849,9 +1850,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); - case MSR_IA32_PERF_CAPABILITIES: - msr->data = vmx_get_perf_capabilities(); - return 0; default: return KVM_MSR_RET_INVALID; } @@ -2029,7 +2027,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; - if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) && + if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; @@ -2342,14 +2340,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (data & PMU_CAP_LBR_FMT) { if ((data & PMU_CAP_LBR_FMT) != - (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) + (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; } if (data & PERF_CAP_PEBS_FORMAT) { if ((data & PERF_CAP_PEBS_MASK) != - (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) return 1; if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) return 1; @@ -6844,6 +6842,8 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_SMBASE: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + return false; /* * We cannot do SMM unless we can run the guest in big * real mode. @@ -7669,6 +7669,31 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } +static u64 vmx_get_perf_capabilities(void) +{ + u64 perf_cap = PMU_CAP_FW_WRITES; + struct x86_pmu_lbr lbr; + u64 host_perf_cap = 0; + + if (!enable_pmu) + return 0; + + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + + x86_perf_get_lbr(&lbr); + if (lbr.nr) + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; +} + static __init void vmx_set_cpu_caps(void) { kvm_set_cpu_caps(); @@ -7691,6 +7716,7 @@ static __init void vmx_set_cpu_caps(void) if (!enable_pmu) kvm_cpu_cap_clear(X86_FEATURE_PDCM); + kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); @@ -7906,6 +7932,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEAT_CTL_LMCE_ENABLED; } +#ifdef CONFIG_KVM_SMM static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ @@ -7914,7 +7941,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !is_smm(vcpu); } -static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7935,7 +7962,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; @@ -7960,6 +7987,7 @@ static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) { /* RSM will cause a vmexit anyway. */ } +#endif static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { @@ -8127,10 +8155,12 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .setup_mce = vmx_setup_mce, +#ifdef CONFIG_KVM_SMM .smi_allowed = vmx_smi_allowed, .enter_smm = vmx_enter_smm, .leave_smm = vmx_leave_smm, .enable_smi_window = vmx_enable_smi_window, +#endif .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5384c567f68f..404325a13dc2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -30,6 +30,7 @@ #include "hyperv.h" #include "lapic.h" #include "xen.h" +#include "smm.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -119,8 +120,6 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); -static void process_smi(struct kvm_vcpu *vcpu); -static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); @@ -1654,6 +1653,9 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) case MSR_IA32_ARCH_CAPABILITIES: msr->data = kvm_get_arch_capabilities(); break; + case MSR_IA32_PERF_CAPABILITIES: + msr->data = kvm_caps.supported_perf_cap; + break; case MSR_IA32_UCODE_REV: rdmsrl_safe(msr->index, &msr->data); break; @@ -3566,20 +3568,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.arch_capabilities = data; break; - case MSR_IA32_PERF_CAPABILITIES: { - struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; - + case MSR_IA32_PERF_CAPABILITIES: if (!msr_info->host_initiated) return 1; - if (kvm_get_msr_feature(&msr_ent)) - return 1; - if (data & ~msr_ent.data) + if (data & ~kvm_caps.supported_perf_cap) return 1; vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); return 0; - } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -3651,7 +3648,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_IA32_SMBASE: - if (!msr_info->host_initiated) + if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) return 1; vcpu->arch.smbase = data; break; @@ -4067,7 +4064,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_misc_enable_msr; break; case MSR_IA32_SMBASE: - if (!msr_info->host_initiated) + if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) return 1; msr_info->data = vcpu->arch.smbase; break; @@ -4441,6 +4438,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r |= KVM_X86_DISABLE_EXITS_MWAIT; break; case KVM_CAP_X86_SMM: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + break; + /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, * so do not report SMM to be available if real mode is @@ -4897,13 +4897,6 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) return 0; } -static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_SMI, vcpu); - - return 0; -} - static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -5039,8 +5032,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, process_nmi(vcpu); +#ifdef CONFIG_KVM_SMM if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); +#endif /* * KVM's ABI only allows for one exception to be migrated. Luckily, @@ -5068,16 +5063,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, ex->pending && ex->has_payload) kvm_deliver_exception_payload(vcpu, ex); + memset(events, 0, sizeof(*events)); + /* * The API doesn't provide the instruction length for software * exceptions, so don't report them. As long as the guest RIP * isn't advanced, we should expect to encounter the exception * again. */ - if (kvm_exception_is_soft(ex->vector)) { - events->exception.injected = 0; - events->exception.pending = 0; - } else { + if (!kvm_exception_is_soft(ex->vector)) { events->exception.injected = ex->injected; events->exception.pending = ex->pending; /* @@ -5097,20 +5091,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.soft = 0; events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); - events->nmi.pad = 0; - events->sipi_vector = 0; /* never valid when reporting to user space */ + /* events->sipi_vector is never valid when reporting to user space */ +#ifdef CONFIG_KVM_SMM events->smi.smm = is_smm(vcpu); events->smi.pending = vcpu->arch.smi_pending; events->smi.smm_inside_nmi = !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); +#endif events->smi.latched_init = kvm_lapic_latched_init(vcpu); events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING @@ -5122,12 +5116,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; } - - memset(&events->reserved, 0, sizeof(events->reserved)); } -static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm); - static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { @@ -5200,6 +5190,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { +#ifdef CONFIG_KVM_SMM if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { kvm_leave_nested(vcpu); kvm_smm_changed(vcpu, events->smi.smm); @@ -5214,6 +5205,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; } +#else + if (events->smi.smm || events->smi.pending || + events->smi.smm_inside_nmi) + return -EINVAL; +#endif + if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); @@ -5580,7 +5577,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SMI: { - r = kvm_vcpu_ioctl_smi(vcpu); + r = kvm_inject_smi(vcpu); break; } case KVM_SET_CPUID: { @@ -6239,9 +6236,7 @@ split_irqchip_unlock: break; case KVM_CAP_X86_USER_SPACE_MSR: r = -EINVAL; - if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | - KVM_MSR_EXIT_REASON_UNKNOWN | - KVM_MSR_EXIT_REASON_FILTER)) + if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK) break; kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; @@ -6418,7 +6413,7 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, if (!user_range->nmsrs) return 0; - if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) + if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK) return -EINVAL; if (!user_range->flags) @@ -6452,7 +6447,7 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, int r = 0; u32 i; - if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY) + if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK) return -EINVAL; for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) @@ -7125,8 +7120,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) return handled; } -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) { static_call(kvm_x86_set_segment)(vcpu, var, seg); } @@ -7162,16 +7157,6 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); - gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) -{ - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; - access |= PFERR_FETCH_MASK; - return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); -} - gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { @@ -7284,15 +7269,6 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *val, unsigned int bytes) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); - - return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; -} - static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) @@ -8084,26 +8060,6 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); } -static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 data) -{ - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); -} - -static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - return vcpu->arch.smbase; -} - -static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - vcpu->arch.smbase = smbase; -} - static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { @@ -8178,18 +8134,13 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) return emul_to_vcpu(ctxt)->arch.hflags; } -static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt) +#ifndef CONFIG_KVM_SMM +static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - kvm_smm_changed(vcpu, false); -} - -static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt, - const char *smstate) -{ - return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate); + WARN_ON_ONCE(1); + return X86EMUL_UNHANDLEABLE; } +#endif static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) { @@ -8215,7 +8166,6 @@ static const struct x86_emulate_ops emulate_ops = { .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, .write_std = emulator_write_std, - .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, @@ -8235,11 +8185,8 @@ static const struct x86_emulate_ops emulate_ops = { .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, - .get_smbase = emulator_get_smbase, - .set_smbase = emulator_set_smbase, .set_msr_with_filter = emulator_set_msr_with_filter, .get_msr_with_filter = emulator_get_msr_with_filter, - .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .check_pmc = emulator_check_pmc, .read_pmc = emulator_read_pmc, @@ -8254,7 +8201,6 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, - .exiting_smm = emulator_exiting_smm, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, @@ -8327,8 +8273,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); - BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); - BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); ctxt->interruptibility = 0; ctxt->have_exception = false; @@ -8587,29 +8531,6 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); -static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) -{ - trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); - - if (entering_smm) { - vcpu->arch.hflags |= HF_SMM_MASK; - } else { - vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); - - /* Process a latched INIT or SMI, if any. */ - kvm_make_request(KVM_REQ_EVENT, vcpu); - - /* - * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, - * on SMM exit we still need to reload them from - * guest memory - */ - vcpu->arch.pdptrs_from_userspace = false; - } - - kvm_mmu_reset_context(vcpu); -} - static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -9999,6 +9920,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, * in order to make progress and get back here for another iteration. * The kvm_x86_ops hooks communicate this by returning -EBUSY. */ +#ifdef CONFIG_KVM_SMM if (vcpu->arch.smi_pending) { r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; if (r < 0) @@ -10011,6 +9933,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, } else static_call(kvm_x86_enable_smi_window)(vcpu); } +#endif if (vcpu->arch.nmi_pending) { r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; @@ -10086,246 +10009,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) -{ - u32 flags = 0; - flags |= seg->g << 23; - flags |= seg->db << 22; - flags |= seg->l << 21; - flags |= seg->avl << 20; - flags |= seg->present << 15; - flags |= seg->dpl << 13; - flags |= seg->s << 12; - flags |= seg->type << 8; - return flags; -} - -static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) -{ - struct kvm_segment seg; - int offset; - - kvm_get_segment(vcpu, &seg, n); - put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector); - - if (n < 3) - offset = 0x7f84 + n * 12; - else - offset = 0x7f2c + (n - 3) * 12; - - put_smstate(u32, buf, offset + 8, seg.base); - put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg)); -} - -#ifdef CONFIG_X86_64 -static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) -{ - struct kvm_segment seg; - int offset; - u16 flags; - - kvm_get_segment(vcpu, &seg, n); - offset = 0x7e00 + n * 16; - - flags = enter_smm_get_segment_flags(&seg) >> 8; - put_smstate(u16, buf, offset, seg.selector); - put_smstate(u16, buf, offset + 2, flags); - put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u64, buf, offset + 8, seg.base); -} -#endif - -static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) -{ - struct desc_ptr dt; - struct kvm_segment seg; - unsigned long val; - int i; - - put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); - put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); - put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); - put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); - - for (i = 0; i < 8; i++) - put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); - - kvm_get_dr(vcpu, 6, &val); - put_smstate(u32, buf, 0x7fcc, (u32)val); - kvm_get_dr(vcpu, 7, &val); - put_smstate(u32, buf, 0x7fc8, (u32)val); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - put_smstate(u32, buf, 0x7fc4, seg.selector); - put_smstate(u32, buf, 0x7f64, seg.base); - put_smstate(u32, buf, 0x7f60, seg.limit); - put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - put_smstate(u32, buf, 0x7fc0, seg.selector); - put_smstate(u32, buf, 0x7f80, seg.base); - put_smstate(u32, buf, 0x7f7c, seg.limit); - put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); - - static_call(kvm_x86_get_gdt)(vcpu, &dt); - put_smstate(u32, buf, 0x7f74, dt.address); - put_smstate(u32, buf, 0x7f70, dt.size); - - static_call(kvm_x86_get_idt)(vcpu, &dt); - put_smstate(u32, buf, 0x7f58, dt.address); - put_smstate(u32, buf, 0x7f54, dt.size); - - for (i = 0; i < 6; i++) - enter_smm_save_seg_32(vcpu, buf, i); - - put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); - - /* revision id */ - put_smstate(u32, buf, 0x7efc, 0x00020000); - put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); -} - -#ifdef CONFIG_X86_64 -static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) -{ - struct desc_ptr dt; - struct kvm_segment seg; - unsigned long val; - int i; - - for (i = 0; i < 16; i++) - put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); - - put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); - put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); - - kvm_get_dr(vcpu, 6, &val); - put_smstate(u64, buf, 0x7f68, val); - kvm_get_dr(vcpu, 7, &val); - put_smstate(u64, buf, 0x7f60, val); - - put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); - put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); - put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); - - put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase); - - /* revision id */ - put_smstate(u32, buf, 0x7efc, 0x00020064); - - put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - put_smstate(u16, buf, 0x7e90, seg.selector); - put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); - put_smstate(u32, buf, 0x7e94, seg.limit); - put_smstate(u64, buf, 0x7e98, seg.base); - - static_call(kvm_x86_get_idt)(vcpu, &dt); - put_smstate(u32, buf, 0x7e84, dt.size); - put_smstate(u64, buf, 0x7e88, dt.address); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - put_smstate(u16, buf, 0x7e70, seg.selector); - put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); - put_smstate(u32, buf, 0x7e74, seg.limit); - put_smstate(u64, buf, 0x7e78, seg.base); - - static_call(kvm_x86_get_gdt)(vcpu, &dt); - put_smstate(u32, buf, 0x7e64, dt.size); - put_smstate(u64, buf, 0x7e68, dt.address); - - for (i = 0; i < 6; i++) - enter_smm_save_seg_64(vcpu, buf, i); -} -#endif - -static void enter_smm(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs, ds; - struct desc_ptr dt; - unsigned long cr0; - char buf[512]; - - memset(buf, 0, 512); -#ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - enter_smm_save_state_64(vcpu, buf); - else -#endif - enter_smm_save_state_32(vcpu, buf); - - /* - * Give enter_smm() a chance to make ISA-specific changes to the vCPU - * state (e.g. leave guest mode) after we've saved the state into the - * SMM state-save area. - */ - static_call(kvm_x86_enter_smm)(vcpu, buf); - - kvm_smm_changed(vcpu, true); - kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); - - if (static_call(kvm_x86_get_nmi_mask)(vcpu)) - vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; - else - static_call(kvm_x86_set_nmi_mask)(vcpu, true); - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0x8000); - - cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - static_call(kvm_x86_set_cr0)(vcpu, cr0); - vcpu->arch.cr0 = cr0; - - static_call(kvm_x86_set_cr4)(vcpu, 0); - - /* Undocumented: IDT limit is set to zero on entry to SMM. */ - dt.address = dt.size = 0; - static_call(kvm_x86_set_idt)(vcpu, &dt); - - kvm_set_dr(vcpu, 7, DR7_FIXED_1); - - cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; - cs.base = vcpu->arch.smbase; - - ds.selector = 0; - ds.base = 0; - - cs.limit = ds.limit = 0xffffffff; - cs.type = ds.type = 0x3; - cs.dpl = ds.dpl = 0; - cs.db = ds.db = 0; - cs.s = ds.s = 1; - cs.l = ds.l = 0; - cs.g = ds.g = 1; - cs.avl = ds.avl = 0; - cs.present = ds.present = 1; - cs.unusable = ds.unusable = 0; - cs.padding = ds.padding = 0; - - kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); - kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); - -#ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - static_call(kvm_x86_set_efer)(vcpu, 0); -#endif - - kvm_update_cpuid_runtime(vcpu); - kvm_mmu_reset_context(vcpu); -} - -static void process_smi(struct kvm_vcpu *vcpu) -{ - vcpu->arch.smi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); -} - void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap) { @@ -10585,8 +10268,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); +#ifdef CONFIG_KVM_SMM if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); +#endif if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) @@ -11900,6 +11585,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; kvm_async_pf_hash_reset(vcpu); + + vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; @@ -12909,10 +12596,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) static_call(kvm_x86_nmi_allowed)(vcpu, false))) return true; +#ifdef CONFIG_KVM_SMM if (kvm_test_request(KVM_REQ_SMI, vcpu) || (vcpu->arch.smi_pending && static_call(kvm_x86_smi_allowed)(vcpu, false))) return true; +#endif if (kvm_arch_interrupt_allowed(vcpu) && (kvm_cpu_has_interrupt(vcpu) || @@ -12953,7 +12642,9 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) return true; if (kvm_test_request(KVM_REQ_NMI, vcpu) || +#ifdef CONFIG_KVM_SMM kvm_test_request(KVM_REQ_SMI, vcpu) || +#endif kvm_test_request(KVM_REQ_EVENT, vcpu)) return true; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 829d3134c1eb..9de72586f406 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -27,6 +27,7 @@ struct kvm_caps { u64 supported_mce_cap; u64 supported_xcr0; u64 supported_xss; + u64 supported_perf_cap; }; void kvm_spurious_fault(void); |