diff options
Diffstat (limited to 'arch')
37 files changed, 787 insertions, 389 deletions
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index dfd2c124cdf8..aa53d042b943 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6348,6 +6348,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; x86_pmu.extra_regs = intel_spr_extra_regs; x86_pmu.limit_period = spr_limit_period; + x86_pmu.pebs_ept = 1; x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 88e58b6ee73c..d8a404b91b7e 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2303,8 +2303,10 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; - case 4: case 5: + x86_pmu.pebs_ept = 1; + fallthrough; + case 4: x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl; x86_pmu.pebs_record_size = sizeof(struct pebs_basic); if (x86_pmu.intel_cap.pebs_baseline) { diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 61012476d66e..cdb7e1492311 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -312,6 +312,9 @@ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ #define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ +#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ +#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ +#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4d2bc08794e4..792a6037047a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -514,6 +514,7 @@ struct kvm_pmc { #define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define KVM_PMC_MAX_FIXED 3 +#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) #define KVM_AMD_PMC_MAX_GENERIC 6 struct kvm_pmu { unsigned nr_arch_gp_counters; @@ -678,6 +679,11 @@ struct kvm_vcpu_hv { } nested; }; +struct kvm_hypervisor_cpuid { + u32 base; + u32 limit; +}; + /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -698,6 +704,7 @@ struct kvm_vcpu_xen { struct hrtimer timer; int poll_evtchn; struct timer_list poll_timer; + struct kvm_hypervisor_cpuid cpuid; }; struct kvm_queued_exception { @@ -826,7 +833,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; - u32 kvm_cpuid_base; + struct kvm_hypervisor_cpuid kvm_cpuid; u64 reserved_gpa_bits; int maxphyaddr; @@ -1145,6 +1152,18 @@ struct kvm_x86_msr_filter { struct msr_bitmap_range ranges[16]; }; +struct kvm_x86_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 nr_includes; + __u32 nr_excludes; + __u64 *includes; + __u64 *excludes; + __u64 events[]; +}; + enum kvm_apicv_inhibit { /********************************************************************/ @@ -1327,7 +1346,6 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; - int cpu_dirty_logging_count; enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; @@ -1363,7 +1381,7 @@ struct kvm_arch { /* Guest can access the SGX PROVISIONKEY. */ bool sgx_provisioning_allowed; - struct kvm_pmu_event_filter __rcu *pmu_event_filter; + struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 @@ -2074,14 +2092,11 @@ enum { TASK_SWITCH_GATE = 3, }; -#define HF_GIF_MASK (1 << 0) -#define HF_NMI_MASK (1 << 3) -#define HF_IRET_MASK (1 << 4) -#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define HF_GUEST_MASK (1 << 0) /* VCPU is in guest-mode */ #ifdef CONFIG_KVM_SMM -#define HF_SMM_MASK (1 << 6) -#define HF_SMM_INSIDE_NMI_MASK (1 << 7) +#define HF_SMM_MASK (1 << 1) +#define HF_SMM_INSIDE_NMI_MASK (1 << 2) # define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE # define KVM_ADDRESS_SPACE_NUM 2 diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 04c17be9b5fd..bc5b4d788c08 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,6 +25,8 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +void cpu_emergency_disable_virtualization(void); + typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_panic_self_stop(struct pt_regs *regs); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 8757078d4442..3b12e6b99412 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -126,7 +126,21 @@ static inline void cpu_svm_disable(void) wrmsrl(MSR_VM_HSAVE_PA, 0); rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + if (efer & EFER_SVME) { + /* + * Force GIF=1 prior to disabling SVM to ensure INIT and NMI + * aren't blocked, e.g. if a fatal error occurred between CLGI + * and STGI. Note, STGI may #UD if SVM is disabled from NMI + * context between reading EFER and executing STGI. In that + * case, GIF must already be set, otherwise the NMI would have + * been blocked, so just eat the fault. + */ + asm_volatile_goto("1: stgi\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "memory" : fault); +fault: + wrmsrl(MSR_EFER, efer & ~EFER_SVME); + } } /** Makes sure SVM is disabled, if it is supported on the CPU diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 16f548a661cf..5fc35f889cd1 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -38,9 +38,11 @@ extern struct start_info *xen_start_info; #include <asm/processor.h> +#define XEN_SIGNATURE "XenVMMXenVMM" + static inline uint32_t xen_cpuid_base(void) { - return hypervisor_cpuid_base("XenVMMXenVMM", 2); + return hypervisor_cpuid_base(XEN_SIGNATURE, 2); } struct pci_dev; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index e48deab8901d..7f467fe05d42 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/ioctl.h> +#include <linux/stddef.h> #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 @@ -507,8 +508,8 @@ struct kvm_nested_state { * KVM_{GET,PUT}_NESTED_STATE ioctl values. */ union { - struct kvm_vmx_nested_state_data vmx[0]; - struct kvm_svm_nested_state_data svm[0]; + __DECLARE_FLEX_ARRAY(struct kvm_vmx_nested_state_data, vmx); + __DECLARE_FLEX_ARRAY(struct kvm_svm_nested_state_data, svm); } data; }; @@ -525,6 +526,35 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) +#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) + +/* + * Masked event layout. + * Bits Description + * ---- ----------- + * 7:0 event select (low bits) + * 15:8 umask match + * 31:16 unused + * 35:32 event select (high bits) + * 36:54 unused + * 55 exclude bit + * 63:56 umask mask + */ + +#define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \ + (((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \ + (((mask) & 0xFFULL) << 56) | \ + (((match) & 0xFFULL) << 8) | \ + ((__u64)(!!(exclude)) << 55)) + +#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ + (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) + /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 305514431f26..cdd92ab43cda 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -37,7 +37,6 @@ #include <linux/kdebug.h> #include <asm/cpu.h> #include <asm/reboot.h> -#include <asm/virtext.h> #include <asm/intel_pt.h> #include <asm/crash.h> #include <asm/cmdline.h> @@ -81,15 +80,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) */ cpu_crash_vmclear_loaded_vmcss(); - /* Disable VMX or SVM if needed. - * - * We need to disable virtualization on all CPUs. - * Having VMX or SVM enabled on any CPU may break rebooting - * after the kdump kernel has finished its task. - */ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); - /* * Disable Intel PT to stop its logging */ @@ -148,12 +138,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) */ cpu_crash_vmclear_loaded_vmcss(); - /* Booting kdump kernel with VMX or SVM enabled won't work, - * because (among other limitations) we can't disable paging - * with the virt flags. - */ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); + cpu_emergency_disable_virtualization(); /* * Disable Intel PT to stop its logging diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index c3636ea4aa71..d03c551defcc 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -528,33 +528,29 @@ static inline void kb_wait(void) } } -static void vmxoff_nmi(int cpu, struct pt_regs *regs) -{ - cpu_emergency_vmxoff(); -} +static inline void nmi_shootdown_cpus_on_restart(void); -/* Use NMIs as IPIs to tell all CPUs to disable virtualization */ -static void emergency_vmx_disable_all(void) +static void emergency_reboot_disable_virtualization(void) { /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); /* - * Disable VMX on all CPUs before rebooting, otherwise we risk hanging - * the machine, because the CPU blocks INIT when it's in VMX root. + * Disable virtualization on all CPUs before rebooting to avoid hanging + * the system, as VMX and SVM block INIT when running in the host. * * We can't take any locks and we may be on an inconsistent state, so - * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt. + * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt. * - * Do the NMI shootdown even if VMX if off on _this_ CPU, as that - * doesn't prevent a different CPU from being in VMX root operation. + * Do the NMI shootdown even if virtualization is off on _this_ CPU, as + * other CPUs may have virtualization enabled. */ - if (cpu_has_vmx()) { - /* Safely force _this_ CPU out of VMX root operation. */ - __cpu_emergency_vmxoff(); + if (cpu_has_vmx() || cpu_has_svm(NULL)) { + /* Safely force _this_ CPU out of VMX/SVM operation. */ + cpu_emergency_disable_virtualization(); - /* Halt and exit VMX root operation on the other CPUs. */ - nmi_shootdown_cpus(vmxoff_nmi); + /* Disable VMX/SVM and halt on other CPUs. */ + nmi_shootdown_cpus_on_restart(); } } @@ -590,7 +586,7 @@ static void native_machine_emergency_restart(void) unsigned short mode; if (reboot_emergency) - emergency_vmx_disable_all(); + emergency_reboot_disable_virtualization(); tboot_shutdown(TB_SHUTDOWN_REBOOT); @@ -795,6 +791,17 @@ void machine_crash_shutdown(struct pt_regs *regs) /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); +} + #if defined(CONFIG_SMP) static nmi_shootdown_cb shootdown_callback; @@ -817,7 +824,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; local_irq_disable(); - shootdown_callback(cpu, regs); + if (shootdown_callback) + shootdown_callback(cpu, regs); + + /* + * Prepare the CPU for reboot _after_ invoking the callback so that the + * callback can safely use virtualization instructions, e.g. VMCLEAR. + */ + cpu_emergency_disable_virtualization(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -828,18 +842,32 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; } -/* - * Halt all other CPUs, calling the specified function on each of them +/** + * nmi_shootdown_cpus - Stop other CPUs via NMI + * @callback: Optional callback to be invoked from the NMI handler + * + * The NMI handler on the remote CPUs invokes @callback, if not + * NULL, first and then disables virtualization to ensure that + * INIT is recognized during reboot. * - * This function can be used to halt all other CPUs on crash - * or emergency reboot time. The function passed as parameter - * will be called inside a NMI handler on all CPUs. + * nmi_shootdown_cpus() can only be invoked once. After the first + * invocation all other CPUs are stuck in crash_nmi_callback() and + * cannot respond to a second NMI. */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { unsigned long msecs; + local_irq_disable(); + /* + * Avoid certain doom if a shootdown already occurred; re-registering + * the NMI handler will cause list corruption, modifying the callback + * will do who knows what, etc... + */ + if (WARN_ON_ONCE(crash_ipi_issued)) + return; + /* Make a note of crashing cpu. Will be used in NMI callback. */ crashing_cpu = safe_smp_processor_id(); @@ -867,7 +895,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) msecs--; } - /* Leave the nmi callback set */ + /* + * Leave the nmi callback set, shootdown is a one-time thing. Clearing + * the callback could result in a NULL pointer dereference if a CPU + * (finally) responds after the timeout expires. + */ +} + +static inline void nmi_shootdown_cpus_on_restart(void) +{ + if (!crash_ipi_issued) + nmi_shootdown_cpus(NULL); } /* @@ -897,6 +935,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* No other CPUs to shoot down */ } +static inline void nmi_shootdown_cpus_on_restart(void) { } + void run_crash_ipi_callback(struct pt_regs *regs) { } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 06db901fabe8..375b33ecafa2 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -32,7 +32,7 @@ #include <asm/mce.h> #include <asm/trace/irq_vectors.h> #include <asm/kexec.h> -#include <asm/virtext.h> +#include <asm/reboot.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -122,7 +122,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) return NMI_HANDLED; - cpu_emergency_vmxoff(); + cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); return NMI_HANDLED; @@ -134,7 +134,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { ack_APIC_irq(); - cpu_emergency_vmxoff(); + cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2a9f1e200dbc..8f8edeaf8177 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -26,6 +26,7 @@ #include "mmu.h" #include "trace.h" #include "pmu.h" +#include "xen.h" /* * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be @@ -181,15 +182,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 return 0; } -static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) +static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, + const char *sig) { - u32 function; + struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; + u32 base; - vcpu->arch.kvm_cpuid_base = 0; - - for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function); + for_each_possible_hypervisor_cpuid_base(base) { + entry = kvm_find_cpuid_entry(vcpu, base); if (entry) { u32 signature[3]; @@ -198,19 +199,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) signature[1] = entry->ecx; signature[2] = entry->edx; - BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE)); - if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) { - vcpu->arch.kvm_cpuid_base = function; + if (!memcmp(signature, sig, sizeof(signature))) { + cpuid.base = base; + cpuid.limit = entry->eax; break; } } } + + return cpuid; } static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { - u32 base = vcpu->arch.kvm_cpuid_base; + u32 base = vcpu->arch.kvm_cpuid.base; if (!base) return NULL; @@ -440,7 +443,8 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_entries = e2; vcpu->arch.cpuid_nent = nent; - kvm_update_kvm_cpuid_base(vcpu); + vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); + vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE); kvm_vcpu_after_set_cpuid(vcpu); return 0; @@ -664,8 +668,9 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) | - F(AVX_IFMA) + F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | + F(FZRM) | F(FSRS) | F(FSRC) | + F(AMX_FP16) | F(AVX_IFMA) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c3443045cd93..baf97c56aefa 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1634,7 +1634,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_SS: /* * segment is not a writable data segment or segment - * selector's RPL != CPL or segment selector's RPL != CPL + * selector's RPL != CPL or DPL != CPL */ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) goto exception; @@ -1696,11 +1696,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* * segment is not a data or readable code segment or * ((segment is a data or nonconforming code segment) - * and (both RPL and CPL > DPL)) + * and ((RPL > DPL) or (CPL > DPL))) */ if ((seg_desc.type & 0xa) == 0x8 || (((seg_desc.type & 0xc) != 0xc) && - (rpl > dpl && cpl > dpl))) + (rpl > dpl || cpl > dpl))) goto exception; break; } @@ -2310,7 +2310,7 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) static int em_rsm(struct x86_emulate_ctxt *ctxt) { - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) + if (!ctxt->ops->is_smm(ctxt)) return emulate_ud(ctxt); if (ctxt->ops->leave_smm(ctxt)) @@ -5133,7 +5133,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; - unsigned emul_flags; + bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt); ctxt->mem_read.pos = 0; @@ -5148,7 +5148,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - emul_flags = ctxt->ops->get_hflags(ctxt); if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || @@ -5182,7 +5181,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(&ctxt->dst); } - if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { + if (unlikely(is_guest_mode) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5211,7 +5210,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5265,7 +5264,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 71aff0edc0ed..b28fd020066f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -44,6 +44,24 @@ #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK) +/* + * As per Hyper-V TLFS, extended hypercalls start from 0x8001 + * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value + * where each bit tells which extended hypercall is available besides + * HvExtCallQueryCapabilities. + * + * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit + * assigned. + * + * 0x8002 - Bit 0 + * 0x8003 - Bit 1 + * .. + * 0x8041 - Bit 63 + * + * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64 + */ +#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64) + static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); @@ -1430,8 +1448,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; @@ -1552,8 +1569,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } @@ -1608,7 +1624,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } @@ -1673,7 +1689,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, data = APIC_BUS_FREQUENCY; break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; @@ -2439,6 +2455,9 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) case HVCALL_SEND_IPI: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_CLUSTER_IPI_RECOMMENDED; + case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: + return hv_vcpu->cpuid_cache.features_ebx & + HV_ENABLE_EXTENDED_HYPERCALLS; default: break; } @@ -2531,14 +2550,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = hc.param; - vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; - vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; - vcpu->arch.complete_userspace_io = - kvm_hv_hypercall_complete_userspace; - return 0; + goto hypercall_userspace_exit; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -2597,15 +2609,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) ret = HV_STATUS_OPERATION_DENIED; break; } - vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = hc.param; - vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; - vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; - vcpu->arch.complete_userspace_io = - kvm_hv_hypercall_complete_userspace; - return 0; + goto hypercall_userspace_exit; } + case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: + if (unlikely(hc.fast)) { + ret = HV_STATUS_INVALID_PARAMETER; + break; + } + goto hypercall_userspace_exit; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; @@ -2613,6 +2624,15 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); + +hypercall_userspace_exit: + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; + vcpu->run->hyperv.u.hcall.input = hc.param; + vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; + vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; + vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; + return 0; } void kvm_hv_init_vm(struct kvm *kvm) @@ -2756,6 +2776,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; + ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 2d9662be8333..ab65f3a47dfd 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -220,7 +220,8 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); - unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); + bool (*is_smm)(struct x86_emulate_ctxt *ctxt); + bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); @@ -275,10 +276,6 @@ enum x86emul_mode { X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ }; -/* These match some of the HF_* flags defined in kvm_host.h */ -#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ -#define X86EMUL_SMM_MASK (1 << 6) - /* * fastop functions are declared as taking a never-defined fastop parameter, * so they can't be called from C directly. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index aeb240b339f5..c91ee2927dd7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -44,6 +44,7 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> +#include <linux/kstrtox.h> #include <linux/kthread.h> #include <asm/page.h> @@ -269,6 +270,17 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); + +/* Flush the range of guest memory mapped by the given SPTE. */ +static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep) +{ + struct kvm_mmu_page *sp = sptep_to_sp(sptep); + gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep)); + + kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); +} + static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { @@ -813,7 +825,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) - kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K); } void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -1187,8 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) drop_spte(kvm, sptep); if (flush) - kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_sptep(kvm, sptep); } /* @@ -1469,7 +1480,7 @@ restart: } if (need_flush && kvm_available_flush_tlb_with_range()) { - kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + kvm_flush_remote_tlbs_gfn(kvm, gfn, level); return false; } @@ -1639,8 +1650,7 @@ static void __rmap_add(struct kvm *kvm, kvm->stat.max_mmu_rmap_size = rmap_count; if (rmap_count > RMAP_RECYCLE_THRESHOLD) { kvm_zap_all_rmap_sptes(kvm, rmap_head); - kvm_flush_remote_tlbs_with_address( - kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); } } @@ -2405,7 +2415,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, return; drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1); + kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); } } @@ -2889,8 +2899,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } if (flush) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); + kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); pgprintk("%s: setting spte %llx\n", __func__, *sptep); @@ -3169,7 +3178,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; @@ -4440,7 +4449,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = fault->gfn & ~(page_num - 1); + gfn_t base = gfn_round_for_level(fault->gfn, + fault->max_level); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; @@ -4556,10 +4566,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) struct kvm_mmu *mmu = vcpu->arch.mmu; union kvm_mmu_page_role new_role = mmu->root_role; - if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) { - /* kvm_mmu_ensure_valid_pgd will set up a new root. */ + /* + * Return immediately if no usable root was found, kvm_mmu_reload() + * will establish a valid root prior to the next VM-Enter. + */ + if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) return; - } /* * It's possible that the cached previous root page is obsolete because @@ -6518,8 +6530,7 @@ restart: kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) - kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_sptep(kvm, sptep); else need_tlb_flush = 1; @@ -6752,7 +6763,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) new_val = 1; else if (sysfs_streq(val, "auto")) new_val = get_nx_auto_mode(); - else if (strtobool(val, &new_val) < 0) + else if (kstrtobool(val, &new_val) < 0) return -EINVAL; __set_nx_huge_pages(new_val); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ac00bfbf32f6..cc58631e2336 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -156,6 +156,11 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; } +static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) +{ + return gfn & -KVM_PAGES_PER_HPAGE(level); +} + int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool can_unsync, bool prefetch); @@ -164,8 +169,17 @@ void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); + void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); + +/* Flush the given page (huge or not) of guest memory. */ +static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level) +{ + kvm_flush_remote_tlbs_with_address(kvm, gfn_round_for_level(gfn, level), + KVM_PAGES_PER_HPAGE(level)); +} + unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index e5662dbd519c..57f0b75c80f9 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -642,12 +642,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) goto out_gpte_changed; - for (shadow_walk_init(&it, vcpu, fault->addr); - shadow_walk_okay(&it) && it.level > gw->level; - shadow_walk_next(&it)) { + for_each_shadow_entry(vcpu, fault->addr, it) { gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); + if (it.level == gw->level) + break; table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; @@ -692,8 +692,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, trace_kvm_mmu_spte_requested(fault); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { - clear_sp_write_flooding_count(it.sptep); - /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. @@ -701,7 +699,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; @@ -929,8 +927,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); if (is_shadow_present_pte(old_spte)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, - sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); if (!rmap_can_add(vcpu)) break; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index fce6f047399f..c15bfca3ed15 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -147,9 +147,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, WARN_ON_ONCE(!pte_access && !shadow_present_mask); if (sp->role.ad_disabled) - spte |= SPTE_TDP_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED; else if (kvm_mmu_page_ad_need_write_protect(sp)) - spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; + spte |= SPTE_TDP_AD_WRPROT_ONLY; /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -317,7 +317,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) shadow_user_mask | shadow_x_mask | shadow_me_value; if (ad_disabled) - spte |= SPTE_TDP_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED; else spte |= shadow_accessed_mask; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 0d8deefee66c..1279db2eab44 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -28,10 +28,10 @@ */ #define SPTE_TDP_AD_SHIFT 52 #define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT) -static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); +#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT) +static_assert(SPTE_TDP_AD_ENABLED == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK #define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) @@ -164,7 +164,7 @@ extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; /* - * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK; + * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED; * shadow_acc_track_mask is the set of bits to be cleared in non-accessed * pages. */ @@ -266,18 +266,18 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(!is_shadow_present_pte(spte)); - return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK; + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED; } static inline bool spte_ad_need_write_protect(u64 spte) { MMU_WARN_ON(!is_shadow_present_pte(spte)); /* - * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0', + * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0', * and non-TDP SPTEs will never set these bits. Optimize for 64-bit * TDP and do the A/D type check unconditionally. */ - return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK; + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; } static inline u64 spte_shadow_accessed_mask(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index e26e744df1d1..d2eb0d4f8710 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -16,11 +16,6 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } -static gfn_t round_gfn_for_level(gfn_t gfn, int level) -{ - return gfn & -KVM_PAGES_PER_HPAGE(level); -} - /* * Return the TDP iterator to the root PT and allow it to continue its * traversal over the paging structure from there. @@ -31,7 +26,7 @@ void tdp_iter_restart(struct tdp_iter *iter) iter->yielded_gfn = iter->next_last_level_gfn; iter->level = iter->root_level; - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); iter->valid = true; @@ -98,7 +93,7 @@ static bool try_step_down(struct tdp_iter *iter) iter->level--; iter->pt_path[iter->level - 1] = child_pt; - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; @@ -140,7 +135,7 @@ static bool try_step_up(struct tdp_iter *iter) return false; iter->level++; - iter->gfn = round_gfn_for_level(iter->gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index bba33aea0fb0..7c25dbf32ecc 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -680,8 +680,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, if (ret) return ret; - kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, - KVM_PAGES_PER_HPAGE(iter->level)); + kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); /* * No other thread can overwrite the removed SPTE as they must either @@ -1080,8 +1079,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, return RET_PF_RETRY; else if (is_shadow_present_pte(iter->old_spte) && !is_last_spte(iter->old_spte, iter->level)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(iter->level + 1)); + kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); /* * If the page fault was caused by a write but the page is write diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index d939d3b84e6f..612e6c70ce2e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -29,9 +29,18 @@ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); -static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { +/* Precise Distribution of Instructions Retired (PDIR) */ +static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + /* Instruction-Accurate PDIR (PDIR++) */ + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), + {} +}; + +/* Precise Distribution (PDist) */ +static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), {} }; @@ -156,6 +165,28 @@ static void kvm_perf_overflow(struct perf_event *perf_event, kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } +static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc) +{ + /* + * For some model specific pebs counters with special capabilities + * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise + * level to the maximum value (currently 3, backwards compatible) + * so that the perf subsystem would assign specific hardware counter + * with that capability for vPMC. + */ + if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) || + (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu))) + return 3; + + /* + * The non-zero precision level of guest event makes the ordinary + * guest event becomes a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + */ + return 1; +} + static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) @@ -187,22 +218,12 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, } if (pebs) { /* - * The non-zero precision level of guest event makes the ordinary - * guest event becomes a guest PEBS event and triggers the host - * PEBS PMI handler to determine whether the PEBS overflow PMI - * comes from the host counters or the guest. - * * For most PEBS hardware events, the difference in the software * precision levels of guest and host PEBS events will not affect * the accuracy of the PEBS profiling result, because the "event IP" * in the PEBS record is calibrated on the guest side. - * - * On Icelake everything is fine. Other hardware (GLC+, TNT+) that - * could possibly care here is unsupported and needs changes. */ - attr.precise_ip = 1; - if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32) - attr.precise_ip = 3; + attr.precise_ip = pmc_get_pebs_precise_level(pmc); } event = perf_event_create_kernel_counter(&attr, -1, current, @@ -255,48 +276,128 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } -static int cmp_u64(const void *pa, const void *pb) +static int filter_cmp(const void *pa, const void *pb, u64 mask) { - u64 a = *(u64 *)pa; - u64 b = *(u64 *)pb; + u64 a = *(u64 *)pa & mask; + u64 b = *(u64 *)pb & mask; return (a > b) - (a < b); } + +static int filter_sort_cmp(const void *pa, const void *pb) +{ + return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT | + KVM_PMU_MASKED_ENTRY_EXCLUDE)); +} + +/* + * For the event filter, searching is done on the 'includes' list and + * 'excludes' list separately rather than on the 'events' list (which + * has both). As a result the exclude bit can be ignored. + */ +static int filter_event_cmp(const void *pa, const void *pb) +{ + return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT)); +} + +static int find_filter_index(u64 *events, u64 nevents, u64 key) +{ + u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]), + filter_event_cmp); + + if (!fe) + return -1; + + return fe - events; +} + +static bool is_filter_entry_match(u64 filter_event, u64 umask) +{ + u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8); + u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH; + + BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >> + (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) != + ARCH_PERFMON_EVENTSEL_UMASK); + + return (umask & mask) == match; +} + +static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel) +{ + u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT; + u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK; + int i, index; + + index = find_filter_index(events, nevents, event_select); + if (index < 0) + return false; + + /* + * Entries are sorted by the event select. Walk the list in both + * directions to process all entries with the targeted event select. + */ + for (i = index; i < nevents; i++) { + if (filter_event_cmp(&events[i], &event_select)) + break; + + if (is_filter_entry_match(events[i], umask)) + return true; + } + + for (i = index - 1; i >= 0; i--) { + if (filter_event_cmp(&events[i], &event_select)) + break; + + if (is_filter_entry_match(events[i], umask)) + return true; + } + + return false; +} + +static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f, + u64 eventsel) +{ + if (filter_contains_match(f->includes, f->nr_includes, eventsel) && + !filter_contains_match(f->excludes, f->nr_excludes, eventsel)) + return f->action == KVM_PMU_EVENT_ALLOW; + + return f->action == KVM_PMU_EVENT_DENY; +} + +static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, + int idx) +{ + int fixed_idx = idx - INTEL_PMC_IDX_FIXED; + + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) + return false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) + return false; + + return true; +} + static bool check_pmu_event_filter(struct kvm_pmc *pmc) { - struct kvm_pmu_event_filter *filter; + struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; - bool allow_event = true; - __u64 key; - int idx; if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) return false; filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (!filter) - goto out; + return true; - if (pmc_is_gp(pmc)) { - key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB; - if (bsearch(&key, filter->events, filter->nevents, - sizeof(__u64), cmp_u64)) - allow_event = filter->action == KVM_PMU_EVENT_ALLOW; - else - allow_event = filter->action == KVM_PMU_EVENT_DENY; - } else { - idx = pmc->idx - INTEL_PMC_IDX_FIXED; - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - allow_event = false; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - allow_event = false; - } + if (pmc_is_gp(pmc)) + return is_gp_event_allowed(filter, pmc->eventsel); -out: - return allow_event; + return is_fixed_event_allowed(filter, pmc->idx); } static void reprogram_counter(struct kvm_pmc *pmc) @@ -593,47 +694,133 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) } EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); +static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) +{ + u64 mask = kvm_pmu_ops.EVENTSEL_EVENT | + KVM_PMU_MASKED_ENTRY_UMASK_MASK | + KVM_PMU_MASKED_ENTRY_UMASK_MATCH | + KVM_PMU_MASKED_ENTRY_EXCLUDE; + int i; + + for (i = 0; i < filter->nevents; i++) { + if (filter->events[i] & ~mask) + return false; + } + + return true; +} + +static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter) +{ + int i, j; + + for (i = 0, j = 0; i < filter->nevents; i++) { + /* + * Skip events that are impossible to match against a guest + * event. When filtering, only the event select + unit mask + * of the guest event is used. To maintain backwards + * compatibility, impossible filters can't be rejected :-( + */ + if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT | + ARCH_PERFMON_EVENTSEL_UMASK)) + continue; + /* + * Convert userspace events to a common in-kernel event so + * only one code path is needed to support both events. For + * the in-kernel events use masked events because they are + * flexible enough to handle both cases. To convert to masked + * events all that's needed is to add an "all ones" umask_mask, + * (unmasked filter events don't support EXCLUDE). + */ + filter->events[j++] = filter->events[i] | + (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT); + } + + filter->nevents = j; +} + +static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter) +{ + int i; + + if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)) + convert_to_masked_filter(filter); + else if (!is_masked_filter_valid(filter)) + return -EINVAL; + + /* + * Sort entries by event select and includes vs. excludes so that all + * entries for a given event select can be processed efficiently during + * filtering. The EXCLUDE flag uses a more significant bit than the + * event select, and so the sorted list is also effectively split into + * includes and excludes sub-lists. + */ + sort(&filter->events, filter->nevents, sizeof(filter->events[0]), + filter_sort_cmp, NULL); + + i = filter->nevents; + /* Find the first EXCLUDE event (only supported for masked events). */ + if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) { + for (i = 0; i < filter->nevents; i++) { + if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE) + break; + } + } + + filter->nr_includes = i; + filter->nr_excludes = filter->nevents - filter->nr_includes; + filter->includes = filter->events; + filter->excludes = filter->events + filter->nr_includes; + + return 0; +} + int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { - struct kvm_pmu_event_filter tmp, *filter; + struct kvm_pmu_event_filter __user *user_filter = argp; + struct kvm_x86_pmu_event_filter *filter; + struct kvm_pmu_event_filter tmp; struct kvm_vcpu *vcpu; unsigned long i; size_t size; int r; - if (copy_from_user(&tmp, argp, sizeof(tmp))) + if (copy_from_user(&tmp, user_filter, sizeof(tmp))) return -EFAULT; if (tmp.action != KVM_PMU_EVENT_ALLOW && tmp.action != KVM_PMU_EVENT_DENY) return -EINVAL; - if (tmp.flags != 0) + if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK) return -EINVAL; if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS) return -E2BIG; size = struct_size(filter, events, tmp.nevents); - filter = kmalloc(size, GFP_KERNEL_ACCOUNT); + filter = kzalloc(size, GFP_KERNEL_ACCOUNT); if (!filter) return -ENOMEM; + filter->action = tmp.action; + filter->nevents = tmp.nevents; + filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap; + filter->flags = tmp.flags; + r = -EFAULT; - if (copy_from_user(filter, argp, size)) + if (copy_from_user(filter->events, user_filter->events, + sizeof(filter->events[0]) * filter->nevents)) goto cleanup; - /* Ensure nevents can't be changed between the user copies. */ - *filter = tmp; - - /* - * Sort the in-kernel list so that we can search it with bsearch. - */ - sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); + r = prepare_filter_lists(filter); + if (r) + goto cleanup; mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); + mutex_unlock(&kvm->lock); synchronize_srcu_expedited(&kvm->srcu); BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) > @@ -644,8 +831,6 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) kvm_make_all_cpus_request(kvm, KVM_REQ_PMU); - mutex_unlock(&kvm->lock); - r = 0; cleanup: kfree(filter); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index cdb91009701d..79988dafb15b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -18,12 +18,6 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 -struct kvm_event_hw_type_mapping { - u8 eventsel; - u8 unit_mask; - unsigned event_type; -}; - struct kvm_pmu_ops { bool (*hw_event_available)(struct kvm_pmc *pmc); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); @@ -40,6 +34,9 @@ struct kvm_pmu_ops { void (*reset)(struct kvm_vcpu *vcpu); void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + + const u64 EVENTSEL_EVENT; + const int MAX_NR_GP_COUNTERS; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); @@ -161,7 +158,7 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; -static inline void kvm_init_pmu_capability(void) +static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; @@ -180,6 +177,8 @@ static inline void kvm_init_pmu_capability(void) } kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, KVM_PMC_MAX_FIXED); } diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index cc43638d48a3..b42111a24cc2 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -111,8 +111,6 @@ static void check_smram_offsets(void) void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) { - BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); - trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); if (entering_smm) { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b3928150a37c..ca684979e90d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1120,7 +1120,7 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) * - Hypervisor can support both xAVIC and x2AVIC in the same guest. * - The mode can be switched at run-time. */ -bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) +bool avic_hardware_setup(void) { if (!npt_enabled) return false; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 700df66d23c7..05d38944a6c0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1008,7 +1008,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; - vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 1ff068f23841..cc77a0681800 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -231,4 +231,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .refresh = amd_pmu_refresh, .init = amd_pmu_init, .reset = amd_pmu_reset, + .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, + .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 273cba809328..c25aeb550cd9 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -813,7 +813,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, if (!IS_ALIGNED(dst_paddr, 16) || !IS_ALIGNED(paddr, 16) || !IS_ALIGNED(size, 16)) { - tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO); + tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!tpage) return -ENOMEM; @@ -1294,7 +1294,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Check if we are crossing the page boundary */ offset = params.guest_uaddr & (PAGE_SIZE - 1); - if ((params.guest_len + offset > PAGE_SIZE)) + if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) return -EINVAL; /* Pin guest memory */ @@ -1474,7 +1474,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Check if we are crossing the page boundary */ offset = params.guest_uaddr & (PAGE_SIZE - 1); - if ((params.guest_len + offset > PAGE_SIZE)) + if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) return -EINVAL; hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d13cf53e7390..b43775490074 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1338,6 +1338,9 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.microcode_version = 0x01000065; svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; + svm->nmi_masked = false; + svm->awaiting_iret_completion = false; + if (sev_es_guest(vcpu->kvm)) sev_es_vcpu_reset(svm); } @@ -2482,7 +2485,7 @@ static int iret_interception(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); ++vcpu->stat.nmi_window_exits; - vcpu->arch.hflags |= HF_IRET_MASK; + svm->awaiting_iret_completion = true; if (!sev_es_guest(vcpu->kvm)) { svm_clr_intercept(svm, INTERCEPT_IRET); svm->nmi_iret_rip = kvm_rip_read(vcpu); @@ -3015,8 +3018,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_DEBUGCTLMSR: if (!lbrv) { - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } if (data & DEBUGCTL_RESERVED_BITS) @@ -3045,7 +3047,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; case MSR_AMD64_DE_CFG: { struct kvm_msr_entry msr_entry; @@ -3478,7 +3480,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) if (svm->nmi_l1_to_l2) return; - vcpu->arch.hflags |= HF_NMI_MASK; + svm->nmi_masked = true; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; @@ -3583,7 +3585,6 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - bool ret; if (!gif_set(svm)) return true; @@ -3591,10 +3592,8 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return false; - ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (vcpu->arch.hflags & HF_NMI_MASK); - - return ret; + return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || + svm->nmi_masked; } static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) @@ -3614,7 +3613,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) { - return !!(vcpu->arch.hflags & HF_NMI_MASK); + return to_svm(vcpu)->nmi_masked; } static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) @@ -3622,11 +3621,11 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) struct vcpu_svm *svm = to_svm(vcpu); if (masked) { - vcpu->arch.hflags |= HF_NMI_MASK; + svm->nmi_masked = true; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); } else { - vcpu->arch.hflags &= ~HF_NMI_MASK; + svm->nmi_masked = false; if (!sev_es_guest(vcpu->kvm)) svm_clr_intercept(svm, INTERCEPT_IRET); } @@ -3712,7 +3711,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) + if (svm->nmi_masked && !svm->awaiting_iret_completion) return; /* IRET will cause a vm exit */ if (!gif_set(svm)) { @@ -3836,10 +3835,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) * If we've made progress since setting HF_IRET_MASK, we've * executed an IRET and can allow NMI injection. */ - if ((vcpu->arch.hflags & HF_IRET_MASK) && + if (svm->awaiting_iret_completion && (sev_es_guest(vcpu->kvm) || kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { - vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + svm->awaiting_iret_completion = false; + svm->nmi_masked = false; kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -5020,7 +5020,7 @@ static __init int svm_hardware_setup(void) nrips = false; } - enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops); + enable_apicv = avic = avic && avic_hardware_setup(); if (!enable_apicv) { svm_x86_ops.vcpu_blocking = NULL; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 41eabb098b13..839809972da1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -230,8 +230,26 @@ struct vcpu_svm { struct svm_nested_state nested; + /* NMI mask value, used when vNMI is not enabled */ + bool nmi_masked; + + /* + * True when NMIs are still masked but guest IRET was just intercepted + * and KVM is waiting for RIP to change, which will signal that the + * intercepted IRET was retired and thus NMI can be unmasked. + */ + bool awaiting_iret_completion; + + /* + * Set when KVM is awaiting IRET completion and needs to inject NMIs as + * soon as the IRET completes (e.g. NMI is pending injection). KVM + * temporarily steals RFLAGS.TF to single-step the guest in this case + * in order to regain control as soon as the NMI-blocking condition + * goes away. + */ bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; + bool nmi_l1_to_l2; unsigned long soft_int_csbase; @@ -273,6 +291,9 @@ struct vcpu_svm { bool guest_state_loaded; bool x2avic_msrs_intercepted; + + /* Guest GIF value, used when vGIF is not enabled */ + bool guest_gif; }; struct svm_cpu_data { @@ -490,7 +511,7 @@ static inline void enable_gif(struct vcpu_svm *svm) if (vmcb) vmcb->control.int_ctl |= V_GIF_MASK; else - svm->vcpu.arch.hflags |= HF_GIF_MASK; + svm->guest_gif = true; } static inline void disable_gif(struct vcpu_svm *svm) @@ -500,7 +521,7 @@ static inline void disable_gif(struct vcpu_svm *svm) if (vmcb) vmcb->control.int_ctl &= ~V_GIF_MASK; else - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + svm->guest_gif = false; } static inline bool gif_set(struct vcpu_svm *svm) @@ -510,7 +531,7 @@ static inline bool gif_set(struct vcpu_svm *svm) if (vmcb) return !!(vmcb->control.int_ctl & V_GIF_MASK); else - return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); + return svm->guest_gif; } static inline bool nested_npt_enabled(struct vcpu_svm *svm) @@ -637,7 +658,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \ ) -bool avic_hardware_setup(struct kvm_x86_ops *ops); +bool avic_hardware_setup(void); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index efce9ad70e4e..e8a3be0b9df9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -22,16 +22,19 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) -static struct kvm_event_hw_type_mapping intel_arch_events[] = { - [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, - [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, - [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, - [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, +static struct { + u8 eventsel; + u8 unit_mask; +} const intel_arch_events[] = { + [0] = { 0x3c, 0x00 }, + [1] = { 0xc0, 0x00 }, + [2] = { 0x3c, 0x01 }, + [3] = { 0x2e, 0x4f }, + [4] = { 0x2e, 0x41 }, + [5] = { 0xc4, 0x00 }, + [6] = { 0xc5, 0x00 }, /* The above index must match CPUID 0x0A.EBX bit vector */ - [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, + [7] = { 0x00, 0x03 }, }; /* mapping between fixed pmc index and intel_arch_events array */ @@ -811,4 +814,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .reset = intel_pmu_reset, .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, + .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, + .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, }; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 33614ee2cd67..0f82c247e1fd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2206,9 +2206,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); } @@ -4617,7 +4615,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) * it needs to be set here when dirty logging is already active, e.g. * if this vCPU was created after dirty logging was enabled. */ - if (!vcpu->kvm->arch.cpu_dirty_logging_count) + if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; if (cpu_has_vmx_xsaves()) { @@ -8004,17 +8002,20 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (WARN_ON_ONCE(!enable_pml)) + return; + if (is_guest_mode(vcpu)) { vmx->nested.update_vmcs01_cpu_dirty_logging = true; return; } /* - * Note, cpu_dirty_logging_count can be changed concurrent with this + * Note, nr_memslots_dirty_logging can be changed concurrent with this * code, but in that case another update request will be made and so * the guest will never run with a stale PML value. */ - if (vcpu->kvm->arch.cpu_dirty_logging_count) + if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); else secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 508074e47bc0..f706621c35b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1419,7 +1419,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); * may depend on host virtualization features rather than host cpu features. */ -static const u32 msrs_to_save_all[] = { +static const u32 msrs_to_save_base[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1436,6 +1436,10 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, +}; + +static const u32 msrs_to_save_pmu[] = { MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, @@ -1460,11 +1464,10 @@ static const u32 msrs_to_save_all[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - - MSR_IA32_XFD, MSR_IA32_XFD_ERR, }; -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; static unsigned num_msrs_to_save; static const u32 emulated_msrs_all[] = { @@ -3161,6 +3164,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; + kvm_xen_update_tsc_info(v); } vcpu->hv_clock.tsc_timestamp = tsc_timestamp; @@ -3558,9 +3562,20 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } +static bool kvm_is_msr_to_save(u32 msr_index) +{ + unsigned int i; + + for (i = 0; i < num_msrs_to_save; i++) { + if (msrs_to_save[i] == msr_index) + return true; + } + + return false; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - bool pr = false; u32 msr = msr_info->index; u64 data = msr_info->data; @@ -3606,15 +3621,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data == BIT_ULL(18)) { vcpu->arch.msr_hwcr = data; } else if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; @@ -3794,16 +3807,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: - pr = true; - fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (pr || data != 0) - vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + if (data) + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_K7_CLK_CTL: /* @@ -3831,9 +3841,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) @@ -3881,20 +3889,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif - case MSR_IA32_PEBS_ENABLE: - case MSR_IA32_DS_AREA: - case MSR_PEBS_DATA_CFG: - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); + /* * Userspace is allowed to write '0' to MSRs that KVM reports * as to-be-saved, even if an MSRs isn't fully supported. */ - return !msr_info->host_initiated || data; - default: - if (kvm_pmu_is_valid_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr_info); + if (msr_info->host_initiated && !data && + kvm_is_msr_to_save(msr)) + break; + return KVM_MSR_RET_INVALID; } return 0; @@ -3984,20 +3990,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; - case MSR_IA32_PEBS_ENABLE: - case MSR_IA32_DS_AREA: - case MSR_PEBS_DATA_CFG: - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info); - /* - * Userspace is allowed to read MSRs that KVM reports as - * to-be-saved, even if an MSR isn't fully supported. - */ - if (!msr_info->host_initiated) - return 1; - msr_info->data = 0; - break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: @@ -4253,6 +4245,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + + /* + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. + */ + if (msr_info->host_initiated && + kvm_is_msr_to_save(msr_info->index)) { + msr_info->data = 0; + break; + } + return KVM_MSR_RET_INVALID; } return 0; @@ -4290,8 +4293,8 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, { struct kvm_msrs msrs; struct kvm_msr_entry *entries; - int r, n; unsigned size; + int r; r = -EFAULT; if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) @@ -4308,17 +4311,11 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, goto out; } - r = n = __msr_io(vcpu, &msrs, entries, do_msr); - if (r < 0) - goto out_free; + r = __msr_io(vcpu, &msrs, entries, do_msr); - r = -EFAULT; if (writeback && copy_to_user(user_msrs->entries, entries, size)) - goto out_free; - - r = n; + r = -EFAULT; -out_free: kfree(entries); out: return r; @@ -4406,6 +4403,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_PMU_EVENT_FILTER: + case KVM_CAP_PMU_EVENT_MASKED_EVENTS: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: @@ -6465,7 +6463,7 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, struct kvm_x86_msr_filter *new_filter, *old_filter; bool default_allow; bool empty = true; - int r = 0; + int r; u32 i; if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK) @@ -6491,17 +6489,14 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, } mutex_lock(&kvm->lock); - - /* The per-VM filter is protected by kvm->lock... */ - old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); - - rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter, + mutex_is_locked(&kvm->lock)); + mutex_unlock(&kvm->lock); synchronize_srcu(&kvm->srcu); kvm_free_msr_filter(old_filter); kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); - mutex_unlock(&kvm->lock); return 0; } @@ -7001,83 +6996,98 @@ out: return r; } -static void kvm_init_msr_list(void) +static void kvm_probe_msr_to_save(u32 msr_index) { u32 dummy[2]; + + if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) + return; + + /* + * Even MSRs that are valid in the host may not be exposed to guests in + * some cases. + */ + switch (msr_index) { + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported()) + return; + break; + case MSR_TSC_AUX: + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) + return; + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + return; + break; + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) + return; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) + return; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) + return; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (msr_index - MSR_IA32_RTIT_ADDR0_A >= + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) + return; + break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: + if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: + if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX: + if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= + kvm_pmu_cap.num_counters_fixed) + return; + break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + return; + break; + default: + break; + } + + msrs_to_save[num_msrs_to_save++] = msr_index; +} + +static void kvm_init_msr_list(void) +{ unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, - "Please update the fixed PMCs in msrs_to_saved_all[]"); + "Please update the fixed PMCs in msrs_to_save_pmu[]"); num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; - for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { - if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) - continue; - - /* - * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. - */ - switch (msrs_to_save_all[i]) { - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) - continue; - break; - case MSR_TSC_AUX: - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && - !kvm_cpu_cap_has(X86_FEATURE_RDPID)) - continue; - break; - case MSR_IA32_UMWAIT_CONTROL: - if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) - continue; - break; - case MSR_IA32_RTIT_CTL: - case MSR_IA32_RTIT_STATUS: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) - continue; - break; - case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) - continue; - break; - case MSR_IA32_RTIT_OUTPUT_BASE: - case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && - !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) - continue; - break; - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= - intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) - continue; - break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) - continue; - break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) - continue; - break; - case MSR_IA32_XFD: - case MSR_IA32_XFD_ERR: - if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) - continue; - break; - default: - break; - } + for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) + kvm_probe_msr_to_save(msrs_to_save_base[i]); - msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; + if (enable_pmu) { + for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) + kvm_probe_msr_to_save(msrs_to_save_pmu[i]); } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { @@ -8150,9 +8160,14 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); } -static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) +{ + return is_smm(emul_to_vcpu(ctxt)); +} + +static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt) { - return emul_to_vcpu(ctxt)->arch.hflags; + return is_guest_mode(emul_to_vcpu(ctxt)); } #ifndef CONFIG_KVM_SMM @@ -8221,7 +8236,8 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_fxsr = emulator_guest_has_fxsr, .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, - .get_hflags = emulator_get_hflags, + .is_smm = emulator_is_smm, + .is_guest_mode = emulator_is_guest_mode, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, @@ -8293,8 +8309,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); - ctxt->interruptibility = 0; ctxt->have_exception = false; ctxt->exception.vector = -1; @@ -9385,7 +9399,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); - kvm_init_pmu_capability(); + kvm_init_pmu_capability(ops->pmu_ops); r = ops->hardware_setup(); if (r != 0) @@ -12273,7 +12287,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, */ hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); - if (IS_ERR((void *)hva)) + if (IS_ERR_VALUE(hva)) return (void __user *)hva; } else { if (!slot || !slot->npages) @@ -12488,16 +12502,14 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { - struct kvm_arch *ka = &kvm->arch; + int nr_slots; if (!kvm_x86_ops.cpu_dirty_log_size) return; - if ((enable && ++ka->cpu_dirty_logging_count == 1) || - (!enable && --ka->cpu_dirty_logging_count == 0)) + nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); + if ((enable && nr_slots == 1) || !nr_slots) kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING); - - WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0); } static void kvm_mmu_slot_apply_flags(struct kvm *kvm, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 44d1827f0a30..a8167b47b8c8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -331,6 +331,18 @@ extern bool report_ignored_msrs; extern bool eager_page_split; +static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data); +} + +static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr); +} + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 2681e2007e39..40edf4d1974c 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -23,6 +23,9 @@ #include <xen/interface/event_channel.h> #include <xen/interface/sched.h> +#include <asm/xen/cpuid.h> + +#include "cpuid.h" #include "trace.h" static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm); @@ -2077,6 +2080,29 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) del_timer_sync(&vcpu->arch.xen.poll_timer); } +void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + u32 function; + + if (!vcpu->arch.xen.cpuid.base) + return; + + function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3); + if (function > vcpu->arch.xen.cpuid.limit) + return; + + entry = kvm_find_cpuid_entry_index(vcpu, function, 1); + if (entry) { + entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul; + entry->edx = vcpu->arch.hv_clock.tsc_shift; + } + + entry = kvm_find_cpuid_entry_index(vcpu, function, 2); + if (entry) + entry->eax = vcpu->arch.hw_tsc_khz; +} + void kvm_xen_init_vm(struct kvm *kvm) { mutex_init(&kvm->arch.xen.xen_lock); diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index ea33d80a0c51..f8f1fe22d090 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -9,6 +9,8 @@ #ifndef __ARCH_X86_KVM_XEN_H__ #define __ARCH_X86_KVM_XEN_H__ +#include <asm/xen/hypervisor.h> + #ifdef CONFIG_KVM_XEN #include <linux/jump_label_ratelimit.h> @@ -32,6 +34,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); +void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu); static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { @@ -135,6 +138,10 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) { return false; } + +static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) +{ +} #endif int kvm_xen_hypercall(struct kvm_vcpu *vcpu); |