Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig            |   1
-rw-r--r--  arch/x86/kvm/cpuid.c            |  23
-rw-r--r--  arch/x86/kvm/debugfs.c          |  18
-rw-r--r--  arch/x86/kvm/hyperv.c           |  35
-rw-r--r--  arch/x86/kvm/irq.c              |   7
-rw-r--r--  arch/x86/kvm/irq.h              |   1
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h   |  42
-rw-r--r--  arch/x86/kvm/lapic.c            | 111
-rw-r--r--  arch/x86/kvm/lapic.h            |   4
-rw-r--r--  arch/x86/kvm/mmu.c              |  24
-rw-r--r--  arch/x86/kvm/mtrr.c             |  10
-rw-r--r--  arch/x86/kvm/paging_tmpl.h      |  40
-rw-r--r--  arch/x86/kvm/pmu.c              |  10
-rw-r--r--  arch/x86/kvm/pmu.h              |   3
-rw-r--r--  arch/x86/kvm/pmu_amd.c          |   4
-rw-r--r--  arch/x86/kvm/svm.c              | 145
-rw-r--r--  arch/x86/kvm/vmx/capabilities.h |   2
-rw-r--r--  arch/x86/kvm/vmx/nested.c       | 365
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c    |  34
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S      |  12
-rw-r--r--  arch/x86/kvm/vmx/vmx.c          | 132
-rw-r--r--  arch/x86/kvm/vmx/vmx.h          |  12
-rw-r--r--  arch/x86/kvm/x86.c              | 314
-rw-r--r--  arch/x86/kvm/x86.h              |  12
24 files changed, 801 insertions(+), 560 deletions(-)
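
A recurring theme in the diff below is the replacement of open-coded kvm_register_read()/kvm_register_write() calls with per-register accessors such as kvm_rax_read() and kvm_rdx_write(), generated by the new BUILD_KVM_GPR_ACCESSORS macro in kvm_cache_regs.h. The following standalone sketch illustrates that pattern outside the kernel; the struct sketch_vcpu and the enum values are simplified stand-ins for the kernel's struct kvm_vcpu and VCPU_REGS_* indices, not the real definitions.

```c
/*
 * Sketch of the BUILD_KVM_GPR_ACCESSORS pattern added to
 * arch/x86/kvm/kvm_cache_regs.h.  "sketch_vcpu" is a hypothetical
 * stand-in for struct kvm_vcpu; only the regs[] array matters here.
 */
#include <stdio.h>

enum sketch_reg {
	VCPU_REGS_RAX = 0,
	VCPU_REGS_RCX,
	VCPU_REGS_RDX,
	NR_SKETCH_REGS,
};

struct sketch_vcpu {
	unsigned long regs[NR_SKETCH_REGS];
};

/*
 * One invocation per GPR expands to a read/write pair, e.g.
 * kvm_rax_read()/kvm_rax_write(), so call sites no longer spell out
 * kvm_register_read(vcpu, VCPU_REGS_RAX) by hand.
 */
#define BUILD_KVM_GPR_ACCESSORS(lname, uname)				\
static inline unsigned long kvm_##lname##_read(struct sketch_vcpu *vcpu) \
{									\
	return vcpu->regs[VCPU_REGS_##uname];				\
}									\
static inline void kvm_##lname##_write(struct sketch_vcpu *vcpu,	\
				       unsigned long val)		\
{									\
	vcpu->regs[VCPU_REGS_##uname] = val;				\
}

BUILD_KVM_GPR_ACCESSORS(rax, RAX)
BUILD_KVM_GPR_ACCESSORS(rcx, RCX)
BUILD_KVM_GPR_ACCESSORS(rdx, RDX)

int main(void)
{
	struct sketch_vcpu vcpu = { { 0 } };

	/* A CPUID-style call site: place the leaf in RAX, read it back. */
	kvm_rax_write(&vcpu, 0x80000008UL);
	printf("leaf = 0x%lx\n", kvm_rax_read(&vcpu));

	/* Split a 64-bit value across EDX:EAX, as the MSR paths do. */
	kvm_rdx_write(&vcpu, kvm_rax_read(&vcpu) >> 32);
	return 0;
}
```

The macro keeps the accessors trivially inlineable while making call sites (cpuid.c, hyperv.c, svm.c, vmx/nested.c in the hunks below) shorter and harder to get wrong with a mismatched register index.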
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 72fa955f4a15..fc042419e670 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,7 +27,6 @@ config KVM depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER - select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select IRQ_BYPASS_MANAGER diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index fd3951638ae4..e18a9f9f65b5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -410,7 +410,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP); + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | + F(MD_CLEAR); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -455,8 +456,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } - /* function 4 has additional index. */ - case 4: { + /* functions 4 and 0x8000001d have additional index. */ + case 4: + case 0x8000001d: { int i, cache_type; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; @@ -700,8 +702,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx = entry->edx = 0; break; case 0x8000001a: - break; - case 0x8000001d: + case 0x8000001e: break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: @@ -962,13 +963,13 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) return 1; - eax = kvm_register_read(vcpu, VCPU_REGS_RAX); - ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + eax = kvm_rax_read(vcpu); + ecx = kvm_rcx_read(vcpu); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true); - kvm_register_write(vcpu, VCPU_REGS_RAX, eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, edx); + kvm_rax_write(vcpu, eax); + kvm_rbx_write(vcpu, ebx); + kvm_rcx_write(vcpu, ecx); + kvm_rdx_write(vcpu, edx); return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index c19c7ede9bd6..a2f3432ce090 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -9,12 +9,22 @@ */ #include <linux/kvm_host.h> #include <linux/debugfs.h> +#include "lapic.h" bool kvm_arch_has_vcpu_debugfs(void) { return true; } +static int vcpu_get_timer_advance_ns(void *data, u64 *val) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; + *val = vcpu->arch.apic->lapic_timer.timer_advance_ns; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n"); + static int vcpu_get_tsc_offset(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; @@ -51,6 +61,14 @@ int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) if (!ret) return -ENOMEM; + if (lapic_in_kernel(vcpu)) { + ret = debugfs_create_file("lapic_timer_advance_ns", 0444, + vcpu->debugfs_dentry, + vcpu, &vcpu_timer_advance_ns_fops); + if (!ret) + return -ENOMEM; + } + if (kvm_has_tsc_control) { ret = debugfs_create_file("tsc-scaling-ratio", 0444, vcpu->debugfs_dentry, diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 421899f6ad7b..8ca4b39918e0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1371,7 +1371,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu 
*current_vcpu, u64 ingpa, valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; - all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS; + + /* + * Work around possible WS2012 bug: it sends hypercalls + * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear, + * while also expecting us to flush something and crashing if + * we don't. Let's treat processor_mask == 0 same as + * HV_FLUSH_ALL_PROCESSORS. + */ + all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || + flush.processor_mask == 0; } else { if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex, sizeof(flush_ex)))) @@ -1526,10 +1535,10 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) longmode = is_64_bit_mode(vcpu); if (longmode) - kvm_register_write(vcpu, VCPU_REGS_RAX, result); + kvm_rax_write(vcpu, result); else { - kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff); + kvm_rdx_write(vcpu, result >> 32); + kvm_rax_write(vcpu, result & 0xffffffff); } } @@ -1602,18 +1611,18 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) longmode = is_64_bit_mode(vcpu); if (!longmode) { - param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); - ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); - outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + param = ((u64)kvm_rdx_read(vcpu) << 32) | + (kvm_rax_read(vcpu) & 0xffffffff); + ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | + (kvm_rcx_read(vcpu) & 0xffffffff); + outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | + (kvm_rsi_read(vcpu) & 0xffffffff); } #ifdef CONFIG_X86_64 else { - param = kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + param = kvm_rcx_read(vcpu); + ingpa = kvm_rdx_read(vcpu); + outgpa = kvm_r8_read(vcpu); } #endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index faa264822cee..007bc654f928 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -172,3 +172,10 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); } + +bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) +{ + bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE; + + return resample ? 
irqchip_kernel(kvm) : irqchip_in_kernel(kvm); +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index d5005cc26521..fd210cdd4983 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -114,6 +114,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return mode != KVM_IRQCHIP_NONE; } +bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index f8f56a93358b..1cc6c47dc77e 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -9,6 +9,34 @@ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE) +#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ +static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ +{ \ + return vcpu->arch.regs[VCPU_REGS_##uname]; \ +} \ +static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \ + unsigned long val) \ +{ \ + vcpu->arch.regs[VCPU_REGS_##uname] = val; \ +} +BUILD_KVM_GPR_ACCESSORS(rax, RAX) +BUILD_KVM_GPR_ACCESSORS(rbx, RBX) +BUILD_KVM_GPR_ACCESSORS(rcx, RCX) +BUILD_KVM_GPR_ACCESSORS(rdx, RDX) +BUILD_KVM_GPR_ACCESSORS(rbp, RBP) +BUILD_KVM_GPR_ACCESSORS(rsi, RSI) +BUILD_KVM_GPR_ACCESSORS(rdi, RDI) +#ifdef CONFIG_X86_64 +BUILD_KVM_GPR_ACCESSORS(r8, R8) +BUILD_KVM_GPR_ACCESSORS(r9, R9) +BUILD_KVM_GPR_ACCESSORS(r10, R10) +BUILD_KVM_GPR_ACCESSORS(r11, R11) +BUILD_KVM_GPR_ACCESSORS(r12, R12) +BUILD_KVM_GPR_ACCESSORS(r13, R13) +BUILD_KVM_GPR_ACCESSORS(r14, R14) +BUILD_KVM_GPR_ACCESSORS(r15, R15) +#endif + static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -37,6 +65,16 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) kvm_register_write(vcpu, VCPU_REGS_RIP, val); } +static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu) +{ + return kvm_register_read(vcpu, VCPU_REGS_RSP); +} + +static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val) +{ + kvm_register_write(vcpu, VCPU_REGS_RSP, val); +} + static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { might_sleep(); /* on svm */ @@ -83,8 +121,8 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) { - return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) - | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); + return (kvm_rax_read(vcpu) & -1u) + | ((u64)(kvm_rdx_read(vcpu) & -1u) << 32); } static inline void enter_guest_mode(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9bf70cf84564..4924f83ed4f3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -70,7 +70,6 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul -static bool lapic_timer_advance_adjust_done = false; #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -1455,7 +1454,7 @@ static void apic_timer_expired(struct kvm_lapic *apic) if (swait_active(q)) swake_up_one(q); - if (apic_lvtt_tscdeadline(apic)) + if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; } @@ -1482,14 +1481,32 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) return false; } +static inline void 
__wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) +{ + u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns; + + /* + * If the guest TSC is running at a different ratio than the host, then + * convert the delay to nanoseconds to achieve an accurate delay. Note + * that __delay() uses delay_tsc whenever the hardware has TSC, thus + * always for VMX enabled hardware. + */ + if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { + __delay(min(guest_cycles, + nsec_to_cycles(vcpu, timer_advance_ns))); + } else { + u64 delay_ns = guest_cycles * 1000000ULL; + do_div(delay_ns, vcpu->arch.virtual_tsc_khz); + ndelay(min_t(u32, delay_ns, timer_advance_ns)); + } +} + void wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 guest_tsc, tsc_deadline, ns; - if (!lapic_in_kernel(vcpu)) - return; - if (apic->lapic_timer.expired_tscdeadline == 0) return; @@ -1501,33 +1518,37 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); - /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ if (guest_tsc < tsc_deadline) - __delay(min(tsc_deadline - guest_tsc, - nsec_to_cycles(vcpu, lapic_timer_advance_ns))); + __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - if (!lapic_timer_advance_adjust_done) { + if (!apic->lapic_timer.timer_advance_adjust_done) { /* too early */ if (guest_tsc < tsc_deadline) { ns = (tsc_deadline - guest_tsc) * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - lapic_timer_advance_ns -= min((unsigned int)ns, - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns -= min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } else { /* too late */ ns = (guest_tsc - tsc_deadline) * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - lapic_timer_advance_ns += min((unsigned int)ns, - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns += min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - lapic_timer_advance_adjust_done = true; + apic->lapic_timer.timer_advance_adjust_done = true; + if (unlikely(timer_advance_ns > 5000)) { + timer_advance_ns = 0; + apic->lapic_timer.timer_advance_adjust_done = true; + } + apic->lapic_timer.timer_advance_ns = timer_advance_ns; } } static void start_sw_tscdeadline(struct kvm_lapic *apic) { - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + u64 guest_tsc, tscdeadline = ktimer->tscdeadline; u64 ns = 0; ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; @@ -1542,13 +1563,15 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); + + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + + if (likely(tscdeadline > guest_tsc) && + likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS_PINNED); + expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); + hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED); } else 
apic_timer_expired(apic); @@ -1673,37 +1696,42 @@ static void cancel_hv_timer(struct kvm_lapic *apic) static bool start_hv_timer(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; - int r; + struct kvm_vcpu *vcpu = apic->vcpu; + bool expired; WARN_ON(preemptible()); if (!kvm_x86_ops->set_hv_timer) return false; - if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) - return false; - if (!ktimer->tscdeadline) return false; - r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); - if (r < 0) + if (kvm_x86_ops->set_hv_timer(vcpu, ktimer->tscdeadline, &expired)) return false; ktimer->hv_timer_in_use = true; hrtimer_cancel(&ktimer->timer); /* - * Also recheck ktimer->pending, in case the sw timer triggered in - * the window. For periodic timer, leave the hv timer running for - * simplicity, and the deadline will be recomputed on the next vmexit. + * To simplify handling the periodic timer, leave the hv timer running + * even if the deadline timer has expired, i.e. rely on the resulting + * VM-Exit to recompute the periodic timer's target expiration. */ - if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) { - if (r) + if (!apic_lvtt_period(apic)) { + /* + * Cancel the hv timer if the sw timer fired while the hv timer + * was being programmed, or if the hv timer itself expired. + */ + if (atomic_read(&ktimer->pending)) { + cancel_hv_timer(apic); + } else if (expired) { apic_timer_expired(apic); - return false; + cancel_hv_timer(apic); + } } - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true); + trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use); + return true; } @@ -1727,8 +1755,13 @@ static void start_sw_timer(struct kvm_lapic *apic) static void restart_apic_timer(struct kvm_lapic *apic) { preempt_disable(); + + if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending)) + goto out; + if (!start_hv_timer(apic)) start_sw_timer(apic); +out: preempt_enable(); } @@ -2255,7 +2288,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -int kvm_create_lapic(struct kvm_vcpu *vcpu) +int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) { struct kvm_lapic *apic; @@ -2279,6 +2312,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; + if (timer_advance_ns == -1) { + apic->lapic_timer.timer_advance_ns = 1000; + apic->lapic_timer.timer_advance_adjust_done = false; + } else { + apic->lapic_timer.timer_advance_ns = timer_advance_ns; + apic->lapic_timer.timer_advance_adjust_done = true; + } + /* * APIC is created enabled. 
This will prevent kvm_lapic_set_base from diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ff6ef9c3d760..d6d049ba3045 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -31,8 +31,10 @@ struct kvm_timer { u32 timer_mode_mask; u64 tscdeadline; u64 expired_tscdeadline; + u32 timer_advance_ns; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; + bool timer_advance_adjust_done; }; struct kvm_lapic { @@ -62,7 +64,7 @@ struct kvm_lapic { struct dest_map; -int kvm_create_lapic(struct kvm_vcpu *vcpu); +int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns); void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e10962dfc203..1e9ba81accba 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -44,6 +44,7 @@ #include <asm/page.h> #include <asm/pat.h> #include <asm/cmpxchg.h> +#include <asm/e820/api.h> #include <asm/io.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> @@ -487,16 +488,24 @@ static void kvm_mmu_reset_all_pte_masks(void) * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is * assumed that the CPU is not vulnerable to L1TF. + * + * Some Intel CPUs address the L1 cache using more PA bits than are + * reported by CPUID. Use the PA width of the L1 cache when possible + * to achieve more effective mitigation, e.g. if system RAM overlaps + * the most significant bits of legal physical address space. */ - low_phys_bits = boot_cpu_data.x86_phys_bits; - if (boot_cpu_data.x86_phys_bits < + shadow_nonpresent_or_rsvd_mask = 0; + low_phys_bits = boot_cpu_data.x86_cache_bits; + if (boot_cpu_data.x86_cache_bits < 52 - shadow_nonpresent_or_rsvd_mask_len) { shadow_nonpresent_or_rsvd_mask = - rsvd_bits(boot_cpu_data.x86_phys_bits - + rsvd_bits(boot_cpu_data.x86_cache_bits - shadow_nonpresent_or_rsvd_mask_len, - boot_cpu_data.x86_phys_bits - 1); + boot_cpu_data.x86_cache_bits - 1); low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; - } + } else + WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF)); + shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); } @@ -2892,7 +2901,9 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) */ (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); - return true; + return !e820__mapped_raw_any(pfn_to_hpa(pfn), + pfn_to_hpa(pfn + 1) - 1, + E820_TYPE_RAM); } /* Bits which may be returned by set_spte() */ @@ -4781,6 +4792,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) union kvm_mmu_extended_role ext = {0}; ext.cr0_pg = !!is_paging(vcpu); + ext.cr4_pae = !!is_pae(vcpu); ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); ext.cr4_pse = !!is_pse(vcpu); diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index e9ea2d45ae66..9f72cc427158 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -48,11 +48,6 @@ static bool msr_mtrr_valid(unsigned msr) return false; } -static bool valid_pat_type(unsigned t) -{ - return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ -} - static bool valid_mtrr_type(unsigned t) { return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ @@ -67,10 +62,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) return false; if (msr == MSR_IA32_CR_PAT) { - for (i = 0; i < 8; i++) - if (!valid_pat_type((data >> (i * 8)) & 0xff)) - return false; - return true; + return 
kvm_pat_valid(data); } else if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bdca39829bc..367a47df4ba0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -140,16 +140,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t *table; struct page *page; - npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); - /* Check if the user is doing something meaningless. */ - if (unlikely(npages != 1)) - return -EFAULT; - - table = kmap_atomic(page); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table); - - kvm_release_page_dirty(page); + npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); + if (likely(npages == 1)) { + table = kmap_atomic(page); + ret = CMPXCHG(&table[index], orig_pte, new_pte); + kunmap_atomic(table); + + kvm_release_page_dirty(page); + } else { + struct vm_area_struct *vma; + unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK; + unsigned long pfn; + unsigned long paddr; + + down_read(¤t->mm->mmap_sem); + vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE); + if (!vma || !(vma->vm_flags & VM_PFNMAP)) { + up_read(¤t->mm->mmap_sem); + return -EFAULT; + } + pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + paddr = pfn << PAGE_SHIFT; + table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB); + if (!table) { + up_read(¤t->mm->mmap_sem); + return -EFAULT; + } + ret = CMPXCHG(&table[index], orig_pte, new_pte); + memunmap(table); + up_read(¤t->mm->mmap_sem); + } return (ret != orig_pte); } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e39741997893..dd745b58ffd8 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -283,7 +283,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) bool fast_mode = idx & (1u << 31); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; - u64 ctr_val; + u64 mask = fast_mode ? 
~0u : ~0ull; if (!pmu->version) return 1; @@ -291,15 +291,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx); + pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask); if (!pmc) return 1; - ctr_val = pmc_read_counter(pmc); - if (fast_mode) - ctr_val = (u32)ctr_val; - - *data = ctr_val; + *data = pmc_read_counter(pmc) & mask; return 0; } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ba8898e1a854..22dff661145a 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -25,7 +25,8 @@ struct kvm_pmu_ops { unsigned (*find_fixed_event)(int idx); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); - struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx); + struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx, + u64 *mask); int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data); diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index 1495a735b38e..d3118088f1cd 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -186,7 +186,7 @@ static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) } /* idx is the ECX register of RDPMC instruction */ -static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx) +static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *mask) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *counters; @@ -269,10 +269,10 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xffffffff00200000ull; + pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; - pmu->version = 0; pmu->global_status = 0; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 406b558abfef..735b8c01895e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -379,6 +379,9 @@ module_param(vgif, int, 0444); static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); module_param(sev, int, 0444); +static bool __read_mostly dump_invalid_vmcb = 0; +module_param(dump_invalid_vmcb, bool, 0644); + static u8 rsm_ins_bytes[] = "\x0f\xaa"; static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); @@ -1805,7 +1808,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return NULL; /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); goto err; @@ -2024,7 +2027,11 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!kvm_vcpu_apicv_active(vcpu)) return; - if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) + /* + * Since the host physical APIC id is 8 bits, + * we can support host APIC ID upto 255. 
+ */ + if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); @@ -2091,7 +2098,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) init_vmcb(svm); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); - kvm_register_write(vcpu, VCPU_REGS_RDX, eax); + kvm_rdx_write(vcpu, eax); if (kvm_vcpu_apicv_active(vcpu) && !init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); @@ -3071,32 +3078,6 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm) return false; } -static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) -{ - struct page *page; - - might_sleep(); - - page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) - goto error; - - *_page = page; - - return kmap(page); - -error: - kvm_inject_gp(&svm->vcpu, 0); - - return NULL; -} - -static void nested_svm_unmap(struct page *page) -{ - kunmap(page); - kvm_release_page_dirty(page); -} - static int nested_svm_intercept_ioio(struct vcpu_svm *svm) { unsigned port, size, iopm_len; @@ -3299,10 +3280,11 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr static int nested_svm_vmexit(struct vcpu_svm *svm) { + int rc; struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; - struct page *page; + struct kvm_host_map map; trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, vmcb->control.exit_info_1, @@ -3311,9 +3293,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) vmcb->control.exit_int_info_err, KVM_ISA_SVM); - nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); - if (!nested_vmcb) + rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map); + if (rc) { + if (rc == -EINVAL) + kvm_inject_gp(&svm->vcpu, 0); return 1; + } + + nested_vmcb = map.hva; /* Exit Guest-Mode */ leave_guest_mode(&svm->vcpu); @@ -3408,16 +3395,16 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) } else { (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); } - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); - kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); - kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip); + kvm_rax_write(&svm->vcpu, hsave->save.rax); + kvm_rsp_write(&svm->vcpu, hsave->save.rsp); + kvm_rip_write(&svm->vcpu, hsave->save.rip); svm->vmcb->save.dr7 = 0; svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; mark_all_dirty(svm->vmcb); - nested_svm_unmap(page); + kvm_vcpu_unmap(&svm->vcpu, &map, true); nested_svm_uninit_mmu_context(&svm->vcpu); kvm_mmu_reset_context(&svm->vcpu); @@ -3483,7 +3470,7 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) } static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb, struct page *page) + struct vmcb *nested_vmcb, struct kvm_host_map *map) { if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; @@ -3516,9 +3503,9 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, kvm_mmu_reset_context(&svm->vcpu); svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); - kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); - kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); + kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); + kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); + kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip); 
/* In case we don't even reach vcpu_run, the fields are not updated */ svm->vmcb->save.rax = nested_vmcb->save.rax; @@ -3567,7 +3554,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vmcb->control.pause_filter_thresh = nested_vmcb->control.pause_filter_thresh; - nested_svm_unmap(page); + kvm_vcpu_unmap(&svm->vcpu, map, true); /* Enter Guest-Mode */ enter_guest_mode(&svm->vcpu); @@ -3587,17 +3574,23 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, static bool nested_svm_vmrun(struct vcpu_svm *svm) { + int rc; struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; - struct page *page; + struct kvm_host_map map; u64 vmcb_gpa; vmcb_gpa = svm->vmcb->save.rax; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) + rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map); + if (rc) { + if (rc == -EINVAL) + kvm_inject_gp(&svm->vcpu, 0); return false; + } + + nested_vmcb = map.hva; if (!nested_vmcb_checks(nested_vmcb)) { nested_vmcb->control.exit_code = SVM_EXIT_ERR; @@ -3605,7 +3598,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) nested_vmcb->control.exit_info_1 = 0; nested_vmcb->control.exit_info_2 = 0; - nested_svm_unmap(page); + kvm_vcpu_unmap(&svm->vcpu, &map, true); return false; } @@ -3649,7 +3642,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) copy_vmcb_control_area(hsave, vmcb); - enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page); + enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); return true; } @@ -3673,21 +3666,26 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; - struct page *page; + struct kvm_host_map map; int ret; if (nested_svm_check_permissions(svm)) return 1; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); + if (ret) { + if (ret == -EINVAL) + kvm_inject_gp(&svm->vcpu, 0); return 1; + } + + nested_vmcb = map.hva; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); - nested_svm_unmap(page); + kvm_vcpu_unmap(&svm->vcpu, &map, true); return ret; } @@ -3695,21 +3693,26 @@ static int vmload_interception(struct vcpu_svm *svm) static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; - struct page *page; + struct kvm_host_map map; int ret; if (nested_svm_check_permissions(svm)) return 1; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); + if (ret) { + if (ret == -EINVAL) + kvm_inject_gp(&svm->vcpu, 0); return 1; + } + + nested_vmcb = map.hva; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); - nested_svm_unmap(page); + kvm_vcpu_unmap(&svm->vcpu, &map, true); return ret; } @@ -3791,11 +3794,11 @@ static int invlpga_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX), - kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); + trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu), + kvm_rax_read(&svm->vcpu)); /* Let's treat INVLPGA the same as INVLPG (can be optimized!) 
*/ - kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); + kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu)); svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_skip_emulated_instruction(&svm->vcpu); @@ -3803,7 +3806,7 @@ static int invlpga_interception(struct vcpu_svm *svm) static int skinit_interception(struct vcpu_svm *svm) { - trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); + trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu)); kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -3817,7 +3820,7 @@ static int wbinvd_interception(struct vcpu_svm *svm) static int xsetbv_interception(struct vcpu_svm *svm) { u64 new_bv = kvm_read_edx_eax(&svm->vcpu); - u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); + u32 index = kvm_rcx_read(&svm->vcpu); if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; @@ -4213,7 +4216,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int rdmsr_interception(struct vcpu_svm *svm) { - u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); + u32 ecx = kvm_rcx_read(&svm->vcpu); struct msr_data msr_info; msr_info.index = ecx; @@ -4225,10 +4228,8 @@ static int rdmsr_interception(struct vcpu_svm *svm) } else { trace_kvm_msr_read(ecx, msr_info.data); - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, - msr_info.data & 0xffffffff); - kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, - msr_info.data >> 32); + kvm_rax_write(&svm->vcpu, msr_info.data & 0xffffffff); + kvm_rdx_write(&svm->vcpu, msr_info.data >> 32); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; return kvm_skip_emulated_instruction(&svm->vcpu); } @@ -4422,7 +4423,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) static int wrmsr_interception(struct vcpu_svm *svm) { struct msr_data msr; - u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); + u32 ecx = kvm_rcx_read(&svm->vcpu); u64 data = kvm_read_edx_eax(&svm->vcpu); msr.data = data; @@ -4830,6 +4831,11 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + if (!dump_invalid_vmcb) { + pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); + return; + } + pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); @@ -4988,7 +4994,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; - pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); dump_vmcb(vcpu); return 0; } @@ -6236,7 +6241,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *nested_vmcb; - struct page *page; + struct kvm_host_map map; u64 guest; u64 vmcb; @@ -6244,10 +6249,10 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); if (guest) { - nested_vmcb = nested_svm_map(svm, vmcb, &page); - if (!nested_vmcb) + if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL) return 1; - enter_svm_guest_mode(svm, vmcb, nested_vmcb, page); + nested_vmcb = map.hva; + enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map); } return 0; } diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 854e144131c6..d6664ee3d127 100644 --- 
a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -2,6 +2,8 @@ #ifndef __KVM_X86_VMX_CAPS_H #define __KVM_X86_VMX_CAPS_H +#include <asm/vmx.h> + #include "lapic.h" extern bool __read_mostly enable_vpid; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6401eb7ef19c..1032f068f0b9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -193,10 +193,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) if (!vmx->nested.hv_evmcs) return; - kunmap(vmx->nested.hv_evmcs_page); - kvm_release_page_dirty(vmx->nested.hv_evmcs_page); + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); vmx->nested.hv_evmcs_vmptr = -1ull; - vmx->nested.hv_evmcs_page = NULL; vmx->nested.hv_evmcs = NULL; } @@ -229,16 +227,9 @@ static void free_nested(struct kvm_vcpu *vcpu) kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + vmx->nested.pi_desc = NULL; kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); @@ -519,39 +510,19 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { int msr; - struct page *page; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; - /* - * pred_cmd & spec_ctrl are trying to verify two things: - * - * 1. L0 gave a permission to L1 to actually passthrough the MSR. This - * ensures that we do not accidentally generate an L02 MSR bitmap - * from the L12 MSR bitmap that is too permissive. - * 2. That L1 or L2s have actually used the MSR. This avoids - * unnecessarily merging of the bitmap if the MSR is unused. This - * works properly because we only update the L01 MSR bitmap lazily. - * So even if L0 should pass L1 these MSRs, the L01 bitmap is only - * updated to reflect this when L1 (or its L2s) actually write to - * the MSR. - */ - bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); - bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return false; - if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && - !pred_cmd && !spec_ctrl) - return false; - - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); - if (is_error_page(page)) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) return false; - msr_bitmap_l1 = (unsigned long *)kmap(page); + msr_bitmap_l1 = (unsigned long *)map->hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen @@ -592,20 +563,42 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, } } - if (spec_ctrl) + /* KVM unconditionally exposes the FS/GS base MSRs to L1. 
*/ + nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, + MSR_FS_BASE, MSR_TYPE_RW); + + nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, + MSR_GS_BASE, MSR_TYPE_RW); + + nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, + MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + + /* + * Checking the L0->L1 bitmap is trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This + * ensures that we do not accidentally generate an L02 MSR bitmap + * from the L12 MSR bitmap that is too permissive. + * 2. That L1 or L2s have actually used the MSR. This avoids + * unnecessarily merging of the bitmap if the MSR is unused. This + * works properly because we only update the L01 MSR bitmap lazily. + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only + * updated to reflect this when L1 (or its L2s) actually write to + * the MSR. + */ + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL)) nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_SPEC_CTRL, MSR_TYPE_R | MSR_TYPE_W); - if (pred_cmd) + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD)) nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PRED_CMD, MSR_TYPE_W); - kunmap(page); - kvm_release_page_clean(page); + kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false); return true; } @@ -613,20 +606,20 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct kvm_host_map map; struct vmcs12 *shadow; - struct page *page; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == -1ull) return; shadow = get_shadow_vmcs12(vcpu); - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); - memcpy(shadow, kmap(page), VMCS12_SIZE); + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) + return; - kunmap(page); - kvm_release_page_clean(page); + memcpy(shadow, map.hva, VMCS12_SIZE); + kvm_vcpu_unmap(vcpu, &map, false); } static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, @@ -930,7 +923,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { if (!nested_cr3_valid(vcpu, cr3)) { *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; + return -EINVAL; } /* @@ -941,7 +934,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne !nested_ept) { if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { *entry_failure_code = ENTRY_FAIL_PDPTE; - return 1; + return -EINVAL; } } } @@ -1794,13 +1787,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, nested_release_evmcs(vcpu); - vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( - vcpu, assist_page.current_nested_vmcs); - - if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs), + &vmx->nested.hv_evmcs_map)) return 0; - vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); + vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; /* * Currently, KVM only supports eVMCS version 1 @@ -2373,19 +2364,19 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, */ if (vmx->emulation_required) { *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; + return -EINVAL; } /* Shadow page tables on either EPT or shadow page tables. 
*/ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) - return 1; + return -EINVAL; if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); + kvm_rsp_write(vcpu, vmcs12->guest_rsp); + kvm_rip_write(vcpu, vmcs12->guest_rip); return 0; } @@ -2589,11 +2580,19 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, return 0; } -/* - * Checks related to Host Control Registers and MSRs - */ -static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_check_vm_execution_controls(vcpu, vmcs12) || + nested_check_vm_exit_controls(vcpu, vmcs12) || + nested_check_vm_entry_controls(vcpu, vmcs12)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { bool ia32e; @@ -2606,6 +2605,10 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) return -EINVAL; + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && + !kvm_pat_valid(vmcs12->host_ia32_pat)) + return -EINVAL; + /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, @@ -2624,41 +2627,12 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, return 0; } -/* - * Checks related to Guest Non-register State - */ -static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) -{ - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (nested_check_vm_execution_controls(vcpu, vmcs12) || - nested_check_vm_exit_controls(vcpu, vmcs12) || - nested_check_vm_entry_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_check_host_control_regs(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - - if (nested_check_guest_non_reg_state(vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - return 0; -} - static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int r; - struct page *page; + int r = 0; struct vmcs12 *shadow; + struct kvm_host_map map; if (vmcs12->vmcs_link_pointer == -1ull) return 0; @@ -2666,23 +2640,34 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) return -EINVAL; - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); - if (is_error_page(page)) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) return -EINVAL; - r = 0; - shadow = kmap(page); + shadow = map.hva; + if (shadow->hdr.revision_id != VMCS12_REVISION || shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) r = -EINVAL; - kunmap(page); - kvm_release_page_clean(page); + + kvm_vcpu_unmap(vcpu, &map, false); return r; } -static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, - u32 *exit_qual) +/* + * Checks related to Guest Non-register State + */ +static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) +{ + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + u32 *exit_qual) { bool ia32e; @@ -2690,11 +2675,15 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) - return 1; + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && + !kvm_pat_valid(vmcs12->guest_ia32_pat)) + return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; - return 1; + return -EINVAL; } /* @@ -2713,13 +2702,16 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || ((vmcs12->guest_cr0 & X86_CR0_PG) && ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) - return 1; + return -EINVAL; } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || - (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) - return 1; + (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || + (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + return -EINVAL; + + if (nested_check_guest_non_reg_state(vmcs12)) + return -EINVAL; return 0; } @@ -2792,14 +2784,13 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) : "cc", "memory" ); - preempt_enable(); - if (vmx->msr_autoload.host.nr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); if (vmx->msr_autoload.guest.nr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); if (vm_fail) { + preempt_enable(); WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -2811,6 +2802,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) local_irq_enable(); if (hw_breakpoint_active()) set_debugreg(__this_cpu_read(cpu_dr7), 7); + preempt_enable(); /* * A non-failing VMEntry means we somehow entered guest mode with @@ -2832,6 +2824,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_host_map *map; struct page *page; u64 hpa; @@ -2864,20 +2857,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); + map = &vmx->nested.virtual_apic_map; /* * If translation failed, VM entry will fail because * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. 
*/ - if (!is_error_page(page)) { - vmx->nested.virtual_apic_page = page; - hpa = page_to_phys(vmx->nested.virtual_apic_page); - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { @@ -2898,26 +2885,15 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) } if (nested_cpu_has_posted_intr(vmcs12)) { - if (vmx->nested.pi_desc_page) { /* shouldn't happen */ - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull); + map = &vmx->nested.pi_desc_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { + vmx->nested.pi_desc = + (struct pi_desc *)(((void *)map->hva) + + offset_in_page(vmcs12->posted_intr_desc_addr)); + vmcs_write64(POSTED_INTR_DESC_ADDR, + pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); - if (is_error_page(page)) - return; - vmx->nested.pi_desc_page = page; - vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); - vmx->nested.pi_desc = - (struct pi_desc *)((void *)vmx->nested.pi_desc + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - vmcs_write64(POSTED_INTR_DESC_ADDR, - page_to_phys(vmx->nested.pi_desc_page) + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); } if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, @@ -3000,7 +2976,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) return -1; } - if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) goto vmentry_fail_vmexit; } @@ -3145,9 +3121,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12); - if (ret) - return nested_vmx_failValid(vcpu, ret); + if (nested_vmx_check_controls(vcpu, vmcs12)) + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + if (nested_vmx_check_host_state(vcpu, vmcs12)) + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); /* * We're finally done with prerequisite checking, and can start with @@ -3310,11 +3288,12 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr != 256) { - vapic_page = kmap(vmx->nested.virtual_apic_page); + vapic_page = vmx->nested.virtual_apic_map.hva; + if (!vapic_page) + return; + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page, &max_irr); - kunmap(vmx->nested.virtual_apic_page); - status = vmcs_read16(GUEST_INTR_STATUS); if ((u8)max_irr > ((u8)status & 0xff)) { status &= ~0xff; @@ -3425,8 +3404,8 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); - vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); + vmcs12->guest_rsp = kvm_rsp_read(vcpu); + vmcs12->guest_rip = kvm_rip_read(vcpu); vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); @@ -3609,8 +3588,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); vmx_set_efer(vcpu, vcpu->arch.efer); - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); + kvm_rsp_write(vcpu, vmcs12->host_rsp); + kvm_rip_write(vcpu, vmcs12->host_rip); vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); vmx_set_interrupt_shadow(vcpu, 0); @@ -3955,16 +3934,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + vmx->nested.pi_desc = NULL; /* * We are now running in L2, mmu_notifier will force to reload the @@ -4260,7 +4232,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; gpa_t vmptr; - struct page *page; + uint32_t revision; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; @@ -4306,21 +4278,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu) * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; * which replaces physical address width with 32 */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failInvalid(vcpu); - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) + if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || + revision != VMCS12_REVISION) return nested_vmx_failInvalid(vcpu); - if (*(u32 *)kmap(page) != 
VMCS12_REVISION) { - kunmap(page); - kvm_release_page_clean(page); - return nested_vmx_failInvalid(vcpu); - } - kunmap(page); - kvm_release_page_clean(page); - vmx->nested.vmxon_ptr = vmptr; ret = enter_vmx_operation(vcpu); if (ret) @@ -4377,7 +4341,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); @@ -4385,7 +4349,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - if (vmx->nested.hv_evmcs_page) { + if (vmx->nested.hv_evmcs_map.hva) { if (vmptr == vmx->nested.hv_evmcs_vmptr) nested_release_evmcs(vcpu); } else { @@ -4584,7 +4548,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); @@ -4597,11 +4561,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return 1; if (vmx->nested.current_vmptr != vmptr) { + struct kvm_host_map map; struct vmcs12 *new_vmcs12; - struct page *page; - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) { + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the @@ -4611,12 +4574,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } - new_vmcs12 = kmap(page); + + new_vmcs12 = map.hva; + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || (new_vmcs12->hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { - kunmap(page); - kvm_release_page_clean(page); + kvm_vcpu_unmap(vcpu, &map, false); return nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -4628,8 +4592,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) * cached. */ memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); - kunmap(page); - kvm_release_page_clean(page); + kvm_vcpu_unmap(vcpu, &map, false); set_current_vmptr(vmx, vmptr); } @@ -4804,7 +4767,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 index = kvm_rcx_read(vcpu); u64 address; bool accessed_dirty; struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -4850,7 +4813,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; - u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; + u32 function = kvm_rax_read(vcpu); /* * VMFUNC is only supported for nested guests, but we always enable the @@ -4936,7 +4899,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 exit_reason) { - u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 msr_index = kvm_rcx_read(vcpu); gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) @@ -5373,9 +5336,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->format != 0) return -EINVAL; - if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) - nested_enable_evmcs(vcpu, NULL); - if (!nested_vmx_allowed(vcpu)) return kvm_state->vmx.vmxon_pa == -1ull ? 
0 : -EINVAL; @@ -5417,13 +5377,16 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->vmx.vmxon_pa == -1ull) return 0; + if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) + nested_enable_evmcs(vcpu, NULL); + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; ret = enter_vmx_operation(vcpu); if (ret) return ret; /* Empty 'VMXON' state is permitted */ - if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) return 0; if (kvm_state->vmx.vmcs_pa != -1ull) { @@ -5463,33 +5426,41 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmx->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + ret = -EINVAL; if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); - if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) - return -EINVAL; + if (kvm_state->size < sizeof(*kvm_state) + VMCS12_SIZE + sizeof(*vmcs12)) + goto error_guest_mode; if (copy_from_user(shadow_vmcs12, user_kvm_nested_state->data + VMCS12_SIZE, - sizeof(*vmcs12))) - return -EFAULT; + sizeof(*vmcs12))) { + ret = -EFAULT; + goto error_guest_mode; + } if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || !shadow_vmcs12->hdr.shadow_vmcs) - return -EINVAL; + goto error_guest_mode; } - if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) || - nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) - return -EINVAL; + if (nested_vmx_check_controls(vcpu, vmcs12) || + nested_vmx_check_host_state(vcpu, vmcs12) || + nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) + goto error_guest_mode; vmx->nested.dirty_vmcs12 = true; ret = nested_vmx_enter_non_root_mode(vcpu, false); if (ret) - return -EINVAL; + goto error_guest_mode; return 0; + +error_guest_mode: + vmx->nested.nested_run_pending = 0; + return ret; } void nested_vmx_vcpu_setup(void) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5ab4a364348e..a99613a060dd 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -126,7 +126,7 @@ static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) } static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, - unsigned idx) + unsigned idx, u64 *mask) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); @@ -138,6 +138,7 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, if (fixed && idx >= pmu->nr_arch_fixed_counters) return NULL; counters = fixed ? pmu->fixed_counters : pmu->gp_counters; + *mask &= pmu->counter_bitmask[fixed ? 
KVM_PMC_FIXED : KVM_PMC_GP]; return &counters[idx]; } @@ -183,9 +184,13 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) *data = pmu->global_ovf_ctrl; return 0; default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, msr))) { - *data = pmc_read_counter(pmc); + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { + u64 val = pmc_read_counter(pmc); + *data = val & pmu->counter_bitmask[KVM_PMC_GP]; + return 0; + } else if ((pmc = get_fixed_pmc(pmu, msr))) { + u64 val = pmc_read_counter(pmc); + *data = val & pmu->counter_bitmask[KVM_PMC_FIXED]; return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { *data = pmc->eventsel; @@ -227,7 +232,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { + if (!(data & pmu->global_ovf_ctrl_mask)) { if (!msr_info->host_initiated) pmu->global_status &= ~data; pmu->global_ovf_ctrl = data; @@ -235,11 +240,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, msr))) { - if (!msr_info->host_initiated) - data = (s64)(s32)data; - pmc->counter += data - pmc_read_counter(pmc); + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { + if (msr_info->host_initiated) + pmc->counter = data; + else + pmc->counter = (s32)data; + return 0; + } else if ((pmc = get_fixed_pmc(pmu, msr))) { + pmc->counter = data; return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) @@ -297,6 +305,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; + pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask + & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | + MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); + if (kvm_x86_ops->pt_supported()) + pmu->global_ovf_ctrl_mask &= + ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry(vcpu, 7, 0); if (entry && diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 7b272738c576..d4cb1945b2e3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -3,6 +3,7 @@ #include <asm/asm.h> #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> +#include <asm/nospec-branch.h> #define WORD_SIZE (BITS_PER_LONG / 8) @@ -77,6 +78,17 @@ ENDPROC(vmx_vmenter) * referred to by VMCS.HOST_RIP. */ ENTRY(vmx_vmexit) +#ifdef CONFIG_RETPOLINE + ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE + /* Preserve guest's RAX, it's used to stuff the RSB. */ + push %_ASM_AX + + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! 
*/ + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE + + pop %_ASM_AX +.Lvmexit_skip_rsb: +#endif ret ENDPROC(vmx_vmexit) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b4e7d645275a..b93e36ddee5e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -114,6 +114,9 @@ static u64 __read_mostly host_xss; bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +static bool __read_mostly dump_invalid_vmcs = 0; +module_param(dump_invalid_vmcs, bool, 0644); + #define MSR_BITMAP_MODE_X2APIC 1 #define MSR_BITMAP_MODE_X2APIC_APICV 2 @@ -1692,6 +1695,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_POWER_CTL: + msr_info->data = vmx->msr_ia32_power_ctl; + break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -1822,6 +1828,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_POWER_CTL: + vmx->msr_ia32_power_ctl = data; + break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -1891,7 +1900,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + if (!kvm_pat_valid(data)) return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; @@ -2288,7 +2297,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_CLEAR_BNDCFGS | @@ -3619,14 +3627,13 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || !nested_cpu_has_vid(get_vmcs12(vcpu)) || - WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) + WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) return false; rvi = vmx_get_rvi(); - vapic_page = kmap(vmx->nested.virtual_apic_page); + vapic_page = vmx->nested.virtual_apic_map.hva; vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); - kunmap(vmx->nested.virtual_apic_page); return ((rvi & 0xf0) > (vppr & 0xf0)); } @@ -4827,7 +4834,7 @@ static int handle_cpuid(struct kvm_vcpu *vcpu) static int handle_rdmsr(struct kvm_vcpu *vcpu) { - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 ecx = kvm_rcx_read(vcpu); struct msr_data msr_info; msr_info.index = ecx; @@ -4840,18 +4847,16 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) trace_kvm_msr_read(ecx, msr_info.data); - /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; + kvm_rax_write(vcpu, msr_info.data & -1u); + kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u); return kvm_skip_emulated_instruction(vcpu); } static int handle_wrmsr(struct kvm_vcpu *vcpu) { struct msr_data msr; - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + u32 ecx = kvm_rcx_read(vcpu); + u64 data = kvm_read_edx_eax(vcpu); msr.data = data; msr.index = ecx; @@ -4922,7 +4927,7 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu) static int handle_xsetbv(struct kvm_vcpu *vcpu) { u64 new_bv = 
kvm_read_edx_eax(vcpu); - u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); + u32 index = kvm_rcx_read(vcpu); if (kvm_set_xcr(vcpu, index, new_bv) == 0) return kvm_skip_emulated_instruction(vcpu); @@ -5605,15 +5610,24 @@ static void vmx_dump_dtsel(char *name, uint32_t limit) void dump_vmcs(void) { - u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); - u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); - u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); - u32 secondary_exec_control = 0; - unsigned long cr4 = vmcs_readl(GUEST_CR4); - u64 efer = vmcs_read64(GUEST_IA32_EFER); + u32 vmentry_ctl, vmexit_ctl; + u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + unsigned long cr4; + u64 efer; int i, n; + if (!dump_invalid_vmcs) { + pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); + return; + } + + vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); + vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); + cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + cr4 = vmcs_readl(GUEST_CR4); + efer = vmcs_read64(GUEST_IA32_EFER); + secondary_exec_control = 0; if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -5723,8 +5737,16 @@ void dump_vmcs(void) if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) pr_err("TSC Multiplier = 0x%016llx\n", vmcs_read64(TSC_MULTIPLIER)); - if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) - pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + u16 status = vmcs_read16(GUEST_INTR_STATUS); + pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); + } + pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); + pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); + } if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) @@ -6431,8 +6453,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); + /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); + else if (static_branch_unlikely(&mds_user_clear)) + mds_clear_cpu_buffers(); if (vcpu->arch.cr2 != read_cr2()) write_cr2(vcpu->arch.cr2); @@ -6462,9 +6487,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) current_evmcs->hv_clean_fields |= @@ -6503,7 +6525,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { - vcpu->arch.pkru = __read_pkru(); + vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vmx->host_pkru); } @@ -6671,8 +6693,8 @@ free_partial_vcpu: return ERR_PTR(err); } -#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. 
See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" -#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" static int vmx_vm_init(struct kvm *kvm) { @@ -6856,30 +6878,6 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) } } -static bool guest_cpuid_has_pmu(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *entry; - union cpuid10_eax eax; - - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) - return false; - - eax.full = entry->eax; - return (eax.split.version_id > 0); -} - -static void nested_vmx_procbased_ctls_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - bool pmu_enabled = guest_cpuid_has_pmu(vcpu); - - if (pmu_enabled) - vmx->nested.msrs.procbased_ctls_high |= CPU_BASED_RDPMC_EXITING; - else - vmx->nested.msrs.procbased_ctls_high &= ~CPU_BASED_RDPMC_EXITING; -} - static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6968,7 +6966,6 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); nested_vmx_entry_exit_ctls_update(vcpu); - nested_vmx_procbased_ctls_update(vcpu); } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && @@ -7028,10 +7025,12 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, return 0; } -static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) +static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; + struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; if (kvm_mwait_in_guest(vcpu->kvm)) return -EOPNOTSUPP; @@ -7040,7 +7039,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) tscl = rdtsc(); guest_tscl = kvm_read_l1_tsc(vcpu, tscl); delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; - lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns); + lapic_timer_advance_cycles = nsec_to_cycles(vcpu, + ktimer->timer_advance_ns); if (delta_tsc > lapic_timer_advance_cycles) delta_tsc -= lapic_timer_advance_cycles; @@ -7049,10 +7049,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && - u64_shl_div_u64(delta_tsc, + delta_tsc && u64_shl_div_u64(delta_tsc, kvm_tsc_scaling_ratio_frac_bits, - vcpu->arch.tsc_scaling_ratio, - &delta_tsc)) + vcpu->arch.tsc_scaling_ratio, &delta_tsc)) return -ERANGE; /* @@ -7065,7 +7064,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) return -ERANGE; vmx->hv_deadline_tsc = tscl + delta_tsc; - return delta_tsc == 0; + *expired = !delta_tsc; + return 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) @@ -7102,9 +7102,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx 
= to_vmx(vcpu); - gpa_t gpa; - struct page *page = NULL; - u64 *pml_address; + gpa_t gpa, dst; if (is_guest_mode(vcpu)) { WARN_ON_ONCE(vmx->nested.pml_full); @@ -7124,15 +7122,13 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); - if (is_error_page(page)) + if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, + offset_in_page(dst), sizeof(gpa))) return 0; - pml_address = kmap(page); - pml_address[vmcs12->guest_pml_index--] = gpa; - kunmap(page); - kvm_release_page_clean(page); + vmcs12->guest_pml_index--; } return 0; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f879529906b4..61128b48c503 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -142,8 +142,11 @@ struct nested_vmx { * pointers, so we must keep them pinned while L2 runs. */ struct page *apic_access_page; - struct page *virtual_apic_page; - struct page *pi_desc_page; + struct kvm_host_map virtual_apic_map; + struct kvm_host_map pi_desc_map; + + struct kvm_host_map msr_bitmap_map; + struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; @@ -169,7 +172,7 @@ struct nested_vmx { } smm; gpa_t hv_evmcs_vmptr; - struct page *hv_evmcs_page; + struct kvm_host_map hv_evmcs_map; struct hv_enlightened_vmcs *hv_evmcs; }; @@ -257,6 +260,8 @@ struct vcpu_vmx { unsigned long host_debugctlmsr; + u64 msr_ia32_power_ctl; + /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included @@ -314,6 +319,7 @@ void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); +void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a0d1fc80ac5a..acb179f78fdc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -136,10 +136,14 @@ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); -/* lapic timer advance (tscdeadline mode only) in nanoseconds */ -unsigned int __read_mostly lapic_timer_advance_ns = 1000; -module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); -EXPORT_SYMBOL_GPL(lapic_timer_advance_ns); +/* + * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables + * adaptive tuning starting from default advancment of 1000ns. '0' disables + * advancement entirely. Any other value is used as-is and disables adaptive + * tuning, i.e. allows priveleged userspace to set an exact advancement time. 
+ */ +static int __read_mostly lapic_timer_advance_ns = -1; +module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); @@ -1096,15 +1100,15 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); bool kvm_rdpmc(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + u32 ecx = kvm_rcx_read(vcpu); u64 data; int err; err = kvm_pmu_rdpmc(vcpu, ecx, &data); if (err) return err; - kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); - kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); + kvm_rax_write(vcpu, (u32)data); + kvm_rdx_write(vcpu, data >> 32); return err; } EXPORT_SYMBOL_GPL(kvm_rdpmc); @@ -1170,6 +1174,9 @@ static u32 emulated_msrs[] = { MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, + MSR_IA32_POWER_CTL, + + MSR_K7_HWCR, }; static unsigned num_emulated_msrs; @@ -1258,31 +1265,49 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return 0; } -bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) +static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & efer_reserved_bits) - return false; - if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) - return false; + return false; if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - return false; + return false; + + if (efer & (EFER_LME | EFER_LMA) && + !guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return false; + + if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) + return false; return true; + +} +bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (efer & efer_reserved_bits) + return false; + + return __kvm_valid_efer(vcpu, efer); } EXPORT_SYMBOL_GPL(kvm_valid_efer); -static int set_efer(struct kvm_vcpu *vcpu, u64 efer) +static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; + u64 efer = msr_info->data; - if (!kvm_valid_efer(vcpu, efer)) + if (efer & efer_reserved_bits) return 1; - if (is_paging(vcpu) - && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) - return 1; + if (!msr_info->host_initiated) { + if (!__kvm_valid_efer(vcpu, efer)) + return 1; + + if (is_paging(vcpu) && + (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) + return 1; + } efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; @@ -2275,6 +2300,18 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* + * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. + */ +static bool can_set_mci_status(struct kvm_vcpu *vcpu) +{ + /* McStatusWrEn enabled? 
*/ + if (guest_cpuid_is_amd(vcpu)) + return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); + + return false; +} + static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 mcg_cap = vcpu->arch.mcg_cap; @@ -2306,9 +2343,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((offset & 0x3) == 0 && data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; + + /* MCi_STATUS */ if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) - return -1; + (offset & 0x3) == 1 && data != 0) { + if (!can_set_mci_status(vcpu)) + return -1; + } + vcpu->arch.mce_banks[offset] = data; break; } @@ -2452,13 +2494,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.arch_capabilities = data; break; case MSR_EFER: - return set_efer(vcpu, data); + return set_efer(vcpu, msr_info); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ - data &= ~(u64)0x40000; /* ignore Mc status write enable */ - if (data != 0) { + + /* Handle McStatusWrEn */ + if (data == BIT_ULL(18)) { + vcpu->arch.msr_hwcr = data; + } else if (data != 0) { vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); return 1; @@ -2732,7 +2777,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K8_SYSCFG: case MSR_K8_TSEG_ADDR: case MSR_K8_TSEG_MASK: - case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: @@ -2896,6 +2940,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_MISC_FEATURES_ENABLES: msr_info->data = vcpu->arch.msr_misc_features_enables; break; + case MSR_K7_HWCR: + msr_info->data = vcpu->arch.msr_hwcr; + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -3075,9 +3122,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; @@ -3677,15 +3721,15 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; - int index = fls64(feature) - 1; - void *src = get_xsave_addr(xsave, feature); + u64 xfeature_mask = valid & -valid; + int xfeature_nr = fls64(xfeature_mask) - 1; + void *src = get_xsave_addr(xsave, xfeature_nr); if (src) { u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, index, + cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); - if (feature == XFEATURE_MASK_PKRU) + if (xfeature_nr == XFEATURE_PKRU) memcpy(dest + offset, &vcpu->arch.pkru, sizeof(vcpu->arch.pkru)); else @@ -3693,7 +3737,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) } - valid -= feature; + valid -= xfeature_mask; } } @@ -3720,22 +3764,22 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; - int index = fls64(feature) - 1; - void *dest = get_xsave_addr(xsave, feature); + u64 xfeature_mask = valid & -valid; + int xfeature_nr = fls64(xfeature_mask) - 1; + void *dest = get_xsave_addr(xsave, xfeature_nr); if (dest) { u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, index, + cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); - if (feature == 
XFEATURE_MASK_PKRU) + if (xfeature_nr == XFEATURE_PKRU) memcpy(&vcpu->arch.pkru, src + offset, sizeof(vcpu->arch.pkru)); else memcpy(dest, src + offset, size); } - valid -= feature; + valid -= xfeature_mask; } } @@ -5517,9 +5561,9 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned int bytes, struct x86_exception *exception) { + struct kvm_host_map map; struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; - struct page *page; char *kaddr; bool exchanged; @@ -5536,12 +5580,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) goto emul_write; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) goto emul_write; - kaddr = kmap_atomic(page); - kaddr += offset_in_page(gpa); + kaddr = map.hva + offset_in_page(gpa); + switch (bytes) { case 1: exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); @@ -5558,13 +5601,12 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, default: BUG(); } - kunmap_atomic(kaddr); - kvm_release_page_dirty(page); + + kvm_vcpu_unmap(vcpu, &map, true); if (!exchanged) return X86EMUL_CMPXCHG_FAILED; - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); kvm_page_track_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -6535,6 +6577,12 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); +static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pio.count = 0; + return 1; +} + static int complete_fast_pio_out(struct kvm_vcpu *vcpu) { vcpu->arch.pio.count = 0; @@ -6548,15 +6596,26 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu) static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { - unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); + unsigned long val = kvm_rax_read(vcpu); int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, size, port, &val, 1); + if (ret) + return ret; - if (!ret) { + /* + * Workaround userspace that relies on old KVM behavior of %rip being + * incremented prior to exiting to userspace to handle "OUT 0x7e". + */ + if (port == 0x7e && + kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) { + vcpu->arch.complete_userspace_io = + complete_fast_pio_out_port_0x7e; + kvm_skip_emulated_instruction(vcpu); + } else { vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_out; } - return ret; + return 0; } static int complete_fast_pio_in(struct kvm_vcpu *vcpu) @@ -6572,8 +6631,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) } /* For size less than 4 we merge, else we zero extend */ - val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) - : 0; + val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; /* * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform @@ -6581,7 +6639,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) */ emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); - kvm_register_write(vcpu, VCPU_REGS_RAX, val); + kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); } @@ -6593,12 +6651,12 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, int ret; /* For size less than 4 we merge, else we zero extend */ - val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0; + val = (size < 4) ? 
kvm_rax_read(vcpu) : 0; ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, &val, 1); if (ret) { - kvm_register_write(vcpu, VCPU_REGS_RAX, val); + kvm_rax_write(vcpu, val); return ret; } @@ -6677,10 +6735,8 @@ static void kvm_hyperv_tsc_notifier(void) } #endif -static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) +static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) { - struct cpufreq_freqs *freq = data; struct kvm *kvm; struct kvm_vcpu *vcpu; int i, send_ipi = 0; @@ -6724,17 +6780,12 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va * */ - if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) - return 0; - if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) - return 0; - - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + smp_call_function_single(cpu, tsc_khz_changed, freq, 1); spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->cpu != freq->cpu) + if (vcpu->cpu != cpu) continue; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != smp_processor_id()) @@ -6756,8 +6807,24 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va * guest context is entered kvmclock will be updated, * so the guest will not see stale values. */ - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + smp_call_function_single(cpu, tsc_khz_changed, freq, 1); } +} + +static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + int cpu; + + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) + return 0; + if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) + return 0; + + for_each_cpu(cpu, freq->policy->cpus) + __kvmclock_cpufreq_notifier(freq, cpu); + return 0; } @@ -6824,10 +6891,20 @@ static unsigned long kvm_get_guest_ip(void) return ip; } +static void kvm_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); +} + static struct perf_guest_info_callbacks kvm_guest_cbs = { .is_in_guest = kvm_is_in_guest, .is_user_mode = kvm_is_user_mode, .get_guest_ip = kvm_get_guest_ip, + .handle_intel_pt_intr = kvm_handle_intel_pt_intr, }; static void kvm_set_mmio_spte_mask(void) @@ -7103,11 +7180,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); - nr = kvm_register_read(vcpu, VCPU_REGS_RAX); - a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); - a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); - a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); - a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); + nr = kvm_rax_read(vcpu); + a0 = kvm_rbx_read(vcpu); + a1 = kvm_rcx_read(vcpu); + a2 = kvm_rdx_read(vcpu); + a3 = kvm_rsi_read(vcpu); trace_kvm_hypercall(nr, a0, a1, a2, a3); @@ -7148,7 +7225,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) out: if (!op_64_bit) ret = (u32)ret; - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + kvm_rax_write(vcpu, ret); ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); @@ -7873,10 +7950,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); - if (lapic_timer_advance_ns) + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.timer_advance_ns) wait_lapic_expire(vcpu); 
guest_enter_irqoff(); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); @@ -8135,22 +8217,30 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); + fpregs_lock(); + copy_fpregs_to_fpstate(¤t->thread.fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); - preempt_enable(); + + fpregs_mark_activate(); + fpregs_unlock(); + trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); + fpregs_lock(); + copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); copy_kernel_to_fpregs(¤t->thread.fpu.state); - preempt_enable(); + + fpregs_mark_activate(); + fpregs_unlock(); + ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -8237,23 +8327,23 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); - regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); - regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); - regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); - regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); - regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); + regs->rax = kvm_rax_read(vcpu); + regs->rbx = kvm_rbx_read(vcpu); + regs->rcx = kvm_rcx_read(vcpu); + regs->rdx = kvm_rdx_read(vcpu); + regs->rsi = kvm_rsi_read(vcpu); + regs->rdi = kvm_rdi_read(vcpu); + regs->rsp = kvm_rsp_read(vcpu); + regs->rbp = kvm_rbp_read(vcpu); #ifdef CONFIG_X86_64 - regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); - regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); - regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); - regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); - regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); - regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); - regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); - regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); + regs->r8 = kvm_r8_read(vcpu); + regs->r9 = kvm_r9_read(vcpu); + regs->r10 = kvm_r10_read(vcpu); + regs->r11 = kvm_r11_read(vcpu); + regs->r12 = kvm_r12_read(vcpu); + regs->r13 = kvm_r13_read(vcpu); + regs->r14 = kvm_r14_read(vcpu); + regs->r15 = kvm_r15_read(vcpu); #endif regs->rip = kvm_rip_read(vcpu); @@ -8273,23 +8363,23 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.emulate_regs_need_sync_from_vcpu = true; vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); - kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); - kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); - kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); - kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); - kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); - kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); - kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); + kvm_rax_write(vcpu, regs->rax); + kvm_rbx_write(vcpu, regs->rbx); + kvm_rcx_write(vcpu, regs->rcx); + kvm_rdx_write(vcpu, regs->rdx); + kvm_rsi_write(vcpu, regs->rsi); + 
kvm_rdi_write(vcpu, regs->rdi); + kvm_rsp_write(vcpu, regs->rsp); + kvm_rbp_write(vcpu, regs->rbp); #ifdef CONFIG_X86_64 - kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); - kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); - kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); - kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); - kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); - kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); - kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); - kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); + kvm_r8_write(vcpu, regs->r8); + kvm_r9_write(vcpu, regs->r9); + kvm_r10_write(vcpu, regs->r10); + kvm_r11_write(vcpu, regs->r11); + kvm_r12_write(vcpu, regs->r12); + kvm_r13_write(vcpu, regs->r13); + kvm_r14_write(vcpu, regs->r14); + kvm_r15_write(vcpu, regs->r15); #endif kvm_rip_write(vcpu, regs->rip); @@ -8848,11 +8938,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (init_event) kvm_put_guest_fpu(vcpu); mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_MASK_BNDREGS); + XFEATURE_BNDREGS); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_MASK_BNDCSR); + XFEATURE_BNDCSR); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); if (init_event) @@ -9061,7 +9151,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); - r = kvm_create_lapic(vcpu); + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; } else diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index aedc5d0d4989..a470ff0868c5 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -294,8 +294,6 @@ extern u64 kvm_supported_xcr0(void); extern unsigned int min_timer_period_us; -extern unsigned int lapic_timer_advance_ns; - extern bool enable_vmware_backdoor; extern struct static_key kvm_no_apic_vcpu; @@ -347,6 +345,16 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) __this_cpu_write(current_vcpu, NULL); } + +static inline bool kvm_pat_valid(u64 data) +{ + if (data & 0xF8F8F8F8F8F8F8F8ull) + return false; + /* 0, 1, 4, 5, 6, 7 are valid values. */ + return (data | ((data & 0x0202020202020202ull) << 1)) == data; +} + void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); + #endif |
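
The arch/x86/kvm/x86.h hunk above adds kvm_pat_valid(), and the vmx.c MSR_IA32_CR_PAT hunk switches to it from kvm_mtrr_valid(). Each of the eight PAT entries is a 3-bit memory type held in the low bits of one byte, and only types 0, 1, 4, 5, 6 and 7 are valid (2 and 3 are reserved). What follows is a minimal standalone sketch of that bit trick; the test harness around it is illustrative and not taken from the kernel.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same check as the kvm_pat_valid() helper added to arch/x86/kvm/x86.h. */
static bool pat_valid(uint64_t data)
{
	/* Bits 3-7 of every PAT entry byte must be zero. */
	if (data & 0xF8F8F8F8F8F8F8F8ull)
		return false;
	/*
	 * Types 2 and 3 are the only remaining values with bit 1 set but
	 * bit 2 clear; OR-ing bit 1 of each byte into bit 2 changes the
	 * value exactly for those encodings, so the comparison rejects them.
	 */
	return (data | ((data & 0x0202020202020202ull) << 1)) == data;
}

int main(void)
{
	/* Exercise each possible 3-bit memory type in the lowest PAT entry. */
	for (uint64_t type = 0; type < 8; type++)
		printf("PAT type %llu: %s\n", (unsigned long long)type,
		       pat_valid(type) ? "valid" : "reserved");
	return 0;
}

The loop accepts exactly the six architectural memory types; any byte holding 2 or 3, or with bits above bit 2 set, is rejected, matching the comment in the hunk.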
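
The MSR_K7_HWCR hunks in arch/x86/kvm/x86.c above let the guest set only bit 18 (McStatusWrEn), and set_msr_mce() then accepts a non-zero guest write to an MCi_STATUS bank register only while that bit is set on an AMD guest. Below is a minimal standalone sketch of that gating; struct vcpu and the main() harness are illustrative stand-ins, not kernel code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HWCR_MC_STATUS_WR_EN (1ULL << 18)	/* McStatusWrEn, as in the diff */

/* Illustrative stand-in for the relevant parts of struct kvm_vcpu. */
struct vcpu {
	bool	 is_amd;	/* guest_cpuid_is_amd() in the kernel */
	uint64_t msr_hwcr;
};

/* Mirrors can_set_mci_status() from the x86.c hunk above. */
static bool can_set_mci_status(const struct vcpu *vcpu)
{
	if (vcpu->is_amd)
		return vcpu->msr_hwcr & HWCR_MC_STATUS_WR_EN;
	return false;
}

/* Guest (non host-initiated) write to an MCi_STATUS bank register. */
static bool mci_status_write_ok(const struct vcpu *vcpu, uint64_t data)
{
	return data == 0 || can_set_mci_status(vcpu);
}

int main(void)
{
	struct vcpu vcpu = { .is_amd = true, .msr_hwcr = 0 };

	printf("write 0x1, McStatusWrEn clear: %d\n",
	       mci_status_write_ok(&vcpu, 0x1));	/* rejected */
	vcpu.msr_hwcr = HWCR_MC_STATUS_WR_EN;
	printf("write 0x1, McStatusWrEn set:   %d\n",
	       mci_status_write_ok(&vcpu, 0x1));	/* accepted */
	return 0;
}

The same ordering applies in the kernel: the HWCR write lands first and stores bit 18 in vcpu->arch.msr_hwcr, after which the MCE bank write path consults can_set_mci_status().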
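
The arch/x86/kvm/vmx/pmu_intel.c hunks above mask counter reads with counter_bitmask[KVM_PMC_GP] or counter_bitmask[KVM_PMC_FIXED] and make a guest write to a general-purpose counter sign-extend bits 31:0, while a host-initiated write stores the full value. A minimal standalone sketch of that read/write behaviour for one GP counter follows; the 48-bit width is an assumption for illustration only (KVM derives the real width from CPUID.0xA).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GP_COUNTER_BITS 48	/* assumed width; CPUID.0xA.EAX[23:16] in KVM */

static const uint64_t counter_mask = (1ULL << GP_COUNTER_BITS) - 1;

/* Read path: the raw value is masked to the counter width, as in the diff. */
static uint64_t pmc_read(uint64_t raw)
{
	return raw & counter_mask;
}

/*
 * Write path: a guest WRMSR sign-extends bits 31:0, a host-initiated write
 * (e.g. state restore after migration) stores the value unchanged.
 */
static uint64_t pmc_write(uint64_t data, bool host_initiated)
{
	return host_initiated ? data : (uint64_t)(int64_t)(int32_t)data;
}

int main(void)
{
	uint64_t raw = pmc_write(0xFFFFFFF0u, false);	/* guest writes -16 */

	printf("stored: 0x%016llx\n", (unsigned long long)raw);
	printf("RDMSR:  0x%016llx\n", (unsigned long long)pmc_read(raw));
	return 0;
}

Running the sketch shows the guest's 32-bit write of -16 being stored sign-extended and then read back truncated to the assumed 48-bit counter width.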