summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c23
-rw-r--r--arch/x86/kvm/debugfs.c18
-rw-r--r--arch/x86/kvm/hyperv.c35
-rw-r--r--arch/x86/kvm/irq.c7
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h42
-rw-r--r--arch/x86/kvm/lapic.c111
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.c24
-rw-r--r--arch/x86/kvm/mtrr.c10
-rw-r--r--arch/x86/kvm/paging_tmpl.h40
-rw-r--r--arch/x86/kvm/pmu.c10
-rw-r--r--arch/x86/kvm/pmu.h3
-rw-r--r--arch/x86/kvm/pmu_amd.c4
-rw-r--r--arch/x86/kvm/svm.c145
-rw-r--r--arch/x86/kvm/vmx/capabilities.h2
-rw-r--r--arch/x86/kvm/vmx/nested.c365
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c34
-rw-r--r--arch/x86/kvm/vmx/vmenter.S12
-rw-r--r--arch/x86/kvm/vmx/vmx.c132
-rw-r--r--arch/x86/kvm/vmx/vmx.h12
-rw-r--r--arch/x86/kvm/x86.c314
-rw-r--r--arch/x86/kvm/x86.h12
24 files changed, 801 insertions, 560 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 72fa955f4a15..fc042419e670 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,7 +27,6 @@ config KVM
depends on X86_LOCAL_APIC
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
- select ANON_INODES
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
select IRQ_BYPASS_MANAGER
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index fd3951638ae4..e18a9f9f65b5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -410,7 +410,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
- F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP);
+ F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
+ F(MD_CLEAR);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -455,8 +456,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
- /* function 4 has additional index. */
- case 4: {
+ /* functions 4 and 0x8000001d have additional index. */
+ case 4:
+ case 0x8000001d: {
int i, cache_type;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
@@ -700,8 +702,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx = entry->edx = 0;
break;
case 0x8000001a:
- break;
- case 0x8000001d:
+ case 0x8000001e:
break;
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
@@ -962,13 +963,13 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
return 1;
- eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ eax = kvm_rax_read(vcpu);
+ ecx = kvm_rcx_read(vcpu);
kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true);
- kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
- kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
- kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
+ kvm_rax_write(vcpu, eax);
+ kvm_rbx_write(vcpu, ebx);
+ kvm_rcx_write(vcpu, ecx);
+ kvm_rdx_write(vcpu, edx);
return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index c19c7ede9bd6..a2f3432ce090 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -9,12 +9,22 @@
*/
#include <linux/kvm_host.h>
#include <linux/debugfs.h>
+#include "lapic.h"
bool kvm_arch_has_vcpu_debugfs(void)
{
return true;
}
+static int vcpu_get_timer_advance_ns(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n");
+
static int vcpu_get_tsc_offset(void *data, u64 *val)
{
struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
@@ -51,6 +61,14 @@ int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
if (!ret)
return -ENOMEM;
+ if (lapic_in_kernel(vcpu)) {
+ ret = debugfs_create_file("lapic_timer_advance_ns", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_timer_advance_ns_fops);
+ if (!ret)
+ return -ENOMEM;
+ }
+
if (kvm_has_tsc_control) {
ret = debugfs_create_file("tsc-scaling-ratio", 0444,
vcpu->debugfs_dentry,
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 421899f6ad7b..8ca4b39918e0 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1371,7 +1371,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
valid_bank_mask = BIT_ULL(0);
sparse_banks[0] = flush.processor_mask;
- all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
+
+ /*
+ * Work around possible WS2012 bug: it sends hypercalls
+ * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
+ * while also expecting us to flush something and crashing if
+ * we don't. Let's treat processor_mask == 0 same as
+ * HV_FLUSH_ALL_PROCESSORS.
+ */
+ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
+ flush.processor_mask == 0;
} else {
if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex,
sizeof(flush_ex))))
@@ -1526,10 +1535,10 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
longmode = is_64_bit_mode(vcpu);
if (longmode)
- kvm_register_write(vcpu, VCPU_REGS_RAX, result);
+ kvm_rax_write(vcpu, result);
else {
- kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32);
- kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff);
+ kvm_rdx_write(vcpu, result >> 32);
+ kvm_rax_write(vcpu, result & 0xffffffff);
}
}
@@ -1602,18 +1611,18 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
longmode = is_64_bit_mode(vcpu);
if (!longmode) {
- param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
- (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
- ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
- (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
- outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
- (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
+ param = ((u64)kvm_rdx_read(vcpu) << 32) |
+ (kvm_rax_read(vcpu) & 0xffffffff);
+ ingpa = ((u64)kvm_rbx_read(vcpu) << 32) |
+ (kvm_rcx_read(vcpu) & 0xffffffff);
+ outgpa = ((u64)kvm_rdi_read(vcpu) << 32) |
+ (kvm_rsi_read(vcpu) & 0xffffffff);
}
#ifdef CONFIG_X86_64
else {
- param = kvm_register_read(vcpu, VCPU_REGS_RCX);
- ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
- outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
+ param = kvm_rcx_read(vcpu);
+ ingpa = kvm_rdx_read(vcpu);
+ outgpa = kvm_r8_read(vcpu);
}
#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index faa264822cee..007bc654f928 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -172,3 +172,10 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
__kvm_migrate_apic_timer(vcpu);
__kvm_migrate_pit_timer(vcpu);
}
+
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
+{
+ bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
+
+ return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index d5005cc26521..fd210cdd4983 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -114,6 +114,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
return mode != KVM_IRQCHIP_NONE;
}
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index f8f56a93358b..1cc6c47dc77e 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -9,6 +9,34 @@
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE)
+#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
+static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
+{ \
+ return vcpu->arch.regs[VCPU_REGS_##uname]; \
+} \
+static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \
+ unsigned long val) \
+{ \
+ vcpu->arch.regs[VCPU_REGS_##uname] = val; \
+}
+BUILD_KVM_GPR_ACCESSORS(rax, RAX)
+BUILD_KVM_GPR_ACCESSORS(rbx, RBX)
+BUILD_KVM_GPR_ACCESSORS(rcx, RCX)
+BUILD_KVM_GPR_ACCESSORS(rdx, RDX)
+BUILD_KVM_GPR_ACCESSORS(rbp, RBP)
+BUILD_KVM_GPR_ACCESSORS(rsi, RSI)
+BUILD_KVM_GPR_ACCESSORS(rdi, RDI)
+#ifdef CONFIG_X86_64
+BUILD_KVM_GPR_ACCESSORS(r8, R8)
+BUILD_KVM_GPR_ACCESSORS(r9, R9)
+BUILD_KVM_GPR_ACCESSORS(r10, R10)
+BUILD_KVM_GPR_ACCESSORS(r11, R11)
+BUILD_KVM_GPR_ACCESSORS(r12, R12)
+BUILD_KVM_GPR_ACCESSORS(r13, R13)
+BUILD_KVM_GPR_ACCESSORS(r14, R14)
+BUILD_KVM_GPR_ACCESSORS(r15, R15)
+#endif
+
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
@@ -37,6 +65,16 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
kvm_register_write(vcpu, VCPU_REGS_RIP, val);
}
+static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read(vcpu, VCPU_REGS_RSP);
+}
+
+static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write(vcpu, VCPU_REGS_RSP, val);
+}
+
static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
might_sleep(); /* on svm */
@@ -83,8 +121,8 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
{
- return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
- | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+ return (kvm_rax_read(vcpu) & -1u)
+ | ((u64)(kvm_rdx_read(vcpu) & -1u) << 32);
}
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9bf70cf84564..4924f83ed4f3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -70,7 +70,6 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
-static bool lapic_timer_advance_adjust_done = false;
#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -1455,7 +1454,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
if (swait_active(q))
swake_up_one(q);
- if (apic_lvtt_tscdeadline(apic))
+ if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
ktimer->expired_tscdeadline = ktimer->tscdeadline;
}
@@ -1482,14 +1481,32 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
return false;
}
+static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
+{
+ u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+
+ /*
+ * If the guest TSC is running at a different ratio than the host, then
+ * convert the delay to nanoseconds to achieve an accurate delay. Note
+ * that __delay() uses delay_tsc whenever the hardware has TSC, thus
+ * always for VMX enabled hardware.
+ */
+ if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) {
+ __delay(min(guest_cycles,
+ nsec_to_cycles(vcpu, timer_advance_ns)));
+ } else {
+ u64 delay_ns = guest_cycles * 1000000ULL;
+ do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
+ ndelay(min_t(u32, delay_ns, timer_advance_ns));
+ }
+}
+
void wait_lapic_expire(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
u64 guest_tsc, tsc_deadline, ns;
- if (!lapic_in_kernel(vcpu))
- return;
-
if (apic->lapic_timer.expired_tscdeadline == 0)
return;
@@ -1501,33 +1518,37 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
- /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
if (guest_tsc < tsc_deadline)
- __delay(min(tsc_deadline - guest_tsc,
- nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
+ __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
- if (!lapic_timer_advance_adjust_done) {
+ if (!apic->lapic_timer.timer_advance_adjust_done) {
/* too early */
if (guest_tsc < tsc_deadline) {
ns = (tsc_deadline - guest_tsc) * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- lapic_timer_advance_ns -= min((unsigned int)ns,
- lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns -= min((u32)ns,
+ timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
} else {
/* too late */
ns = (guest_tsc - tsc_deadline) * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- lapic_timer_advance_ns += min((unsigned int)ns,
- lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns += min((u32)ns,
+ timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
}
if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- lapic_timer_advance_adjust_done = true;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ if (unlikely(timer_advance_ns > 5000)) {
+ timer_advance_ns = 0;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ }
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}
}
static void start_sw_tscdeadline(struct kvm_lapic *apic)
{
- u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
u64 ns = 0;
ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -1542,13 +1563,15 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
now = ktime_get();
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- if (likely(tscdeadline > guest_tsc)) {
- ns = (tscdeadline - guest_tsc) * 1000000ULL;
- do_div(ns, this_tsc_khz);
+
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+
+ if (likely(tscdeadline > guest_tsc) &&
+ likely(ns > apic->lapic_timer.timer_advance_ns)) {
expire = ktime_add_ns(now, ns);
- expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
- hrtimer_start(&apic->lapic_timer.timer,
- expire, HRTIMER_MODE_ABS_PINNED);
+ expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
+ hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED);
} else
apic_timer_expired(apic);
@@ -1673,37 +1696,42 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
static bool start_hv_timer(struct kvm_lapic *apic)
{
struct kvm_timer *ktimer = &apic->lapic_timer;
- int r;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ bool expired;
WARN_ON(preemptible());
if (!kvm_x86_ops->set_hv_timer)
return false;
- if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
- return false;
-
if (!ktimer->tscdeadline)
return false;
- r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
- if (r < 0)
+ if (kvm_x86_ops->set_hv_timer(vcpu, ktimer->tscdeadline, &expired))
return false;
ktimer->hv_timer_in_use = true;
hrtimer_cancel(&ktimer->timer);
/*
- * Also recheck ktimer->pending, in case the sw timer triggered in
- * the window. For periodic timer, leave the hv timer running for
- * simplicity, and the deadline will be recomputed on the next vmexit.
+ * To simplify handling the periodic timer, leave the hv timer running
+ * even if the deadline timer has expired, i.e. rely on the resulting
+ * VM-Exit to recompute the periodic timer's target expiration.
*/
- if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) {
- if (r)
+ if (!apic_lvtt_period(apic)) {
+ /*
+ * Cancel the hv timer if the sw timer fired while the hv timer
+ * was being programmed, or if the hv timer itself expired.
+ */
+ if (atomic_read(&ktimer->pending)) {
+ cancel_hv_timer(apic);
+ } else if (expired) {
apic_timer_expired(apic);
- return false;
+ cancel_hv_timer(apic);
+ }
}
- trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true);
+ trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use);
+
return true;
}
@@ -1727,8 +1755,13 @@ static void start_sw_timer(struct kvm_lapic *apic)
static void restart_apic_timer(struct kvm_lapic *apic)
{
preempt_disable();
+
+ if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending))
+ goto out;
+
if (!start_hv_timer(apic))
start_sw_timer(apic);
+out:
preempt_enable();
}
@@ -2255,7 +2288,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
return HRTIMER_NORESTART;
}
-int kvm_create_lapic(struct kvm_vcpu *vcpu)
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
{
struct kvm_lapic *apic;
@@ -2279,6 +2312,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_PINNED);
apic->lapic_timer.timer.function = apic_timer_fn;
+ if (timer_advance_ns == -1) {
+ apic->lapic_timer.timer_advance_ns = 1000;
+ apic->lapic_timer.timer_advance_adjust_done = false;
+ } else {
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ }
+
/*
* APIC is created enabled. This will prevent kvm_lapic_set_base from
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ff6ef9c3d760..d6d049ba3045 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,8 +31,10 @@ struct kvm_timer {
u32 timer_mode_mask;
u64 tscdeadline;
u64 expired_tscdeadline;
+ u32 timer_advance_ns;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
+ bool timer_advance_adjust_done;
};
struct kvm_lapic {
@@ -62,7 +64,7 @@ struct kvm_lapic {
struct dest_map;
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e10962dfc203..1e9ba81accba 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -44,6 +44,7 @@
#include <asm/page.h>
#include <asm/pat.h>
#include <asm/cmpxchg.h>
+#include <asm/e820/api.h>
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
@@ -487,16 +488,24 @@ static void kvm_mmu_reset_all_pte_masks(void)
* If the CPU has 46 or less physical address bits, then set an
* appropriate mask to guard against L1TF attacks. Otherwise, it is
* assumed that the CPU is not vulnerable to L1TF.
+ *
+ * Some Intel CPUs address the L1 cache using more PA bits than are
+ * reported by CPUID. Use the PA width of the L1 cache when possible
+ * to achieve more effective mitigation, e.g. if system RAM overlaps
+ * the most significant bits of legal physical address space.
*/
- low_phys_bits = boot_cpu_data.x86_phys_bits;
- if (boot_cpu_data.x86_phys_bits <
+ shadow_nonpresent_or_rsvd_mask = 0;
+ low_phys_bits = boot_cpu_data.x86_cache_bits;
+ if (boot_cpu_data.x86_cache_bits <
52 - shadow_nonpresent_or_rsvd_mask_len) {
shadow_nonpresent_or_rsvd_mask =
- rsvd_bits(boot_cpu_data.x86_phys_bits -
+ rsvd_bits(boot_cpu_data.x86_cache_bits -
shadow_nonpresent_or_rsvd_mask_len,
- boot_cpu_data.x86_phys_bits - 1);
+ boot_cpu_data.x86_cache_bits - 1);
low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
- }
+ } else
+ WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));
+
shadow_nonpresent_or_rsvd_lower_gfn_mask =
GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
}
@@ -2892,7 +2901,9 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
*/
(!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
- return true;
+ return !e820__mapped_raw_any(pfn_to_hpa(pfn),
+ pfn_to_hpa(pfn + 1) - 1,
+ E820_TYPE_RAM);
}
/* Bits which may be returned by set_spte() */
@@ -4781,6 +4792,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
union kvm_mmu_extended_role ext = {0};
ext.cr0_pg = !!is_paging(vcpu);
+ ext.cr4_pae = !!is_pae(vcpu);
ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
ext.cr4_pse = !!is_pse(vcpu);
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index e9ea2d45ae66..9f72cc427158 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -48,11 +48,6 @@ static bool msr_mtrr_valid(unsigned msr)
return false;
}
-static bool valid_pat_type(unsigned t)
-{
- return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
-}
-
static bool valid_mtrr_type(unsigned t)
{
return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
@@ -67,10 +62,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return false;
if (msr == MSR_IA32_CR_PAT) {
- for (i = 0; i < 8; i++)
- if (!valid_pat_type((data >> (i * 8)) & 0xff))
- return false;
- return true;
+ return kvm_pat_valid(data);
} else if (msr == MSR_MTRRdefType) {
if (data & ~0xcff)
return false;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bdca39829bc..367a47df4ba0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -140,16 +140,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t *table;
struct page *page;
- npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
- /* Check if the user is doing something meaningless. */
- if (unlikely(npages != 1))
- return -EFAULT;
-
- table = kmap_atomic(page);
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- kunmap_atomic(table);
-
- kvm_release_page_dirty(page);
+ npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
+ if (likely(npages == 1)) {
+ table = kmap_atomic(page);
+ ret = CMPXCHG(&table[index], orig_pte, new_pte);
+ kunmap_atomic(table);
+
+ kvm_release_page_dirty(page);
+ } else {
+ struct vm_area_struct *vma;
+ unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
+ unsigned long pfn;
+ unsigned long paddr;
+
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE);
+ if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
+ up_read(&current->mm->mmap_sem);
+ return -EFAULT;
+ }
+ pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ paddr = pfn << PAGE_SHIFT;
+ table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
+ if (!table) {
+ up_read(&current->mm->mmap_sem);
+ return -EFAULT;
+ }
+ ret = CMPXCHG(&table[index], orig_pte, new_pte);
+ memunmap(table);
+ up_read(&current->mm->mmap_sem);
+ }
return (ret != orig_pte);
}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index e39741997893..dd745b58ffd8 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -283,7 +283,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
bool fast_mode = idx & (1u << 31);
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
- u64 ctr_val;
+ u64 mask = fast_mode ? ~0u : ~0ull;
if (!pmu->version)
return 1;
@@ -291,15 +291,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
- pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx);
+ pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask);
if (!pmc)
return 1;
- ctr_val = pmc_read_counter(pmc);
- if (fast_mode)
- ctr_val = (u32)ctr_val;
-
- *data = ctr_val;
+ *data = pmc_read_counter(pmc) & mask;
return 0;
}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index ba8898e1a854..22dff661145a 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -25,7 +25,8 @@ struct kvm_pmu_ops {
unsigned (*find_fixed_event)(int idx);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
- struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx);
+ struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx,
+ u64 *mask);
int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx);
bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index 1495a735b38e..d3118088f1cd 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -186,7 +186,7 @@ static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
}
/* idx is the ECX register of RDPMC instruction */
-static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx)
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *mask)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *counters;
@@ -269,10 +269,10 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->version = 1;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->nr_arch_fixed_counters = 0;
- pmu->version = 0;
pmu->global_status = 0;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 406b558abfef..735b8c01895e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -379,6 +379,9 @@ module_param(vgif, int, 0444);
static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
module_param(sev, int, 0444);
+static bool __read_mostly dump_invalid_vmcb = 0;
+module_param(dump_invalid_vmcb, bool, 0644);
+
static u8 rsm_ins_bytes[] = "\x0f\xaa";
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
@@ -1805,7 +1808,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
return NULL;
/* Pin the user virtual address. */
- npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
goto err;
@@ -2024,7 +2027,11 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
- if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
+ /*
+ * Since the host physical APIC id is 8 bits,
+ * we can support host APIC ID upto 255.
+ */
+ if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
@@ -2091,7 +2098,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
init_vmcb(svm);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
- kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
+ kvm_rdx_write(vcpu, eax);
if (kvm_vcpu_apicv_active(vcpu) && !init_event)
avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
@@ -3071,32 +3078,6 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm)
return false;
}
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
-{
- struct page *page;
-
- might_sleep();
-
- page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
- if (is_error_page(page))
- goto error;
-
- *_page = page;
-
- return kmap(page);
-
-error:
- kvm_inject_gp(&svm->vcpu, 0);
-
- return NULL;
-}
-
-static void nested_svm_unmap(struct page *page)
-{
- kunmap(page);
- kvm_release_page_dirty(page);
-}
-
static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
unsigned port, size, iopm_len;
@@ -3299,10 +3280,11 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
static int nested_svm_vmexit(struct vcpu_svm *svm)
{
+ int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
- struct page *page;
+ struct kvm_host_map map;
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
vmcb->control.exit_info_1,
@@ -3311,9 +3293,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb->control.exit_int_info_err,
KVM_ISA_SVM);
- nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
- if (!nested_vmcb)
+ rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map);
+ if (rc) {
+ if (rc == -EINVAL)
+ kvm_inject_gp(&svm->vcpu, 0);
return 1;
+ }
+
+ nested_vmcb = map.hva;
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
@@ -3408,16 +3395,16 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
} else {
(void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
}
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
+ kvm_rax_write(&svm->vcpu, hsave->save.rax);
+ kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
+ kvm_rip_write(&svm->vcpu, hsave->save.rip);
svm->vmcb->save.dr7 = 0;
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
mark_all_dirty(svm->vmcb);
- nested_svm_unmap(page);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
nested_svm_uninit_mmu_context(&svm->vcpu);
kvm_mmu_reset_context(&svm->vcpu);
@@ -3483,7 +3470,7 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
}
static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
- struct vmcb *nested_vmcb, struct page *page)
+ struct vmcb *nested_vmcb, struct kvm_host_map *map)
{
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -3516,9 +3503,9 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
kvm_mmu_reset_context(&svm->vcpu);
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+ kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
+ kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
+ kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
svm->vmcb->save.rax = nested_vmcb->save.rax;
@@ -3567,7 +3554,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
svm->vmcb->control.pause_filter_thresh =
nested_vmcb->control.pause_filter_thresh;
- nested_svm_unmap(page);
+ kvm_vcpu_unmap(&svm->vcpu, map, true);
/* Enter Guest-Mode */
enter_guest_mode(&svm->vcpu);
@@ -3587,17 +3574,23 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
static bool nested_svm_vmrun(struct vcpu_svm *svm)
{
+ int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
- struct page *page;
+ struct kvm_host_map map;
u64 vmcb_gpa;
vmcb_gpa = svm->vmcb->save.rax;
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
+ rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map);
+ if (rc) {
+ if (rc == -EINVAL)
+ kvm_inject_gp(&svm->vcpu, 0);
return false;
+ }
+
+ nested_vmcb = map.hva;
if (!nested_vmcb_checks(nested_vmcb)) {
nested_vmcb->control.exit_code = SVM_EXIT_ERR;
@@ -3605,7 +3598,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_1 = 0;
nested_vmcb->control.exit_info_2 = 0;
- nested_svm_unmap(page);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
return false;
}
@@ -3649,7 +3642,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
copy_vmcb_control_area(hsave, vmcb);
- enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
+ enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
return true;
}
@@ -3673,21 +3666,26 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
static int vmload_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
- struct page *page;
+ struct kvm_host_map map;
int ret;
if (nested_svm_check_permissions(svm))
return 1;
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
+ ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ if (ret) {
+ if (ret == -EINVAL)
+ kvm_inject_gp(&svm->vcpu, 0);
return 1;
+ }
+
+ nested_vmcb = map.hva;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
ret = kvm_skip_emulated_instruction(&svm->vcpu);
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- nested_svm_unmap(page);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
return ret;
}
@@ -3695,21 +3693,26 @@ static int vmload_interception(struct vcpu_svm *svm)
static int vmsave_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
- struct page *page;
+ struct kvm_host_map map;
int ret;
if (nested_svm_check_permissions(svm))
return 1;
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
+ ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ if (ret) {
+ if (ret == -EINVAL)
+ kvm_inject_gp(&svm->vcpu, 0);
return 1;
+ }
+
+ nested_vmcb = map.hva;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
ret = kvm_skip_emulated_instruction(&svm->vcpu);
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- nested_svm_unmap(page);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
return ret;
}
@@ -3791,11 +3794,11 @@ static int invlpga_interception(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
- kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+ trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
+ kvm_rax_read(&svm->vcpu));
/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
- kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+ kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
return kvm_skip_emulated_instruction(&svm->vcpu);
@@ -3803,7 +3806,7 @@ static int invlpga_interception(struct vcpu_svm *svm)
static int skinit_interception(struct vcpu_svm *svm)
{
- trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+ trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -3817,7 +3820,7 @@ static int wbinvd_interception(struct vcpu_svm *svm)
static int xsetbv_interception(struct vcpu_svm *svm)
{
u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
- u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+ u32 index = kvm_rcx_read(&svm->vcpu);
if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
@@ -4213,7 +4216,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static int rdmsr_interception(struct vcpu_svm *svm)
{
- u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+ u32 ecx = kvm_rcx_read(&svm->vcpu);
struct msr_data msr_info;
msr_info.index = ecx;
@@ -4225,10 +4228,8 @@ static int rdmsr_interception(struct vcpu_svm *svm)
} else {
trace_kvm_msr_read(ecx, msr_info.data);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
- msr_info.data & 0xffffffff);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
- msr_info.data >> 32);
+ kvm_rax_write(&svm->vcpu, msr_info.data & 0xffffffff);
+ kvm_rdx_write(&svm->vcpu, msr_info.data >> 32);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
return kvm_skip_emulated_instruction(&svm->vcpu);
}
@@ -4422,7 +4423,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
static int wrmsr_interception(struct vcpu_svm *svm)
{
struct msr_data msr;
- u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+ u32 ecx = kvm_rcx_read(&svm->vcpu);
u64 data = kvm_read_edx_eax(&svm->vcpu);
msr.data = data;
@@ -4830,6 +4831,11 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
pr_err("VMCB Control Area:\n");
pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
@@ -4988,7 +4994,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
= svm->vmcb->control.exit_code;
- pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
dump_vmcb(vcpu);
return 0;
}
@@ -6236,7 +6241,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *nested_vmcb;
- struct page *page;
+ struct kvm_host_map map;
u64 guest;
u64 vmcb;
@@ -6244,10 +6249,10 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
if (guest) {
- nested_vmcb = nested_svm_map(svm, vmcb, &page);
- if (!nested_vmcb)
+ if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
return 1;
- enter_svm_guest_mode(svm, vmcb, nested_vmcb, page);
+ nested_vmcb = map.hva;
+ enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map);
}
return 0;
}
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 854e144131c6..d6664ee3d127 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -2,6 +2,8 @@
#ifndef __KVM_X86_VMX_CAPS_H
#define __KVM_X86_VMX_CAPS_H
+#include <asm/vmx.h>
+
#include "lapic.h"
extern bool __read_mostly enable_vpid;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6401eb7ef19c..1032f068f0b9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -193,10 +193,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
if (!vmx->nested.hv_evmcs)
return;
- kunmap(vmx->nested.hv_evmcs_page);
- kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
vmx->nested.hv_evmcs_vmptr = -1ull;
- vmx->nested.hv_evmcs_page = NULL;
vmx->nested.hv_evmcs = NULL;
}
@@ -229,16 +227,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
- if (vmx->nested.virtual_apic_page) {
- kvm_release_page_dirty(vmx->nested.virtual_apic_page);
- vmx->nested.virtual_apic_page = NULL;
- }
- if (vmx->nested.pi_desc_page) {
- kunmap(vmx->nested.pi_desc_page);
- kvm_release_page_dirty(vmx->nested.pi_desc_page);
- vmx->nested.pi_desc_page = NULL;
- vmx->nested.pi_desc = NULL;
- }
+ kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+ vmx->nested.pi_desc = NULL;
kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
@@ -519,39 +510,19 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
int msr;
- struct page *page;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
- /*
- * pred_cmd & spec_ctrl are trying to verify two things:
- *
- * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
- * ensures that we do not accidentally generate an L02 MSR bitmap
- * from the L12 MSR bitmap that is too permissive.
- * 2. That L1 or L2s have actually used the MSR. This avoids
- * unnecessarily merging of the bitmap if the MSR is unused. This
- * works properly because we only update the L01 MSR bitmap lazily.
- * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
- * updated to reflect this when L1 (or its L2s) actually write to
- * the MSR.
- */
- bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
- bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
+ struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
/* Nothing to do if the MSR bitmap is not in use. */
if (!cpu_has_vmx_msr_bitmap() ||
!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return false;
- if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
- !pred_cmd && !spec_ctrl)
- return false;
-
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
- if (is_error_page(page))
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
- msr_bitmap_l1 = (unsigned long *)kmap(page);
+ msr_bitmap_l1 = (unsigned long *)map->hva;
/*
* To keep the control flow simple, pay eight 8-byte writes (sixteen
@@ -592,20 +563,42 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
}
}
- if (spec_ctrl)
+ /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
+ nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
+ MSR_FS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
+ MSR_GS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
+ MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+
+ /*
+ * Checking the L0->L1 bitmap is trying to verify two things:
+ *
+ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
+ * ensures that we do not accidentally generate an L02 MSR bitmap
+ * from the L12 MSR bitmap that is too permissive.
+ * 2. That L1 or L2s have actually used the MSR. This avoids
+ * unnecessarily merging of the bitmap if the MSR is unused. This
+ * works properly because we only update the L01 MSR bitmap lazily.
+ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
+ * updated to reflect this when L1 (or its L2s) actually write to
+ * the MSR.
+ */
+ if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
nested_vmx_disable_intercept_for_msr(
msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_SPEC_CTRL,
MSR_TYPE_R | MSR_TYPE_W);
- if (pred_cmd)
+ if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
nested_vmx_disable_intercept_for_msr(
msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_PRED_CMD,
MSR_TYPE_W);
- kunmap(page);
- kvm_release_page_clean(page);
+ kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
return true;
}
@@ -613,20 +606,20 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
+ struct kvm_host_map map;
struct vmcs12 *shadow;
- struct page *page;
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
vmcs12->vmcs_link_pointer == -1ull)
return;
shadow = get_shadow_vmcs12(vcpu);
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
- memcpy(shadow, kmap(page), VMCS12_SIZE);
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+ return;
- kunmap(page);
- kvm_release_page_clean(page);
+ memcpy(shadow, map.hva, VMCS12_SIZE);
+ kvm_vcpu_unmap(vcpu, &map, false);
}
static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
@@ -930,7 +923,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
if (!nested_cr3_valid(vcpu, cr3)) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
- return 1;
+ return -EINVAL;
}
/*
@@ -941,7 +934,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
!nested_ept) {
if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
*entry_failure_code = ENTRY_FAIL_PDPTE;
- return 1;
+ return -EINVAL;
}
}
}
@@ -1794,13 +1787,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
nested_release_evmcs(vcpu);
- vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
- vcpu, assist_page.current_nested_vmcs);
-
- if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs),
+ &vmx->nested.hv_evmcs_map))
return 0;
- vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
+ vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
/*
* Currently, KVM only supports eVMCS version 1
@@ -2373,19 +2364,19 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
*/
if (vmx->emulation_required) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
- return 1;
+ return -EINVAL;
}
/* Shadow page tables on either EPT or shadow page tables. */
if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
entry_failure_code))
- return 1;
+ return -EINVAL;
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
- kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
- kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+ kvm_rsp_write(vcpu, vmcs12->guest_rsp);
+ kvm_rip_write(vcpu, vmcs12->guest_rip);
return 0;
}
@@ -2589,11 +2580,19 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
return 0;
}
-/*
- * Checks related to Host Control Registers and MSRs
- */
-static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
+ nested_check_vm_exit_controls(vcpu, vmcs12) ||
+ nested_check_vm_entry_controls(vcpu, vmcs12))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
{
bool ia32e;
@@ -2606,6 +2605,10 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
return -EINVAL;
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
+ !kvm_pat_valid(vmcs12->host_ia32_pat))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
* IA32_EFER MSR must be 0 in the field for that register. In addition,
@@ -2624,41 +2627,12 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
return 0;
}
-/*
- * Checks related to Guest Non-register State
- */
-static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
-{
- if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
- vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
- return -EINVAL;
-
- return 0;
-}
-
-static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
-{
- if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
- nested_check_vm_exit_controls(vcpu, vmcs12) ||
- nested_check_vm_entry_controls(vcpu, vmcs12))
- return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
-
- if (nested_check_host_control_regs(vcpu, vmcs12))
- return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
-
- if (nested_check_guest_non_reg_state(vmcs12))
- return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
-
- return 0;
-}
-
static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- int r;
- struct page *page;
+ int r = 0;
struct vmcs12 *shadow;
+ struct kvm_host_map map;
if (vmcs12->vmcs_link_pointer == -1ull)
return 0;
@@ -2666,23 +2640,34 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
return -EINVAL;
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
- if (is_error_page(page))
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
return -EINVAL;
- r = 0;
- shadow = kmap(page);
+ shadow = map.hva;
+
if (shadow->hdr.revision_id != VMCS12_REVISION ||
shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
r = -EINVAL;
- kunmap(page);
- kvm_release_page_clean(page);
+
+ kvm_vcpu_unmap(vcpu, &map, false);
return r;
}
-static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12,
- u32 *exit_qual)
+/*
+ * Checks related to Guest Non-register State
+ */
+static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
+{
+ if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 *exit_qual)
{
bool ia32e;
@@ -2690,11 +2675,15 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu,
if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
- return 1;
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
+ !kvm_pat_valid(vmcs12->guest_ia32_pat))
+ return -EINVAL;
if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
- return 1;
+ return -EINVAL;
}
/*
@@ -2713,13 +2702,16 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu,
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
((vmcs12->guest_cr0 & X86_CR0_PG) &&
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
- return 1;
+ return -EINVAL;
}
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
- (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
- (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
- return 1;
+ (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
+ (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
+ return -EINVAL;
+
+ if (nested_check_guest_non_reg_state(vmcs12))
+ return -EINVAL;
return 0;
}
@@ -2792,14 +2784,13 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
: "cc", "memory"
);
- preempt_enable();
-
if (vmx->msr_autoload.host.nr)
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
if (vmx->msr_autoload.guest.nr)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
if (vm_fail) {
+ preempt_enable();
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -2811,6 +2802,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
local_irq_enable();
if (hw_breakpoint_active())
set_debugreg(__this_cpu_read(cpu_dr7), 7);
+ preempt_enable();
/*
* A non-failing VMEntry means we somehow entered guest mode with
@@ -2832,6 +2824,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_host_map *map;
struct page *page;
u64 hpa;
@@ -2864,20 +2857,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
- if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
- kvm_release_page_dirty(vmx->nested.virtual_apic_page);
- vmx->nested.virtual_apic_page = NULL;
- }
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
+ map = &vmx->nested.virtual_apic_map;
/*
* If translation failed, VM entry will fail because
* prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
*/
- if (!is_error_page(page)) {
- vmx->nested.virtual_apic_page = page;
- hpa = page_to_phys(vmx->nested.virtual_apic_page);
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -2898,26 +2885,15 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
}
if (nested_cpu_has_posted_intr(vmcs12)) {
- if (vmx->nested.pi_desc_page) { /* shouldn't happen */
- kunmap(vmx->nested.pi_desc_page);
- kvm_release_page_dirty(vmx->nested.pi_desc_page);
- vmx->nested.pi_desc_page = NULL;
- vmx->nested.pi_desc = NULL;
- vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
+ map = &vmx->nested.pi_desc_map;
+
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
+ vmx->nested.pi_desc =
+ (struct pi_desc *)(((void *)map->hva) +
+ offset_in_page(vmcs12->posted_intr_desc_addr));
+ vmcs_write64(POSTED_INTR_DESC_ADDR,
+ pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
}
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
- if (is_error_page(page))
- return;
- vmx->nested.pi_desc_page = page;
- vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
- vmx->nested.pi_desc =
- (struct pi_desc *)((void *)vmx->nested.pi_desc +
- (unsigned long)(vmcs12->posted_intr_desc_addr &
- (PAGE_SIZE - 1)));
- vmcs_write64(POSTED_INTR_DESC_ADDR,
- page_to_phys(vmx->nested.pi_desc_page) +
- (unsigned long)(vmcs12->posted_intr_desc_addr &
- (PAGE_SIZE - 1)));
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
@@ -3000,7 +2976,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
return -1;
}
- if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+ if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
goto vmentry_fail_vmexit;
}
@@ -3145,9 +3121,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
- ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12);
- if (ret)
- return nested_vmx_failValid(vcpu, ret);
+ if (nested_vmx_check_controls(vcpu, vmcs12))
+ return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ if (nested_vmx_check_host_state(vcpu, vmcs12))
+ return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
/*
* We're finally done with prerequisite checking, and can start with
@@ -3310,11 +3288,12 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
- vapic_page = kmap(vmx->nested.virtual_apic_page);
+ vapic_page = vmx->nested.virtual_apic_map.hva;
+ if (!vapic_page)
+ return;
+
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
- kunmap(vmx->nested.virtual_apic_page);
-
status = vmcs_read16(GUEST_INTR_STATUS);
if ((u8)max_irr > ((u8)status & 0xff)) {
status &= ~0xff;
@@ -3425,8 +3404,8 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
- vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
+ vmcs12->guest_rsp = kvm_rsp_read(vcpu);
+ vmcs12->guest_rip = kvm_rip_read(vcpu);
vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
@@ -3609,8 +3588,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
vmx_set_efer(vcpu, vcpu->arch.efer);
- kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
- kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+ kvm_rsp_write(vcpu, vmcs12->host_rsp);
+ kvm_rip_write(vcpu, vmcs12->host_rip);
vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
vmx_set_interrupt_shadow(vcpu, 0);
@@ -3955,16 +3934,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
- if (vmx->nested.virtual_apic_page) {
- kvm_release_page_dirty(vmx->nested.virtual_apic_page);
- vmx->nested.virtual_apic_page = NULL;
- }
- if (vmx->nested.pi_desc_page) {
- kunmap(vmx->nested.pi_desc_page);
- kvm_release_page_dirty(vmx->nested.pi_desc_page);
- vmx->nested.pi_desc_page = NULL;
- vmx->nested.pi_desc = NULL;
- }
+ kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+ vmx->nested.pi_desc = NULL;
/*
* We are now running in L2, mmu_notifier will force to reload the
@@ -4260,7 +4232,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
{
int ret;
gpa_t vmptr;
- struct page *page;
+ uint32_t revision;
struct vcpu_vmx *vmx = to_vmx(vcpu);
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -4306,21 +4278,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
* Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
* which replaces physical address width with 32
*/
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+ if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failInvalid(vcpu);
- page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
- if (is_error_page(page))
+ if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
+ revision != VMCS12_REVISION)
return nested_vmx_failInvalid(vcpu);
- if (*(u32 *)kmap(page) != VMCS12_REVISION) {
- kunmap(page);
- kvm_release_page_clean(page);
- return nested_vmx_failInvalid(vcpu);
- }
- kunmap(page);
- kvm_release_page_clean(page);
-
vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
if (ret)
@@ -4377,7 +4341,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+ if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_INVALID_ADDRESS);
@@ -4385,7 +4349,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER);
- if (vmx->nested.hv_evmcs_page) {
+ if (vmx->nested.hv_evmcs_map.hva) {
if (vmptr == vmx->nested.hv_evmcs_vmptr)
nested_release_evmcs(vcpu);
} else {
@@ -4584,7 +4548,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
- if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+ if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INVALID_ADDRESS);
@@ -4597,11 +4561,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return 1;
if (vmx->nested.current_vmptr != vmptr) {
+ struct kvm_host_map map;
struct vmcs12 *new_vmcs12;
- struct page *page;
- page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
- if (is_error_page(page)) {
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
/*
* Reads from an unbacked page return all 1s,
* which means that the 32 bits located at the
@@ -4611,12 +4574,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
- new_vmcs12 = kmap(page);
+
+ new_vmcs12 = map.hva;
+
if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
(new_vmcs12->hdr.shadow_vmcs &&
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
- kunmap(page);
- kvm_release_page_clean(page);
+ kvm_vcpu_unmap(vcpu, &map, false);
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
@@ -4628,8 +4592,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
* cached.
*/
memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
- kunmap(page);
- kvm_release_page_clean(page);
+ kvm_vcpu_unmap(vcpu, &map, false);
set_current_vmptr(vmx, vmptr);
}
@@ -4804,7 +4767,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+ u32 index = kvm_rcx_read(vcpu);
u64 address;
bool accessed_dirty;
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
@@ -4850,7 +4813,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12;
- u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+ u32 function = kvm_rax_read(vcpu);
/*
* VMFUNC is only supported for nested guests, but we always enable the
@@ -4936,7 +4899,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12, u32 exit_reason)
{
- u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+ u32 msr_index = kvm_rcx_read(vcpu);
gpa_t bitmap;
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
@@ -5373,9 +5336,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->format != 0)
return -EINVAL;
- if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
- nested_enable_evmcs(vcpu, NULL);
-
if (!nested_vmx_allowed(vcpu))
return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
@@ -5417,13 +5377,16 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->vmx.vmxon_pa == -1ull)
return 0;
+ if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
+ nested_enable_evmcs(vcpu, NULL);
+
vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
ret = enter_vmx_operation(vcpu);
if (ret)
return ret;
/* Empty 'VMXON' state is permitted */
- if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
return 0;
if (kvm_state->vmx.vmcs_pa != -1ull) {
@@ -5463,33 +5426,41 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
vmx->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+ ret = -EINVAL;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
vmcs12->vmcs_link_pointer != -1ull) {
struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
- if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
- return -EINVAL;
+ if (kvm_state->size < sizeof(*kvm_state) + VMCS12_SIZE + sizeof(*vmcs12))
+ goto error_guest_mode;
if (copy_from_user(shadow_vmcs12,
user_kvm_nested_state->data + VMCS12_SIZE,
- sizeof(*vmcs12)))
- return -EFAULT;
+ sizeof(*vmcs12))) {
+ ret = -EFAULT;
+ goto error_guest_mode;
+ }
if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
!shadow_vmcs12->hdr.shadow_vmcs)
- return -EINVAL;
+ goto error_guest_mode;
}
- if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) ||
- nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
- return -EINVAL;
+ if (nested_vmx_check_controls(vcpu, vmcs12) ||
+ nested_vmx_check_host_state(vcpu, vmcs12) ||
+ nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
+ goto error_guest_mode;
vmx->nested.dirty_vmcs12 = true;
ret = nested_vmx_enter_non_root_mode(vcpu, false);
if (ret)
- return -EINVAL;
+ goto error_guest_mode;
return 0;
+
+error_guest_mode:
+ vmx->nested.nested_run_pending = 0;
+ return ret;
}
void nested_vmx_vcpu_setup(void)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 5ab4a364348e..a99613a060dd 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -126,7 +126,7 @@ static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
}
static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
- unsigned idx)
+ unsigned idx, u64 *mask)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
@@ -138,6 +138,7 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
if (fixed && idx >= pmu->nr_arch_fixed_counters)
return NULL;
counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
+ *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
return &counters[idx];
}
@@ -183,9 +184,13 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
*data = pmu->global_ovf_ctrl;
return 0;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
- (pmc = get_fixed_pmc(pmu, msr))) {
- *data = pmc_read_counter(pmc);
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ u64 val = pmc_read_counter(pmc);
+ *data = val & pmu->counter_bitmask[KVM_PMC_GP];
+ return 0;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ u64 val = pmc_read_counter(pmc);
+ *data = val & pmu->counter_bitmask[KVM_PMC_FIXED];
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
*data = pmc->eventsel;
@@ -227,7 +232,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+ if (!(data & pmu->global_ovf_ctrl_mask)) {
if (!msr_info->host_initiated)
pmu->global_status &= ~data;
pmu->global_ovf_ctrl = data;
@@ -235,11 +240,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
- (pmc = get_fixed_pmc(pmu, msr))) {
- if (!msr_info->host_initiated)
- data = (s64)(s32)data;
- pmc->counter += data - pmc_read_counter(pmc);
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ if (msr_info->host_initiated)
+ pmc->counter = data;
+ else
+ pmc->counter = (s32)data;
+ return 0;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ pmc->counter = data;
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
@@ -297,6 +305,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
pmu->global_ctrl_mask = ~pmu->global_ctrl;
+ pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
+ & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
+ MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
+ if (kvm_x86_ops->pt_supported())
+ pmu->global_ovf_ctrl_mask &=
+ ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
entry = kvm_find_cpuid_entry(vcpu, 7, 0);
if (entry &&
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 7b272738c576..d4cb1945b2e3 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -3,6 +3,7 @@
#include <asm/asm.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
#define WORD_SIZE (BITS_PER_LONG / 8)
@@ -77,6 +78,17 @@ ENDPROC(vmx_vmenter)
* referred to by VMCS.HOST_RIP.
*/
ENTRY(vmx_vmexit)
+#ifdef CONFIG_RETPOLINE
+ ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
+ /* Preserve guest's RAX, it's used to stuff the RSB. */
+ push %_ASM_AX
+
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+
+ pop %_ASM_AX
+.Lvmexit_skip_rsb:
+#endif
ret
ENDPROC(vmx_vmexit)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b4e7d645275a..b93e36ddee5e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -114,6 +114,9 @@ static u64 __read_mostly host_xss;
bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
+static bool __read_mostly dump_invalid_vmcs = 0;
+module_param(dump_invalid_vmcs, bool, 0644);
+
#define MSR_BITMAP_MODE_X2APIC 1
#define MSR_BITMAP_MODE_X2APIC_APICV 2
@@ -1692,6 +1695,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_SYSENTER_ESP:
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
+ case MSR_IA32_POWER_CTL:
+ msr_info->data = vmx->msr_ia32_power_ctl;
+ break;
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
@@ -1822,6 +1828,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_SYSENTER_ESP:
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
+ case MSR_IA32_POWER_CTL:
+ vmx->msr_ia32_power_ctl = data;
+ break;
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
@@ -1891,7 +1900,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ if (!kvm_pat_valid(data))
return 1;
vmcs_write64(GUEST_IA32_PAT, data);
vcpu->arch.pat = data;
@@ -2288,7 +2297,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
- VM_EXIT_SAVE_IA32_PAT |
VM_EXIT_LOAD_IA32_PAT |
VM_EXIT_LOAD_IA32_EFER |
VM_EXIT_CLEAR_BNDCFGS |
@@ -3619,14 +3627,13 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
- WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
+ WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
return false;
rvi = vmx_get_rvi();
- vapic_page = kmap(vmx->nested.virtual_apic_page);
+ vapic_page = vmx->nested.virtual_apic_map.hva;
vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
- kunmap(vmx->nested.virtual_apic_page);
return ((rvi & 0xf0) > (vppr & 0xf0));
}
@@ -4827,7 +4834,7 @@ static int handle_cpuid(struct kvm_vcpu *vcpu)
static int handle_rdmsr(struct kvm_vcpu *vcpu)
{
- u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ u32 ecx = kvm_rcx_read(vcpu);
struct msr_data msr_info;
msr_info.index = ecx;
@@ -4840,18 +4847,16 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
trace_kvm_msr_read(ecx, msr_info.data);
- /* FIXME: handling of bits 32:63 of rax, rdx */
- vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
- vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
+ kvm_rax_write(vcpu, msr_info.data & -1u);
+ kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u);
return kvm_skip_emulated_instruction(vcpu);
}
static int handle_wrmsr(struct kvm_vcpu *vcpu)
{
struct msr_data msr;
- u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+ u32 ecx = kvm_rcx_read(vcpu);
+ u64 data = kvm_read_edx_eax(vcpu);
msr.data = data;
msr.index = ecx;
@@ -4922,7 +4927,7 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu)
static int handle_xsetbv(struct kvm_vcpu *vcpu)
{
u64 new_bv = kvm_read_edx_eax(vcpu);
- u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ u32 index = kvm_rcx_read(vcpu);
if (kvm_set_xcr(vcpu, index, new_bv) == 0)
return kvm_skip_emulated_instruction(vcpu);
@@ -5605,15 +5610,24 @@ static void vmx_dump_dtsel(char *name, uint32_t limit)
void dump_vmcs(void)
{
- u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
- u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
- u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
- u32 secondary_exec_control = 0;
- unsigned long cr4 = vmcs_readl(GUEST_CR4);
- u64 efer = vmcs_read64(GUEST_IA32_EFER);
+ u32 vmentry_ctl, vmexit_ctl;
+ u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
+ unsigned long cr4;
+ u64 efer;
int i, n;
+ if (!dump_invalid_vmcs) {
+ pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+ vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+ cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+ cr4 = vmcs_readl(GUEST_CR4);
+ efer = vmcs_read64(GUEST_IA32_EFER);
+ secondary_exec_control = 0;
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -5723,8 +5737,16 @@ void dump_vmcs(void)
if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
pr_err("TSC Multiplier = 0x%016llx\n",
vmcs_read64(TSC_MULTIPLIER));
- if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
- pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
+ u16 status = vmcs_read16(GUEST_INTR_STATUS);
+ pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
+ }
+ pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+ pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
+ pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
+ }
if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
@@ -6431,8 +6453,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
*/
x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
+ else if (static_branch_unlikely(&mds_user_clear))
+ mds_clear_cpu_buffers();
if (vcpu->arch.cr2 != read_cr2())
write_cr2(vcpu->arch.cr2);
@@ -6462,9 +6487,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
- /* Eliminate branch target predictions from guest mode */
- vmexit_fill_RSB();
-
/* All fields are clean at this point */
if (static_branch_unlikely(&enable_evmcs))
current_evmcs->hv_clean_fields |=
@@ -6503,7 +6525,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
*/
if (static_cpu_has(X86_FEATURE_PKU) &&
kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
- vcpu->arch.pkru = __read_pkru();
+ vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vmx->host_pkru)
__write_pkru(vmx->host_pkru);
}
@@ -6671,8 +6693,8 @@ free_partial_vcpu:
return ERR_PTR(err);
}
-#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
-#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
static int vmx_vm_init(struct kvm *kvm)
{
@@ -6856,30 +6878,6 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
}
}
-static bool guest_cpuid_has_pmu(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *entry;
- union cpuid10_eax eax;
-
- entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
- if (!entry)
- return false;
-
- eax.full = entry->eax;
- return (eax.split.version_id > 0);
-}
-
-static void nested_vmx_procbased_ctls_update(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- bool pmu_enabled = guest_cpuid_has_pmu(vcpu);
-
- if (pmu_enabled)
- vmx->nested.msrs.procbased_ctls_high |= CPU_BASED_RDPMC_EXITING;
- else
- vmx->nested.msrs.procbased_ctls_high &= ~CPU_BASED_RDPMC_EXITING;
-}
-
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6968,7 +6966,6 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (nested_vmx_allowed(vcpu)) {
nested_vmx_cr_fixed1_bits_update(vcpu);
nested_vmx_entry_exit_ctls_update(vcpu);
- nested_vmx_procbased_ctls_update(vcpu);
}
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
@@ -7028,10 +7025,12 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
return 0;
}
-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
+ struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
if (kvm_mwait_in_guest(vcpu->kvm))
return -EOPNOTSUPP;
@@ -7040,7 +7039,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
tscl = rdtsc();
guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
- lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+ lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
+ ktimer->timer_advance_ns);
if (delta_tsc > lapic_timer_advance_cycles)
delta_tsc -= lapic_timer_advance_cycles;
@@ -7049,10 +7049,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
/* Convert to host delta tsc if tsc scaling is enabled */
if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
- u64_shl_div_u64(delta_tsc,
+ delta_tsc && u64_shl_div_u64(delta_tsc,
kvm_tsc_scaling_ratio_frac_bits,
- vcpu->arch.tsc_scaling_ratio,
- &delta_tsc))
+ vcpu->arch.tsc_scaling_ratio, &delta_tsc))
return -ERANGE;
/*
@@ -7065,7 +7064,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
return -ERANGE;
vmx->hv_deadline_tsc = tscl + delta_tsc;
- return delta_tsc == 0;
+ *expired = !delta_tsc;
+ return 0;
}
static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
@@ -7102,9 +7102,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- gpa_t gpa;
- struct page *page = NULL;
- u64 *pml_address;
+ gpa_t gpa, dst;
if (is_guest_mode(vcpu)) {
WARN_ON_ONCE(vmx->nested.pml_full);
@@ -7124,15 +7122,13 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
}
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+ dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
- if (is_error_page(page))
+ if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+ offset_in_page(dst), sizeof(gpa)))
return 0;
- pml_address = kmap(page);
- pml_address[vmcs12->guest_pml_index--] = gpa;
- kunmap(page);
- kvm_release_page_clean(page);
+ vmcs12->guest_pml_index--;
}
return 0;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index f879529906b4..61128b48c503 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -142,8 +142,11 @@ struct nested_vmx {
* pointers, so we must keep them pinned while L2 runs.
*/
struct page *apic_access_page;
- struct page *virtual_apic_page;
- struct page *pi_desc_page;
+ struct kvm_host_map virtual_apic_map;
+ struct kvm_host_map pi_desc_map;
+
+ struct kvm_host_map msr_bitmap_map;
+
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
@@ -169,7 +172,7 @@ struct nested_vmx {
} smm;
gpa_t hv_evmcs_vmptr;
- struct page *hv_evmcs_page;
+ struct kvm_host_map hv_evmcs_map;
struct hv_enlightened_vmcs *hv_evmcs;
};
@@ -257,6 +260,8 @@ struct vcpu_vmx {
unsigned long host_debugctlmsr;
+ u64 msr_ia32_power_ctl;
+
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
* msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
@@ -314,6 +319,7 @@ void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
+void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a0d1fc80ac5a..acb179f78fdc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -136,10 +136,14 @@ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
-/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 1000;
-module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
+/*
+ * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
+ * adaptive tuning starting from default advancment of 1000ns. '0' disables
+ * advancement entirely. Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows priveleged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
+module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
@@ -1096,15 +1100,15 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
bool kvm_rdpmc(struct kvm_vcpu *vcpu)
{
- u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ u32 ecx = kvm_rcx_read(vcpu);
u64 data;
int err;
err = kvm_pmu_rdpmc(vcpu, ecx, &data);
if (err)
return err;
- kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
- kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
+ kvm_rax_write(vcpu, (u32)data);
+ kvm_rdx_write(vcpu, data >> 32);
return err;
}
EXPORT_SYMBOL_GPL(kvm_rdpmc);
@@ -1170,6 +1174,9 @@ static u32 emulated_msrs[] = {
MSR_PLATFORM_INFO,
MSR_MISC_FEATURES_ENABLES,
MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_IA32_POWER_CTL,
+
+ MSR_K7_HWCR,
};
static unsigned num_emulated_msrs;
@@ -1258,31 +1265,49 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
return 0;
}
-bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (efer & efer_reserved_bits)
- return false;
-
if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
- return false;
+ return false;
if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
- return false;
+ return false;
+
+ if (efer & (EFER_LME | EFER_LMA) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return false;
+
+ if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
+ return false;
return true;
+
+}
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & efer_reserved_bits)
+ return false;
+
+ return __kvm_valid_efer(vcpu, efer);
}
EXPORT_SYMBOL_GPL(kvm_valid_efer);
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 old_efer = vcpu->arch.efer;
+ u64 efer = msr_info->data;
- if (!kvm_valid_efer(vcpu, efer))
+ if (efer & efer_reserved_bits)
return 1;
- if (is_paging(vcpu)
- && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
- return 1;
+ if (!msr_info->host_initiated) {
+ if (!__kvm_valid_efer(vcpu, efer))
+ return 1;
+
+ if (is_paging(vcpu) &&
+ (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
+ }
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
@@ -2275,6 +2300,18 @@ static void kvmclock_sync_fn(struct work_struct *work)
KVMCLOCK_SYNC_PERIOD);
}
+/*
+ * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
+ */
+static bool can_set_mci_status(struct kvm_vcpu *vcpu)
+{
+ /* McStatusWrEn enabled? */
+ if (guest_cpuid_is_amd(vcpu))
+ return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
+
+ return false;
+}
+
static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 mcg_cap = vcpu->arch.mcg_cap;
@@ -2306,9 +2343,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((offset & 0x3) == 0 &&
data != 0 && (data | (1 << 10)) != ~(u64)0)
return -1;
+
+ /* MCi_STATUS */
if (!msr_info->host_initiated &&
- (offset & 0x3) == 1 && data != 0)
- return -1;
+ (offset & 0x3) == 1 && data != 0) {
+ if (!can_set_mci_status(vcpu))
+ return -1;
+ }
+
vcpu->arch.mce_banks[offset] = data;
break;
}
@@ -2452,13 +2494,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.arch_capabilities = data;
break;
case MSR_EFER:
- return set_efer(vcpu, data);
+ return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
data &= ~(u64)0x100; /* ignore ignne emulation enable */
data &= ~(u64)0x8; /* ignore TLB cache disable */
- data &= ~(u64)0x40000; /* ignore Mc status write enable */
- if (data != 0) {
+
+ /* Handle McStatusWrEn */
+ if (data == BIT_ULL(18)) {
+ vcpu->arch.msr_hwcr = data;
+ } else if (data != 0) {
vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
return 1;
@@ -2732,7 +2777,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_K8_SYSCFG:
case MSR_K8_TSEG_ADDR:
case MSR_K8_TSEG_MASK:
- case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
@@ -2896,6 +2940,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_MISC_FEATURES_ENABLES:
msr_info->data = vcpu->arch.msr_misc_features_enables;
break;
+ case MSR_K7_HWCR:
+ msr_info->data = vcpu->arch.msr_hwcr;
+ break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -3075,9 +3122,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
- case KVM_CAP_NR_MEMSLOTS:
- r = KVM_USER_MEM_SLOTS;
- break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
break;
@@ -3677,15 +3721,15 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
*/
valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
- u64 feature = valid & -valid;
- int index = fls64(feature) - 1;
- void *src = get_xsave_addr(xsave, feature);
+ u64 xfeature_mask = valid & -valid;
+ int xfeature_nr = fls64(xfeature_mask) - 1;
+ void *src = get_xsave_addr(xsave, xfeature_nr);
if (src) {
u32 size, offset, ecx, edx;
- cpuid_count(XSTATE_CPUID, index,
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
&size, &offset, &ecx, &edx);
- if (feature == XFEATURE_MASK_PKRU)
+ if (xfeature_nr == XFEATURE_PKRU)
memcpy(dest + offset, &vcpu->arch.pkru,
sizeof(vcpu->arch.pkru));
else
@@ -3693,7 +3737,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
}
- valid -= feature;
+ valid -= xfeature_mask;
}
}
@@ -3720,22 +3764,22 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
*/
valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
- u64 feature = valid & -valid;
- int index = fls64(feature) - 1;
- void *dest = get_xsave_addr(xsave, feature);
+ u64 xfeature_mask = valid & -valid;
+ int xfeature_nr = fls64(xfeature_mask) - 1;
+ void *dest = get_xsave_addr(xsave, xfeature_nr);
if (dest) {
u32 size, offset, ecx, edx;
- cpuid_count(XSTATE_CPUID, index,
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
&size, &offset, &ecx, &edx);
- if (feature == XFEATURE_MASK_PKRU)
+ if (xfeature_nr == XFEATURE_PKRU)
memcpy(&vcpu->arch.pkru, src + offset,
sizeof(vcpu->arch.pkru));
else
memcpy(dest, src + offset, size);
}
- valid -= feature;
+ valid -= xfeature_mask;
}
}
@@ -5517,9 +5561,9 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
unsigned int bytes,
struct x86_exception *exception)
{
+ struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
- struct page *page;
char *kaddr;
bool exchanged;
@@ -5536,12 +5580,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
goto emul_write;
- page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
- if (is_error_page(page))
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
goto emul_write;
- kaddr = kmap_atomic(page);
- kaddr += offset_in_page(gpa);
+ kaddr = map.hva + offset_in_page(gpa);
+
switch (bytes) {
case 1:
exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
@@ -5558,13 +5601,12 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
default:
BUG();
}
- kunmap_atomic(kaddr);
- kvm_release_page_dirty(page);
+
+ kvm_vcpu_unmap(vcpu, &map, true);
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
- kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
kvm_page_track_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -6535,6 +6577,12 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
}
EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
{
vcpu->arch.pio.count = 0;
@@ -6548,15 +6596,26 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port)
{
- unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ unsigned long val = kvm_rax_read(vcpu);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
size, port, &val, 1);
+ if (ret)
+ return ret;
- if (!ret) {
+ /*
+ * Workaround userspace that relies on old KVM behavior of %rip being
+ * incremented prior to exiting to userspace to handle "OUT 0x7e".
+ */
+ if (port == 0x7e &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
+ vcpu->arch.complete_userspace_io =
+ complete_fast_pio_out_port_0x7e;
+ kvm_skip_emulated_instruction(vcpu);
+ } else {
vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_out;
}
- return ret;
+ return 0;
}
static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
@@ -6572,8 +6631,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
}
/* For size less than 4 we merge, else we zero extend */
- val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
- : 0;
+ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
/*
* Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
@@ -6581,7 +6639,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
*/
emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
vcpu->arch.pio.port, &val, 1);
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ kvm_rax_write(vcpu, val);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -6593,12 +6651,12 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
int ret;
/* For size less than 4 we merge, else we zero extend */
- val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
+ val = (size < 4) ? kvm_rax_read(vcpu) : 0;
ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
&val, 1);
if (ret) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ kvm_rax_write(vcpu, val);
return ret;
}
@@ -6677,10 +6735,8 @@ static void kvm_hyperv_tsc_notifier(void)
}
#endif
-static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
+static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
{
- struct cpufreq_freqs *freq = data;
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
@@ -6724,17 +6780,12 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
*
*/
- if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
- return 0;
- if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
- return 0;
-
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu->cpu != freq->cpu)
+ if (vcpu->cpu != cpu)
continue;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != smp_processor_id())
@@ -6756,8 +6807,24 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
* guest context is entered kvmclock will be updated,
* so the guest will not see stale values.
*/
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
}
+}
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ int cpu;
+
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+ return 0;
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+ return 0;
+
+ for_each_cpu(cpu, freq->policy->cpus)
+ __kvmclock_cpufreq_notifier(freq, cpu);
+
return 0;
}
@@ -6824,10 +6891,20 @@ static unsigned long kvm_get_guest_ip(void)
return ip;
}
+static void kvm_handle_intel_pt_intr(void)
+{
+ struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
+
+ kvm_make_request(KVM_REQ_PMI, vcpu);
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ (unsigned long *)&vcpu->arch.pmu.global_status);
+}
+
static struct perf_guest_info_callbacks kvm_guest_cbs = {
.is_in_guest = kvm_is_in_guest,
.is_user_mode = kvm_is_user_mode,
.get_guest_ip = kvm_get_guest_ip,
+ .handle_intel_pt_intr = kvm_handle_intel_pt_intr,
};
static void kvm_set_mmio_spte_mask(void)
@@ -7103,11 +7180,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
- nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
- a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
- a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
- a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
- a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ nr = kvm_rax_read(vcpu);
+ a0 = kvm_rbx_read(vcpu);
+ a1 = kvm_rcx_read(vcpu);
+ a2 = kvm_rdx_read(vcpu);
+ a3 = kvm_rsi_read(vcpu);
trace_kvm_hypercall(nr, a0, a1, a2, a3);
@@ -7148,7 +7225,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
out:
if (!op_64_bit)
ret = (u32)ret;
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+ kvm_rax_write(vcpu, ret);
++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
@@ -7873,10 +7950,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
trace_kvm_entry(vcpu->vcpu_id);
- if (lapic_timer_advance_ns)
+ if (lapic_in_kernel(vcpu) &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
wait_lapic_expire(vcpu);
guest_enter_irqoff();
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
@@ -8135,22 +8217,30 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- preempt_disable();
+ fpregs_lock();
+
copy_fpregs_to_fpstate(&current->thread.fpu);
/* PKRU is separately restored in kvm_x86_ops->run. */
__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
~XFEATURE_MASK_PKRU);
- preempt_enable();
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+
trace_kvm_fpu(1);
}
/* When vcpu_run ends, restore user space FPU context. */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- preempt_disable();
+ fpregs_lock();
+
copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
copy_kernel_to_fpregs(&current->thread.fpu.state);
- preempt_enable();
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+
++vcpu->stat.fpu_reload;
trace_kvm_fpu(0);
}
@@ -8237,23 +8327,23 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
}
- regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
- regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
- regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
- regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
- regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ regs->rax = kvm_rax_read(vcpu);
+ regs->rbx = kvm_rbx_read(vcpu);
+ regs->rcx = kvm_rcx_read(vcpu);
+ regs->rdx = kvm_rdx_read(vcpu);
+ regs->rsi = kvm_rsi_read(vcpu);
+ regs->rdi = kvm_rdi_read(vcpu);
+ regs->rsp = kvm_rsp_read(vcpu);
+ regs->rbp = kvm_rbp_read(vcpu);
#ifdef CONFIG_X86_64
- regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
- regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
- regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
- regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
- regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
- regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
- regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
- regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
+ regs->r8 = kvm_r8_read(vcpu);
+ regs->r9 = kvm_r9_read(vcpu);
+ regs->r10 = kvm_r10_read(vcpu);
+ regs->r11 = kvm_r11_read(vcpu);
+ regs->r12 = kvm_r12_read(vcpu);
+ regs->r13 = kvm_r13_read(vcpu);
+ regs->r14 = kvm_r14_read(vcpu);
+ regs->r15 = kvm_r15_read(vcpu);
#endif
regs->rip = kvm_rip_read(vcpu);
@@ -8273,23 +8363,23 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
- kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
- kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
- kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
- kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
- kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
- kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
+ kvm_rax_write(vcpu, regs->rax);
+ kvm_rbx_write(vcpu, regs->rbx);
+ kvm_rcx_write(vcpu, regs->rcx);
+ kvm_rdx_write(vcpu, regs->rdx);
+ kvm_rsi_write(vcpu, regs->rsi);
+ kvm_rdi_write(vcpu, regs->rdi);
+ kvm_rsp_write(vcpu, regs->rsp);
+ kvm_rbp_write(vcpu, regs->rbp);
#ifdef CONFIG_X86_64
- kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
- kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
- kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
- kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
- kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
- kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
- kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
- kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+ kvm_r8_write(vcpu, regs->r8);
+ kvm_r9_write(vcpu, regs->r9);
+ kvm_r10_write(vcpu, regs->r10);
+ kvm_r11_write(vcpu, regs->r11);
+ kvm_r12_write(vcpu, regs->r12);
+ kvm_r13_write(vcpu, regs->r13);
+ kvm_r14_write(vcpu, regs->r14);
+ kvm_r15_write(vcpu, regs->r15);
#endif
kvm_rip_write(vcpu, regs->rip);
@@ -8848,11 +8938,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (init_event)
kvm_put_guest_fpu(vcpu);
mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
- XFEATURE_MASK_BNDREGS);
+ XFEATURE_BNDREGS);
if (mpx_state_buffer)
memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
- XFEATURE_MASK_BNDCSR);
+ XFEATURE_BNDCSR);
if (mpx_state_buffer)
memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
if (init_event)
@@ -9061,7 +9151,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- r = kvm_create_lapic(vcpu);
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
if (r < 0)
goto fail_mmu_destroy;
} else
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index aedc5d0d4989..a470ff0868c5 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,8 +294,6 @@ extern u64 kvm_supported_xcr0(void);
extern unsigned int min_timer_period_us;
-extern unsigned int lapic_timer_advance_ns;
-
extern bool enable_vmware_backdoor;
extern struct static_key kvm_no_apic_vcpu;
@@ -347,6 +345,16 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
__this_cpu_write(current_vcpu, NULL);
}
+
+static inline bool kvm_pat_valid(u64 data)
+{
+ if (data & 0xF8F8F8F8F8F8F8F8ull)
+ return false;
+ /* 0, 1, 4, 5, 6, 7 are valid values. */
+ return (data | ((data & 0x0202020202020202ull) << 1)) == data;
+}
+
void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu);
void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu);
+
#endif