summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virt/kvm/api.rst5
-rw-r--r--arch/mips/kvm/emulate.c4
-rw-r--r--arch/mips/kvm/mips.c2
-rw-r--r--arch/s390/include/asm/kvm_host.h8
-rw-r--r--arch/x86/include/asm/kvm_host.h4
-rw-r--r--arch/x86/include/asm/mwait.h2
-rw-r--r--arch/x86/include/uapi/asm/kvm.h5
-rw-r--r--arch/x86/kernel/cpu/umwait.c6
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h2
-rw-r--r--arch/x86/kvm/lapic.c50
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmu/mmu.c6
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h16
-rw-r--r--arch/x86/kvm/svm/svm.c2
-rw-r--r--arch/x86/kvm/vmx/nested.c21
-rw-r--r--arch/x86/kvm/vmx/nested.h5
-rw-r--r--arch/x86/kvm/vmx/vmcs.h32
-rw-r--r--arch/x86/kvm/vmx/vmx.c40
-rw-r--r--arch/x86/kvm/vmx/vmx.h2
-rw-r--r--arch/x86/kvm/x86.c12
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c42
-rw-r--r--virt/kvm/kvm_main.c3
22 files changed, 156 insertions, 115 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 426f94582b7a..320788f81a05 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4339,14 +4339,15 @@ Errors:
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
struct kvm_vmx_nested_state_hdr {
- __u32 flags;
__u64 vmxon_pa;
__u64 vmcs12_pa;
- __u64 preemption_timer_deadline;
struct {
__u16 flags;
} smm;
+
+ __u32 flags;
+ __u64 preemption_timer_deadline;
};
struct kvm_vmx_nested_state_data {
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 5ae82d925197..d242300cacc0 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1722,6 +1722,7 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
vcpu->arch.gprs[rt], *(u32 *)data);
break;
+#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ)
case sdl_op:
run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa(
vcpu->arch.host_cp0_badvaddr) & (~0x7);
@@ -1815,6 +1816,7 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
vcpu->arch.gprs[rt], *(u64 *)data);
break;
+#endif
#ifdef CONFIG_CPU_LOONGSON64
case sdc2_op:
@@ -2002,6 +2004,7 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
}
break;
+#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ)
case ldl_op:
run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa(
vcpu->arch.host_cp0_badvaddr) & (~0x7);
@@ -2073,6 +2076,7 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
break;
}
break;
+#endif
#ifdef CONFIG_CPU_LOONGSON64
case ldc2_op:
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 521bd5891e84..666d3350b4ac 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -67,8 +67,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("vz_ghfc", vz_ghfc_exits),
VCPU_STAT("vz_gpa", vz_gpa_exits),
VCPU_STAT("vz_resvd", vz_resvd_exits),
+#ifdef CONFIG_CPU_LOONGSON64
VCPU_STAT("vz_cpucfg", vz_cpucfg_exits),
#endif
+#endif
VCPU_STAT("halt_successful_poll", halt_successful_poll),
VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index cee3cb6455a2..6ea0820e7c7f 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -31,12 +31,12 @@
#define KVM_USER_MEM_SLOTS 32
/*
- * These seem to be used for allocating ->chip in the routing table,
- * which we don't use. 4096 is an out-of-thin-air value. If we need
- * to look at ->chip later on, we'll need to revisit this.
+ * These seem to be used for allocating ->chip in the routing table, which we
+ * don't use. 1 is as small as we can get to reduce the needed memory. If we
+ * need to look at ->chip later on, we'll need to revisit this.
*/
#define KVM_NR_IRQCHIPS 1
-#define KVM_IRQCHIP_NUM_PINS 4096
+#define KVM_IRQCHIP_NUM_PINS 1
#define KVM_HALT_POLL_NS_DEFAULT 50000
/* s390-specific vcpu->requests bit members */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8998e97457f..be5363b21540 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -943,7 +943,7 @@ struct kvm_arch {
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map *apic_map;
- bool apic_map_dirty;
+ atomic_t apic_map_dirty;
bool apic_access_page_done;
unsigned long apicv_inhibit_reasons;
@@ -1220,7 +1220,7 @@ struct kvm_x86_ops {
void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t offset, unsigned long mask);
- int (*write_log_dirty)(struct kvm_vcpu *vcpu);
+ int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 73d997aa2966..e039a933aca3 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -25,8 +25,6 @@
#define TPAUSE_C01_STATE 1
#define TPAUSE_C02_STATE 0
-u32 get_umwait_control_msr(void);
-
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 17c5a038f42d..0780f97c1850 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -408,14 +408,15 @@ struct kvm_vmx_nested_state_data {
};
struct kvm_vmx_nested_state_hdr {
- __u32 flags;
__u64 vmxon_pa;
__u64 vmcs12_pa;
- __u64 preemption_timer_deadline;
struct {
__u16 flags;
} smm;
+
+ __u32 flags;
+ __u64 preemption_timer_deadline;
};
struct kvm_svm_nested_state_data {
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index 300e3fd5ade3..ec8064c0ae03 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -18,12 +18,6 @@
*/
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
-u32 get_umwait_control_msr(void)
-{
- return umwait_control_cached;
-}
-EXPORT_SYMBOL_GPL(get_umwait_control_msr);
-
/*
* Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
* hardware or BIOS before kernel boot.
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index ff2d0e9ca3bc..cfe83d4ae625 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -7,7 +7,7 @@
#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD)
#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 34a7e0533dad..5bf72fc86a8e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -169,6 +169,18 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
kvfree(map);
}
+/*
+ * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock.
+ *
+ * DIRTY -> UPDATE_IN_PROGRESS and UPDATE_IN_PROGRESS -> CLEAN happen with
+ * apic_map_lock_held.
+ */
+enum {
+ CLEAN,
+ UPDATE_IN_PROGRESS,
+ DIRTY
+};
+
void kvm_recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
@@ -176,17 +188,17 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
int i;
u32 max_id = 255; /* enough space for any xAPIC ID */
- if (!kvm->arch.apic_map_dirty) {
- /*
- * Read kvm->arch.apic_map_dirty before
- * kvm->arch.apic_map
- */
- smp_rmb();
+ /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
+ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
return;
- }
mutex_lock(&kvm->arch.apic_map_lock);
- if (!kvm->arch.apic_map_dirty) {
+ /*
+ * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map
+ * (if clean) or the APIC registers (if dirty).
+ */
+ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
+ DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
/* Someone else has updated the map. */
mutex_unlock(&kvm->arch.apic_map_lock);
return;
@@ -256,11 +268,11 @@ out:
lockdep_is_held(&kvm->arch.apic_map_lock));
rcu_assign_pointer(kvm->arch.apic_map, new);
/*
- * Write kvm->arch.apic_map before
- * clearing apic->apic_map_dirty
+ * Write kvm->arch.apic_map before clearing apic->apic_map_dirty.
+ * If another update has come in, leave it DIRTY.
*/
- smp_wmb();
- kvm->arch.apic_map_dirty = false;
+ atomic_cmpxchg_release(&kvm->arch.apic_map_dirty,
+ UPDATE_IN_PROGRESS, CLEAN);
mutex_unlock(&kvm->arch.apic_map_lock);
if (old)
@@ -282,20 +294,20 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
else
static_key_slow_inc(&apic_sw_disabled.key);
- apic->vcpu->kvm->arch.apic_map_dirty = true;
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
{
kvm_lapic_set_reg(apic, APIC_ID, id << 24);
- apic->vcpu->kvm->arch.apic_map_dirty = true;
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
{
kvm_lapic_set_reg(apic, APIC_LDR, id);
- apic->vcpu->kvm->arch.apic_map_dirty = true;
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
@@ -311,7 +323,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
kvm_lapic_set_reg(apic, APIC_ID, id);
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
- apic->vcpu->kvm->arch.apic_map_dirty = true;
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
@@ -1976,7 +1988,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_DFR:
if (!apic_x2apic_mode(apic)) {
kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
- apic->vcpu->kvm->arch.apic_map_dirty = true;
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
} else
ret = 1;
break;
@@ -2232,7 +2244,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
static_key_slow_dec_deferred(&apic_hw_disabled);
} else {
static_key_slow_inc(&apic_hw_disabled.key);
- vcpu->kvm->arch.apic_map_dirty = true;
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
}
@@ -2273,7 +2285,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
if (!apic)
return;
- vcpu->kvm->arch.apic_map_dirty = false;
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
@@ -2567,6 +2578,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
}
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
kvm_recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 0ad06bfe2c2c..444bb9c54548 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -222,7 +222,7 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index fdd05c233308..6d6a0ae7800c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1745,10 +1745,10 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
* Emulate arch specific page modification logging for the
* nested hypervisor
*/
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
{
if (kvm_x86_ops.write_log_dirty)
- return kvm_x86_ops.write_log_dirty(vcpu);
+ return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa);
return 0;
}
@@ -4449,7 +4449,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
rsvd_bits(maxphyaddr, 51);
rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | gbpages_bit_rsvd |
+ gbpages_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index a6d484ea110b..bd70ece1ef8b 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -235,7 +235,7 @@ static inline unsigned FNAME(gpte_access)(u64 gpte)
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu,
struct guest_walker *walker,
- int write_fault)
+ gpa_t addr, int write_fault)
{
unsigned level, index;
pt_element_t pte, orig_pte;
@@ -260,7 +260,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
!(pte & PT_GUEST_DIRTY_MASK)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
#if PTTYPE == PTTYPE_EPT
- if (kvm_arch_write_log_dirty(vcpu))
+ if (kvm_arch_write_log_dirty(vcpu, addr))
return -EINVAL;
#endif
pte |= PT_GUEST_DIRTY_MASK;
@@ -360,7 +360,6 @@ retry_walk:
++walker->level;
do {
- gfn_t real_gfn;
unsigned long host_addr;
pt_access = pte_access;
@@ -375,7 +374,7 @@ retry_walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
nested_access,
&walker->fault);
@@ -389,12 +388,10 @@ retry_walk:
* information to fix the exit_qualification or exit_info_1
* fields.
*/
- if (unlikely(real_gfn == UNMAPPED_GVA))
+ if (unlikely(real_gpa == UNMAPPED_GVA))
return 0;
- real_gfn = gpa_to_gfn(real_gfn);
-
- host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn,
+ host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
&walker->pte_writable[walker->level - 1]);
if (unlikely(kvm_is_error_hva(host_addr)))
goto error;
@@ -457,7 +454,8 @@ retry_walk:
(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
if (unlikely(!accessed_dirty)) {
- ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
+ ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
+ addr, write_fault);
if (unlikely(ret < 0))
goto error;
else if (ret)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 8ccfa4197d9c..c0da4dd78ac5 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3344,7 +3344,7 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
-static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
fastpath_t exit_fastpath;
struct vcpu_svm *svm = to_svm(vcpu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d1af20b050a8..11e4df560018 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4109,7 +4109,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
* CR0_GUEST_HOST_MASK is already set in the original vmcs01
* (KVM doesn't change it);
*/
- vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
vmx_set_cr0(vcpu, vmcs12->host_cr0);
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
@@ -4259,7 +4259,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
*/
vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
- vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
@@ -6079,6 +6079,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
+ if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
+ return -EINVAL;
+
/*
* SMM temporarily disables VMX, so we cannot be in guest mode,
* nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
@@ -6108,9 +6111,16 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (ret)
return ret;
- /* Empty 'VMXON' state is permitted */
- if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
- return 0;
+ /* Empty 'VMXON' state is permitted if no VMCS loaded */
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
+ /* See vmx_has_valid_vmcs12. */
+ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
+ (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
+ (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
+ return -EINVAL;
+ else
+ return 0;
+ }
if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
@@ -6176,6 +6186,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
goto error_guest_mode;
}
+ vmx->nested.has_preemption_timer_deadline = false;
if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
vmx->nested.has_preemption_timer_deadline = true;
vmx->nested.preemption_timer_deadline =
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 758bccc26cf9..197148d76b8f 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -47,6 +47,11 @@ static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
}
+/*
+ * Note: the same condition is checked against the state provided by userspace
+ * in vmx_set_nested_state; if it is satisfied, the nested state must include
+ * the VMCS12.
+ */
static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 5c0ff80b85c0..7a3675fddec2 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -72,11 +72,24 @@ struct loaded_vmcs {
struct vmcs_controls_shadow controls_shadow;
};
+static inline bool is_intr_type(u32 intr_info, u32 type)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK;
+
+ return (intr_info & mask) == (INTR_INFO_VALID_MASK | type);
+}
+
+static inline bool is_intr_type_n(u32 intr_info, u32 type, u8 vector)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK |
+ INTR_INFO_VECTOR_MASK;
+
+ return (intr_info & mask) == (INTR_INFO_VALID_MASK | type | vector);
+}
+
static inline bool is_exception_n(u32 intr_info, u8 vector)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+ return is_intr_type_n(intr_info, INTR_TYPE_HARD_EXCEPTION, vector);
}
static inline bool is_debug(u32 intr_info)
@@ -106,28 +119,23 @@ static inline bool is_gp_fault(u32 intr_info)
static inline bool is_machine_check(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
+ return is_exception_n(intr_info, MC_VECTOR);
}
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
+ return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION);
}
static inline bool is_nmi(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+ return is_intr_type(intr_info, INTR_TYPE_NMI_INTR);
}
static inline bool is_external_intr(u32 intr_info)
{
- return (intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
- == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR);
+ return is_intr_type(intr_info, INTR_TYPE_EXT_INTR);
}
enum vmcs_field_width {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 36c771728c8c..13745f2a5ecd 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -133,9 +133,6 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
-#define KVM_CR4_GUEST_OWNED_BITS \
- (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -4034,9 +4031,9 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
{
- vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
- if (enable_ept)
- vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS;
+ if (!enable_ept)
+ vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
if (is_guest_mode(&vmx->vcpu))
vmx->vcpu.arch.cr4_guest_owned_bits &=
~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
@@ -4333,8 +4330,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
/* 22.2.1, 20.8.1 */
vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
- vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
- vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
+ vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
set_cr4_guest_host_mask(vmx);
@@ -6606,23 +6603,6 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
-static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx)
-{
- u32 host_umwait_control;
-
- if (!vmx_has_waitpkg(vmx))
- return;
-
- host_umwait_control = get_umwait_control_msr();
-
- if (vmx->msr_ia32_umwait_control != host_umwait_control)
- add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL,
- vmx->msr_ia32_umwait_control,
- host_umwait_control, false);
- else
- clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL);
-}
-
static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6728,9 +6708,7 @@ reenter_guest:
pt_guest_enter(vmx);
- if (vcpu_to_pmu(vcpu)->version)
- atomic_switch_perf_msrs(vmx);
- atomic_switch_umwait_control_msr(vmx);
+ atomic_switch_perf_msrs(vmx);
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
@@ -7501,11 +7479,11 @@ static void vmx_flush_log_dirty(struct kvm *kvm)
kvm_flush_pml_buffers(kvm);
}
-static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
+static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
{
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- gpa_t gpa, dst;
+ gpa_t dst;
if (is_guest_mode(vcpu)) {
WARN_ON_ONCE(vmx->nested.pml_full);
@@ -7524,7 +7502,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
return 1;
}
- gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+ gpa &= ~0xFFFull;
dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 8a83b5edc820..639798e4a6ca 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -288,8 +288,6 @@ struct vcpu_vmx {
u64 current_tsc_ratio;
- u32 host_pkru;
-
unsigned long host_debugctlmsr;
/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00c88c2f34e4..88c593f83b28 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -975,6 +975,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
+ if ((cr4 ^ old_cr4) & X86_CR4_LA57)
+ return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
&& !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
@@ -2693,6 +2695,9 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
if (data & 0x30)
return 1;
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
vcpu->arch.apf.msr_en_val = data;
if (!kvm_pv_async_pf_enabled(vcpu)) {
@@ -2856,7 +2861,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
return kvm_set_apic_base(vcpu, msr_info);
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_write(vcpu, msr, data);
case MSR_IA32_TSCDEADLINE:
kvm_set_lapic_tscdeadline_msr(vcpu, data);
@@ -3196,7 +3201,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_APICBASE:
msr_info->data = kvm_get_apic_base(vcpu);
break;
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
case MSR_IA32_TSCDEADLINE:
msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
@@ -4603,7 +4608,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
user_tsc_khz = (u32)arg;
- if (user_tsc_khz >= kvm_max_guest_tsc_khz)
+ if (kvm_has_tsc_control &&
+ user_tsc_khz >= kvm_max_guest_tsc_khz)
goto out;
if (user_tsc_khz == 0)
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
index 54cdefdfb49d..d59f3eb67c8f 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
@@ -76,10 +76,8 @@ void set_default_state(struct kvm_nested_state *state)
void set_default_vmx_state(struct kvm_nested_state *state, int size)
{
memset(state, 0, size);
- state->flags = KVM_STATE_NESTED_GUEST_MODE |
- KVM_STATE_NESTED_RUN_PENDING;
if (have_evmcs)
- state->flags |= KVM_STATE_NESTED_EVMCS;
+ state->flags = KVM_STATE_NESTED_EVMCS;
state->format = 0;
state->size = size;
state->hdr.vmx.vmxon_pa = 0x1000;
@@ -148,6 +146,11 @@ void test_vmx_nested_state(struct kvm_vm *vm)
state->hdr.vmx.smm.flags = 1;
test_nested_state_expect_einval(vm, state);
+ /* Invalid flags are rejected. */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.flags = ~0;
+ test_nested_state_expect_einval(vm, state);
+
/* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */
set_default_vmx_state(state, state_sz);
state->hdr.vmx.vmxon_pa = -1ull;
@@ -185,20 +188,41 @@ void test_vmx_nested_state(struct kvm_vm *vm)
state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
test_nested_state_expect_einval(vm, state);
- /* Size must be large enough to fit kvm_nested_state and vmcs12. */
+ /*
+ * Size must be large enough to fit kvm_nested_state and vmcs12
+ * if VMCS12 physical address is set
+ */
set_default_vmx_state(state, state_sz);
state->size = sizeof(*state);
+ state->flags = 0;
+ test_nested_state_expect_einval(vm, state);
+
+ set_default_vmx_state(state, state_sz);
+ state->size = sizeof(*state);
+ state->flags = 0;
+ state->hdr.vmx.vmcs12_pa = -1;
test_nested_state(vm, state);
- /* vmxon_pa cannot be the same address as vmcs_pa. */
+ /*
+ * KVM_SET_NESTED_STATE succeeds with invalid VMCS
+ * contents but L2 not running.
+ */
set_default_vmx_state(state, state_sz);
- state->hdr.vmx.vmxon_pa = 0;
- state->hdr.vmx.vmcs12_pa = 0;
+ state->flags = 0;
+ test_nested_state(vm, state);
+
+ /* Invalid flags are rejected, even if no VMCS loaded. */
+ set_default_vmx_state(state, state_sz);
+ state->size = sizeof(*state);
+ state->flags = 0;
+ state->hdr.vmx.vmcs12_pa = -1;
+ state->hdr.vmx.flags = ~0;
test_nested_state_expect_einval(vm, state);
- /* The revision id for vmcs12 must be VMCS12_REVISION. */
+ /* vmxon_pa cannot be the same address as vmcs_pa. */
set_default_vmx_state(state, state_sz);
- set_revision_id_for_vmcs12(state, 0);
+ state->hdr.vmx.vmxon_pa = 0;
+ state->hdr.vmx.vmcs12_pa = 0;
test_nested_state_expect_einval(vm, state);
/*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a852af5c3214..0a68c9d3d3ab 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3350,7 +3350,8 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
if (kvm_sigmask.len != sizeof(compat_sigset_t))
goto out;
r = -EFAULT;
- if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
+ if (get_compat_sigset(&sigset,
+ (compat_sigset_t __user *)sigmask_arg->sigset))
goto out;
r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
} else