summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/powerpc/kvm/book3s_64_vio.c6
-rw-r--r--arch/powerpc/kvm/book3s_hv.c4
-rw-r--r--arch/x86/include/asm/kvm_host.h1
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1
-rw-r--r--arch/x86/kvm/hyperv.c11
-rw-r--r--arch/x86/kvm/lapic.c73
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.c1
-rw-r--r--arch/x86/kvm/vmx/nested.c4
-rw-r--r--arch/x86/kvm/vmx/vmenter.S12
-rw-r--r--arch/x86/kvm/vmx/vmx.c7
-rw-r--r--arch/x86/kvm/x86.c36
-rw-r--r--arch/x86/kvm/x86.h2
13 files changed, 120 insertions, 42 deletions
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index f02b04973710..f100e331e69b 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -543,14 +543,14 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
if (ret != H_SUCCESS)
return ret;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
ret = kvmppc_tce_validate(stt, tce);
if (ret != H_SUCCESS)
- return ret;
+ goto unlock_exit;
dir = iommu_tce_direction(tce);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
-
if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
ret = H_PARAMETER;
goto unlock_exit;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 06964350b97a..b2b29d4f9842 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3423,7 +3423,9 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
- mtspr(SPRN_PSSCR, host_psscr);
+ /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
+ mtspr(SPRN_PSSCR, host_psscr |
+ (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, host_hfscr);
mtspr(SPRN_CIABR, host_ciabr);
mtspr(SPRN_DAWR, host_dawr);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a9d03af34030..c79abe7ca093 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -295,6 +295,7 @@ union kvm_mmu_extended_role {
unsigned int valid:1;
unsigned int execonly:1;
unsigned int cr0_pg:1;
+ unsigned int cr4_pae:1;
unsigned int cr4_pse:1;
unsigned int cr4_pke:1;
unsigned int cr4_smap:1;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index dabfcf7c3941..7a0e64ccd6ff 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -381,6 +381,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
#define KVM_STATE_NESTED_GUEST_MODE 0x00000001
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 421899f6ad7b..cc24b3a32c44 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1371,7 +1371,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
valid_bank_mask = BIT_ULL(0);
sparse_banks[0] = flush.processor_mask;
- all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
+
+ /*
+ * Work around possible WS2012 bug: it sends hypercalls
+ * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
+ * while also expecting us to flush something and crashing if
+ * we don't. Let's treat processor_mask == 0 same as
+ * HV_FLUSH_ALL_PROCESSORS.
+ */
+ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
+ flush.processor_mask == 0;
} else {
if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex,
sizeof(flush_ex))))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9bf70cf84564..bd13fdddbdc4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -70,7 +70,6 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
-static bool lapic_timer_advance_adjust_done = false;
#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -1482,14 +1481,32 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
return false;
}
+static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
+{
+ u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+
+ /*
+ * If the guest TSC is running at a different ratio than the host, then
+ * convert the delay to nanoseconds to achieve an accurate delay. Note
+ * that __delay() uses delay_tsc whenever the hardware has TSC, thus
+ * always for VMX enabled hardware.
+ */
+ if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) {
+ __delay(min(guest_cycles,
+ nsec_to_cycles(vcpu, timer_advance_ns)));
+ } else {
+ u64 delay_ns = guest_cycles * 1000000ULL;
+ do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
+ ndelay(min_t(u32, delay_ns, timer_advance_ns));
+ }
+}
+
void wait_lapic_expire(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
u64 guest_tsc, tsc_deadline, ns;
- if (!lapic_in_kernel(vcpu))
- return;
-
if (apic->lapic_timer.expired_tscdeadline == 0)
return;
@@ -1501,33 +1518,37 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
- /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
if (guest_tsc < tsc_deadline)
- __delay(min(tsc_deadline - guest_tsc,
- nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
+ __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
- if (!lapic_timer_advance_adjust_done) {
+ if (!apic->lapic_timer.timer_advance_adjust_done) {
/* too early */
if (guest_tsc < tsc_deadline) {
ns = (tsc_deadline - guest_tsc) * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- lapic_timer_advance_ns -= min((unsigned int)ns,
- lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns -= min((u32)ns,
+ timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
} else {
/* too late */
ns = (guest_tsc - tsc_deadline) * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- lapic_timer_advance_ns += min((unsigned int)ns,
- lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns += min((u32)ns,
+ timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
}
if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- lapic_timer_advance_adjust_done = true;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ if (unlikely(timer_advance_ns > 5000)) {
+ timer_advance_ns = 0;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ }
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}
}
static void start_sw_tscdeadline(struct kvm_lapic *apic)
{
- u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
u64 ns = 0;
ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -1542,13 +1563,15 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
now = ktime_get();
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- if (likely(tscdeadline > guest_tsc)) {
- ns = (tscdeadline - guest_tsc) * 1000000ULL;
- do_div(ns, this_tsc_khz);
+
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+
+ if (likely(tscdeadline > guest_tsc) &&
+ likely(ns > apic->lapic_timer.timer_advance_ns)) {
expire = ktime_add_ns(now, ns);
- expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
- hrtimer_start(&apic->lapic_timer.timer,
- expire, HRTIMER_MODE_ABS_PINNED);
+ expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
+ hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED);
} else
apic_timer_expired(apic);
@@ -2255,7 +2278,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
return HRTIMER_NORESTART;
}
-int kvm_create_lapic(struct kvm_vcpu *vcpu)
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
{
struct kvm_lapic *apic;
@@ -2279,6 +2302,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_PINNED);
apic->lapic_timer.timer.function = apic_timer_fn;
+ if (timer_advance_ns == -1) {
+ apic->lapic_timer.timer_advance_ns = 1000;
+ apic->lapic_timer.timer_advance_adjust_done = false;
+ } else {
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ }
+
/*
* APIC is created enabled. This will prevent kvm_lapic_set_base from
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ff6ef9c3d760..d6d049ba3045 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,8 +31,10 @@ struct kvm_timer {
u32 timer_mode_mask;
u64 tscdeadline;
u64 expired_tscdeadline;
+ u32 timer_advance_ns;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
+ bool timer_advance_adjust_done;
};
struct kvm_lapic {
@@ -62,7 +64,7 @@ struct kvm_lapic {
struct dest_map;
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e10962dfc203..d9c7b45d231f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4781,6 +4781,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
union kvm_mmu_extended_role ext = {0};
ext.cr0_pg = !!is_paging(vcpu);
+ ext.cr4_pae = !!is_pae(vcpu);
ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
ext.cr4_pse = !!is_pse(vcpu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6401eb7ef19c..0c601d079cd2 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5423,7 +5423,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return ret;
/* Empty 'VMXON' state is permitted */
- if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
return 0;
if (kvm_state->vmx.vmcs_pa != -1ull) {
@@ -5467,7 +5467,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
vmcs12->vmcs_link_pointer != -1ull) {
struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
- if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
+ if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12))
return -EINVAL;
if (copy_from_user(shadow_vmcs12,
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 7b272738c576..d4cb1945b2e3 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -3,6 +3,7 @@
#include <asm/asm.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
#define WORD_SIZE (BITS_PER_LONG / 8)
@@ -77,6 +78,17 @@ ENDPROC(vmx_vmenter)
* referred to by VMCS.HOST_RIP.
*/
ENTRY(vmx_vmexit)
+#ifdef CONFIG_RETPOLINE
+ ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
+ /* Preserve guest's RAX, it's used to stuff the RSB. */
+ push %_ASM_AX
+
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+
+ pop %_ASM_AX
+.Lvmexit_skip_rsb:
+#endif
ret
ENDPROC(vmx_vmexit)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b4e7d645275a..0c955bb286ff 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6462,9 +6462,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
- /* Eliminate branch target predictions from guest mode */
- vmexit_fill_RSB();
-
/* All fields are clean at this point */
if (static_branch_unlikely(&enable_evmcs))
current_evmcs->hv_clean_fields |=
@@ -7032,6 +7029,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
+ struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
if (kvm_mwait_in_guest(vcpu->kvm))
return -EOPNOTSUPP;
@@ -7040,7 +7038,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
tscl = rdtsc();
guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
- lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+ lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
+ ktimer->timer_advance_ns);
if (delta_tsc > lapic_timer_advance_cycles)
delta_tsc -= lapic_timer_advance_cycles;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a0d1fc80ac5a..b5edc8e3ce1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -136,10 +136,14 @@ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
-/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 1000;
+/*
+ * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
+ * adaptive tuning starting from default advancment of 1000ns. '0' disables
+ * advancement entirely. Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows priveleged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
@@ -6535,6 +6539,12 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
}
EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
{
vcpu->arch.pio.count = 0;
@@ -6551,12 +6561,23 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
size, port, &val, 1);
+ if (ret)
+ return ret;
- if (!ret) {
+ /*
+ * Workaround userspace that relies on old KVM behavior of %rip being
+ * incremented prior to exiting to userspace to handle "OUT 0x7e".
+ */
+ if (port == 0x7e &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
+ vcpu->arch.complete_userspace_io =
+ complete_fast_pio_out_port_0x7e;
+ kvm_skip_emulated_instruction(vcpu);
+ } else {
vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_out;
}
- return ret;
+ return 0;
}
static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
@@ -7873,7 +7894,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
trace_kvm_entry(vcpu->vcpu_id);
- if (lapic_timer_advance_ns)
+ if (lapic_in_kernel(vcpu) &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
wait_lapic_expire(vcpu);
guest_enter_irqoff();
@@ -9061,7 +9083,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- r = kvm_create_lapic(vcpu);
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
if (r < 0)
goto fail_mmu_destroy;
} else
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index aedc5d0d4989..534d3f28bb01 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,8 +294,6 @@ extern u64 kvm_supported_xcr0(void);
extern unsigned int min_timer_period_us;
-extern unsigned int lapic_timer_advance_ns;
-
extern bool enable_vmware_backdoor;
extern struct static_key kvm_no_apic_vcpu;