summary | refs | log | tree | commit | diff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- arch/x86/kvm/Kconfig 1
-rw-r--r-- arch/x86/kvm/Makefile 2
-rw-r--r-- arch/x86/kvm/cpuid.c 2
-rw-r--r-- arch/x86/kvm/emulate.c 48
-rw-r--r-- arch/x86/kvm/i8254.c 64
-rw-r--r-- arch/x86/kvm/i8254.h 6
-rw-r--r-- arch/x86/kvm/i8259.c 18
-rw-r--r-- arch/x86/kvm/irq.h 2
-rw-r--r-- arch/x86/kvm/kvm_timer.h 18
-rw-r--r-- arch/x86/kvm/lapic.c 264
-rw-r--r-- arch/x86/kvm/lapic.h 58
-rw-r--r-- arch/x86/kvm/mmu.c 144
-rw-r--r-- arch/x86/kvm/mmu_audit.c 8
-rw-r--r-- arch/x86/kvm/paging_tmpl.h 8
-rw-r--r-- arch/x86/kvm/pmu.c 2
-rw-r--r-- arch/x86/kvm/svm.c 5
-rw-r--r-- arch/x86/kvm/timer.c 47
-rw-r--r-- arch/x86/kvm/vmx.c 19
-rw-r--r-- arch/x86/kvm/x86.c 160
-rw-r--r-- arch/x86/kvm/x86.h 1
20 files changed, 475 insertions, 402 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a28f338843ea..45c044f0fff7 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -37,6 +37,7 @@ config KVM
select TASK_DELAY_ACCT
select PERF_EVENTS
select HAVE_KVM_MSI
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4f579e8dcacf..04d30401c5cb 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o cpuid.o pmu.o
+ i8254.o cpuid.o pmu.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0595f1397b7c..b496da684bd6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
case 7: {
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* Mask ebx against host capbability word 9 */
+ /* Mask ebx against host capability word 9 */
if (index == 0) {
entry->ebx &= kvm_supported_word9_x86_features;
cpuid_mask(&entry->ebx, 9);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a3b57a27be88..e8fb6c5c6c0a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -655,7 +655,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
goto bad;
} else {
- /* exapand-down segment */
+ /* expand-down segment */
if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
goto bad;
lim = desc.d ? 0xffffffff : 0xffff;
@@ -1179,24 +1179,21 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
int rc;
struct read_cache *mc = &ctxt->mem_read;
- while (size) {
- int n = min(size, 8u);
- size -= n;
- if (mc->pos < mc->end)
- goto read_cached;
+ if (mc->pos < mc->end)
+ goto read_cached;
- rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
- &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- mc->end += n;
+ WARN_ON((mc->end + size) >= sizeof(mc->data));
- read_cached:
- memcpy(dest, mc->data + mc->pos, n);
- mc->pos += n;
- dest += n;
- addr += n;
- }
+ rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+ &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ mc->end += size;
+
+read_cached:
+ memcpy(dest, mc->data + mc->pos, size);
+ mc->pos += size;
return X86EMUL_CONTINUE;
}
@@ -1396,7 +1393,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
err_code = selector & 0xfffc;
err_vec = GP_VECTOR;
- /* can't load system descriptor into segment selecor */
+ /* can't load system descriptor into segment selector */
if (seg <= VCPU_SREG_GS && !seg_desc.s)
goto exception;
@@ -2050,12 +2047,6 @@ static void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct desc_struct *cs, struct desc_struct *ss)
{
- u16 selector;
-
- memset(cs, 0, sizeof(struct desc_struct));
- ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
- memset(ss, 0, sizeof(struct desc_struct));
-
cs->l = 0; /* will be adjusted later */
set_desc_base(cs, 0); /* flat segment */
cs->g = 1; /* 4kb granularity */
@@ -2065,6 +2056,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
cs->dpl = 0; /* will be adjusted later */
cs->p = 1;
cs->d = 1;
+ cs->avl = 0;
set_desc_base(ss, 0); /* flat segment */
set_desc_limit(ss, 0xfffff); /* 4GB limit */
@@ -2074,6 +2066,8 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
ss->d = 1; /* 32bit stack segment */
ss->dpl = 0;
ss->p = 1;
+ ss->l = 0;
+ ss->avl = 0;
}
static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
@@ -2410,7 +2404,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
/*
- * Now load segment descriptors. If fault happenes at this stage
+ * Now load segment descriptors. If fault happens at this stage
* it is handled in a context of new task
*/
ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
@@ -2652,7 +2646,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
*
* 1. jmp/call/int to task gate: Check against DPL of the task gate
* 2. Exception/IRQ/iret: No check is performed
- * 3. jmp/call to TSS: Check agains DPL of the TSS
+ * 3. jmp/call to TSS: Check against DPL of the TSS
*/
if (reason == TASK_SWITCH_GATE) {
if (idt_index != -1) {
@@ -2693,7 +2687,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
/* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
+ note that old_tss_sel is not used after this point */
if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
old_tss_sel = 0xffff;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index adba28f88d1a..11300d2fa714 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -108,7 +108,7 @@ static s64 __kpit_elapsed(struct kvm *kvm)
ktime_t remaining;
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
- if (!ps->pit_timer.period)
+ if (!ps->period)
return 0;
/*
@@ -120,9 +120,9 @@ static s64 __kpit_elapsed(struct kvm *kvm)
* itself with the initial count and continues counting
* from there.
*/
- remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
- elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
- elapsed = mod_64(elapsed, ps->pit_timer.period);
+ remaining = hrtimer_get_remaining(&ps->timer);
+ elapsed = ps->period - ktime_to_ns(remaining);
+ elapsed = mod_64(elapsed, ps->period);
return elapsed;
}
@@ -238,12 +238,12 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
int value;
spin_lock(&ps->inject_lock);
- value = atomic_dec_return(&ps->pit_timer.pending);
+ value = atomic_dec_return(&ps->pending);
if (value < 0)
/* spurious acks can be generated if, for example, the
* PIC is being reset. Handle it gracefully here
*/
- atomic_inc(&ps->pit_timer.pending);
+ atomic_inc(&ps->pending);
else if (value > 0)
/* in this case, we had multiple outstanding pit interrupts
* that we needed to inject. Reinject
@@ -261,28 +261,17 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_is_bsp(vcpu) || !pit)
return;
- timer = &pit->pit_state.pit_timer.timer;
+ timer = &pit->pit_state.timer;
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
static void destroy_pit_timer(struct kvm_pit *pit)
{
- hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+ hrtimer_cancel(&pit->pit_state.timer);
flush_kthread_work(&pit->expired);
}
-static bool kpit_is_periodic(struct kvm_timer *ktimer)
-{
- struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
- pit_timer);
- return ps->is_periodic;
-}
-
-static struct kvm_timer_ops kpit_ops = {
- .is_periodic = kpit_is_periodic,
-};
-
static void pit_do_work(struct kthread_work *work)
{
struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
@@ -322,16 +311,16 @@ static void pit_do_work(struct kthread_work *work)
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+ struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
+ struct kvm_pit *pt = ps->kvm->arch.vpit;
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
+ if (ps->reinject || !atomic_read(&ps->pending)) {
+ atomic_inc(&ps->pending);
queue_kthread_work(&pt->worker, &pt->expired);
}
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ if (ps->is_periodic) {
+ hrtimer_add_expires_ns(&ps->timer, ps->period);
return HRTIMER_RESTART;
} else
return HRTIMER_NORESTART;
@@ -340,7 +329,6 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
{
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
- struct kvm_timer *pt = &ps->pit_timer;
s64 interval;
if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
@@ -351,19 +339,18 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
pr_debug("create pit timer, interval is %llu nsec\n", interval);
/* TODO The new value only affected after the retriggered */
- hrtimer_cancel(&pt->timer);
+ hrtimer_cancel(&ps->timer);
flush_kthread_work(&ps->pit->expired);
- pt->period = interval;
+ ps->period = interval;
ps->is_periodic = is_period;
- pt->timer.function = pit_timer_fn;
- pt->t_ops = &kpit_ops;
- pt->kvm = ps->pit->kvm;
+ ps->timer.function = pit_timer_fn;
+ ps->kvm = ps->pit->kvm;
- atomic_set(&pt->pending, 0);
+ atomic_set(&ps->pending, 0);
ps->irq_ack = 1;
- hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+ hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
}
@@ -639,7 +626,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
}
mutex_unlock(&pit->pit_state.lock);
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ atomic_set(&pit->pit_state.pending, 0);
pit->pit_state.irq_ack = 1;
}
@@ -648,7 +635,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
if (!mask) {
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ atomic_set(&pit->pit_state.pending, 0);
pit->pit_state.irq_ack = 1;
}
}
@@ -706,12 +693,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pit_state = &pit->pit_state;
pit_state->pit = pit;
- hrtimer_init(&pit_state->pit_timer.timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->pit_timer.reinject = true;
+ pit_state->reinject = true;
mutex_unlock(&pit->pit_state.lock);
kvm_pit_reset(pit);
@@ -761,7 +747,7 @@ void kvm_free_pit(struct kvm *kvm)
kvm_unregister_irq_ack_notifier(kvm,
&kvm->arch.vpit->pit_state.irq_ack_notifier);
mutex_lock(&kvm->arch.vpit->pit_state.lock);
- timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
+ timer = &kvm->arch.vpit->pit_state.timer;
hrtimer_cancel(timer);
flush_kthread_work(&kvm->arch.vpit->expired);
kthread_stop(kvm->arch.vpit->worker_task);
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index fdf40425ea1d..dd1b16b611b0 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -24,8 +24,12 @@ struct kvm_kpit_channel_state {
struct kvm_kpit_state {
struct kvm_kpit_channel_state channels[3];
u32 flags;
- struct kvm_timer pit_timer;
bool is_periodic;
+ s64 period; /* unit: ns */
+ struct hrtimer timer;
+ atomic_t pending; /* accumulated triggered timers */
+ bool reinject;
+ struct kvm *kvm;
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index e498b18f010c..90c84f947d45 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -190,17 +190,17 @@ void kvm_pic_update_irq(struct kvm_pic *s)
int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
{
- int ret = -1;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
pic_lock(s);
- if (irq >= 0 && irq < PIC_NUM_PINS) {
- int irq_level = __kvm_irq_line_state(&s->irq_states[irq],
- irq_source_id, level);
- ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
- pic_update_irq(s);
- trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
- s->pics[irq >> 3].imr, ret == 0);
- }
+ irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+ irq_source_id, level);
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
+ pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
pic_unlock(s);
return ret;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2086f2bfba33..2d03568e9498 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -70,7 +70,7 @@ struct kvm_pic {
struct kvm_io_device dev_slave;
struct kvm_io_device dev_eclr;
void (*ack_notifier)(void *opaque, int irq);
- unsigned long irq_states[16];
+ unsigned long irq_states[PIC_NUM_PINS];
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
deleted file mode 100644
index 497dbaa366d4..000000000000
--- a/arch/x86/kvm/kvm_timer.h
+++ /dev/null
@@ -1,18 +0,0 @@
-
-struct kvm_timer {
- struct hrtimer timer;
- s64 period; /* unit: ns */
- u32 timer_mode_mask;
- u64 tscdeadline;
- atomic_t pending; /* accumulated triggered timers */
- bool reinject;
- struct kvm_timer_ops *t_ops;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
-};
-
-struct kvm_timer_ops {
- bool (*is_periodic)(struct kvm_timer *);
-};
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ce878788a39f..18d149d80209 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -34,6 +34,7 @@
#include <asm/current.h>
#include <asm/apicdef.h>
#include <linux/atomic.h>
+#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
#include "irq.h"
#include "trace.h"
@@ -72,11 +73,6 @@
static unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
-static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
-{
- return *((u32 *) (apic->regs + reg_off));
-}
-
static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
*((u32 *) (apic->regs + reg_off)) = val;
@@ -117,19 +113,23 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
-static inline int apic_hw_enabled(struct kvm_lapic *apic)
-{
- return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
-}
+struct static_key_deferred apic_hw_disabled __read_mostly;
+struct static_key_deferred apic_sw_disabled __read_mostly;
-static inline int apic_sw_enabled(struct kvm_lapic *apic)
+static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
{
- return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+ if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) {
+ if (val & APIC_SPIV_APIC_ENABLED)
+ static_key_slow_dec_deferred(&apic_sw_disabled);
+ else
+ static_key_slow_inc(&apic_sw_disabled.key);
+ }
+ apic_set_reg(apic, APIC_SPIV, val);
}
static inline int apic_enabled(struct kvm_lapic *apic)
{
- return apic_sw_enabled(apic) && apic_hw_enabled(apic);
+ return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
}
#define LVT_MASK \
@@ -141,34 +141,34 @@ static inline int apic_enabled(struct kvm_lapic *apic)
static inline int kvm_apic_id(struct kvm_lapic *apic)
{
- return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+ return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
{
- return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+ return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
}
static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
{
- return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+ return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
}
static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
{
- return ((apic_get_reg(apic, APIC_LVTT) &
+ return ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
}
static inline int apic_lvtt_period(struct kvm_lapic *apic)
{
- return ((apic_get_reg(apic, APIC_LVTT) &
+ return ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
}
static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
{
- return ((apic_get_reg(apic, APIC_LVTT) &
+ return ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) ==
APIC_LVT_TIMER_TSCDEADLINE);
}
@@ -184,7 +184,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *feat;
u32 v = APIC_VERSION;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!kvm_vcpu_has_lapic(vcpu))
return;
feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@ -285,7 +285,6 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
/* This may race with setting of irr in __apic_accept_irq() and
@@ -293,9 +292,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
* will cause vmexit immediately and the value will be recalculated
* on the next vmentry.
*/
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return 0;
- highest_irr = apic_find_highest_irr(apic);
+ highest_irr = apic_find_highest_irr(vcpu->arch.apic);
return highest_irr;
}
@@ -378,8 +377,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
u32 tpr, isrv, ppr, old_ppr;
int isr;
- old_ppr = apic_get_reg(apic, APIC_PROCPRI);
- tpr = apic_get_reg(apic, APIC_TASKPRI);
+ old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI);
+ tpr = kvm_apic_get_reg(apic, APIC_TASKPRI);
isr = apic_find_highest_isr(apic);
isrv = (isr != -1) ? isr : 0;
@@ -415,13 +414,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
u32 logical_id;
if (apic_x2apic_mode(apic)) {
- logical_id = apic_get_reg(apic, APIC_LDR);
+ logical_id = kvm_apic_get_reg(apic, APIC_LDR);
return logical_id & mda;
}
- logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
+ logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
- switch (apic_get_reg(apic, APIC_DFR)) {
+ switch (kvm_apic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
if (logical_id & mda)
result = 1;
@@ -433,7 +432,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
break;
default:
apic_debug("Bad DFR vcpu %d: %08x\n",
- apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+ apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
break;
}
@@ -591,7 +590,7 @@ static int apic_set_eoi(struct kvm_lapic *apic)
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+ if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
int trigger_mode;
if (apic_test_vector(vector, apic->regs + APIC_TMR))
@@ -606,8 +605,8 @@ static int apic_set_eoi(struct kvm_lapic *apic)
static void apic_send_ipi(struct kvm_lapic *apic)
{
- u32 icr_low = apic_get_reg(apic, APIC_ICR);
- u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+ u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
+ u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2);
struct kvm_lapic_irq irq;
irq.vector = icr_low & APIC_VECTOR_MASK;
@@ -642,7 +641,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
ASSERT(apic != NULL);
/* if initial count is 0, current count should also be 0 */
- if (apic_get_reg(apic, APIC_TMICT) == 0)
+ if (kvm_apic_get_reg(apic, APIC_TMICT) == 0)
return 0;
remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
@@ -696,13 +695,15 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
val = apic_get_tmcct(apic);
break;
-
+ case APIC_PROCPRI:
+ apic_update_ppr(apic);
+ val = kvm_apic_get_reg(apic, offset);
+ break;
case APIC_TASKPRI:
report_tpr_access(apic, false);
/* fall thru */
default:
- apic_update_ppr(apic);
- val = apic_get_reg(apic, offset);
+ val = kvm_apic_get_reg(apic, offset);
break;
}
@@ -719,7 +720,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
{
unsigned char alignment = offset & 0xf;
u32 result;
- /* this bitmask has a bit cleared for each reserver register */
+ /* this bitmask has a bit cleared for each reserved register */
static const u64 rmask = 0x43ff01ffffffe70cULL;
if ((alignment + len) > 4) {
@@ -754,7 +755,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
{
- return apic_hw_enabled(apic) &&
+ return kvm_apic_hw_enabled(apic) &&
addr >= apic->base_address &&
addr < apic->base_address + LAPIC_MMIO_LENGTH;
}
@@ -777,7 +778,7 @@ static void update_divide_count(struct kvm_lapic *apic)
{
u32 tmp1, tmp2, tdcr;
- tdcr = apic_get_reg(apic, APIC_TDCR);
+ tdcr = kvm_apic_get_reg(apic, APIC_TDCR);
tmp1 = tdcr & 0xf;
tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
apic->divide_count = 0x1 << (tmp2 & 0x7);
@@ -792,9 +793,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
- /* lapic timer in oneshot or peroidic mode */
+ /* lapic timer in oneshot or periodic mode */
now = apic->lapic_timer.timer.base->get_time();
- apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
+ apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count;
if (!apic->lapic_timer.period)
@@ -826,7 +827,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
"timer initial count 0x%x, period %lldns, "
"expire @ 0x%016" PRIx64 ".\n", __func__,
APIC_BUS_CYCLE_NS, ktime_to_ns(now),
- apic_get_reg(apic, APIC_TMICT),
+ kvm_apic_get_reg(apic, APIC_TMICT),
apic->lapic_timer.period,
ktime_to_ns(ktime_add_ns(now,
apic->lapic_timer.period)));
@@ -858,7 +859,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
{
- int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+ int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0));
if (apic_lvt_nmi_mode(lvt0_val)) {
if (!nmi_wd_enabled) {
@@ -909,15 +910,15 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_SPIV: {
u32 mask = 0x3ff;
- if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
mask |= APIC_SPIV_DIRECTED_EOI;
- apic_set_reg(apic, APIC_SPIV, val & mask);
+ apic_set_spiv(apic, val & mask);
if (!(val & APIC_SPIV_APIC_ENABLED)) {
int i;
u32 lvt_val;
for (i = 0; i < APIC_LVT_NUM; i++) {
- lvt_val = apic_get_reg(apic,
+ lvt_val = kvm_apic_get_reg(apic,
APIC_LVTT + 0x10 * i);
apic_set_reg(apic, APIC_LVTT + 0x10 * i,
lvt_val | APIC_LVT_MASKED);
@@ -946,7 +947,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVT1:
case APIC_LVTERR:
/* TODO: Check vector */
- if (!apic_sw_enabled(apic))
+ if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
@@ -955,12 +956,12 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
case APIC_LVTT:
- if ((apic_get_reg(apic, APIC_LVTT) &
+ if ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) !=
(val & apic->lapic_timer.timer_mode_mask))
hrtimer_cancel(&apic->lapic_timer.timer);
- if (!apic_sw_enabled(apic))
+ if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
apic_set_reg(apic, APIC_LVTT, val);
@@ -1039,24 +1040,30 @@ static int apic_mmio_write(struct kvm_io_device *this,
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (apic)
+ if (kvm_vcpu_has_lapic(vcpu))
apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
}
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
if (!vcpu->arch.apic)
return;
- hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
+ static_key_slow_dec_deferred(&apic_hw_disabled);
+
+ if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED))
+ static_key_slow_dec_deferred(&apic_sw_disabled);
- if (vcpu->arch.apic->regs)
- free_page((unsigned long)vcpu->arch.apic->regs);
+ if (apic->regs)
+ free_page((unsigned long)apic->regs);
- kfree(vcpu->arch.apic);
+ kfree(apic);
}
/*
@@ -1068,10 +1075,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- return 0;
- if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+ if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+ apic_lvtt_period(apic))
return 0;
return apic->lapic_timer.tscdeadline;
@@ -1080,10 +1086,9 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- return;
- if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+ if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+ apic_lvtt_period(apic))
return;
hrtimer_cancel(&apic->lapic_timer.timer);
@@ -1095,20 +1100,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return;
+
apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (apic_get_reg(apic, APIC_TASKPRI) & 4));
+ | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
u64 tpr;
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return 0;
- tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
+
+ tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
return (tpr & 0xf0) >> 4;
}
@@ -1123,6 +1129,14 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
return;
}
+ /* update jump label if enable bit changes */
+ if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) {
+ if (value & MSR_IA32_APICBASE_ENABLE)
+ static_key_slow_dec_deferred(&apic_hw_disabled);
+ else
+ static_key_slow_inc(&apic_hw_disabled.key);
+ }
+
if (!kvm_vcpu_is_bsp(apic->vcpu))
value &= ~MSR_IA32_APICBASE_BSP;
@@ -1164,7 +1178,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_set_reg(apic, APIC_DFR, 0xffffffffU);
- apic_set_reg(apic, APIC_SPIV, 0xff);
+ apic_set_spiv(apic, 0xff);
apic_set_reg(apic, APIC_TASKPRI, 0);
apic_set_reg(apic, APIC_LDR, 0);
apic_set_reg(apic, APIC_ESR, 0);
@@ -1183,7 +1197,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
if (kvm_vcpu_is_bsp(vcpu))
- vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ kvm_lapic_set_base(vcpu,
+ vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
@@ -1196,45 +1211,34 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
vcpu->arch.apic_base, apic->base_address);
}
-bool kvm_apic_present(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
-}
-
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
-{
- return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
-}
-
/*
*----------------------------------------------------------------------
* timer interface
*----------------------------------------------------------------------
*/
-static bool lapic_is_periodic(struct kvm_timer *ktimer)
+static bool lapic_is_periodic(struct kvm_lapic *apic)
{
- struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
- lapic_timer);
return apic_lvtt_period(apic);
}
int apic_has_pending_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *lapic = vcpu->arch.apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
- return atomic_read(&lapic->lapic_timer.pending);
+ if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
+ apic_lvt_enabled(apic, APIC_LVTT))
+ return atomic_read(&apic->lapic_timer.pending);
return 0;
}
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
{
- u32 reg = apic_get_reg(apic, lvt_type);
+ u32 reg = kvm_apic_get_reg(apic, lvt_type);
int vector, mode, trig_mode;
- if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
vector = reg & APIC_VECTOR_MASK;
mode = reg & APIC_MODE_MASK;
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
@@ -1251,15 +1255,40 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
kvm_apic_local_deliver(apic, APIC_LVT0);
}
-static struct kvm_timer_ops lapic_timer_ops = {
- .is_periodic = lapic_is_periodic,
-};
-
static const struct kvm_io_device_ops apic_mmio_ops = {
.read = apic_mmio_read,
.write = apic_mmio_write,
};
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ wait_queue_head_t *q = &vcpu->wq;
+
+ /*
+ * There is a race window between reading and incrementing, but we do
+ * not care about potentially losing timer events in the !reinject
+ * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+ * in vcpu_enter_guest.
+ */
+ if (!atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ /* FIXME: this code should not know anything about vcpus */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ }
+
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+
+ if (lapic_is_periodic(apic)) {
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -1283,14 +1312,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
- apic->lapic_timer.timer.function = kvm_timer_fn;
- apic->lapic_timer.t_ops = &lapic_timer_ops;
- apic->lapic_timer.kvm = vcpu->kvm;
- apic->lapic_timer.vcpu = vcpu;
+ apic->lapic_timer.timer.function = apic_timer_fn;
- apic->base_address = APIC_DEFAULT_PHYS_BASE;
- vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
+ /*
+ * APIC is created enabled. This will prevent kvm_lapic_set_base from
+ * thinking that APIC state has changed.
+ */
+ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
+ kvm_lapic_set_base(vcpu,
+ APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
+ static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_lapic_reset(vcpu);
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
@@ -1306,23 +1338,23 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
- if (!apic || !apic_enabled(apic))
+ if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
return -1;
apic_update_ppr(apic);
highest_irr = apic_find_highest_irr(apic);
if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+ ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI)))
return -1;
return highest_irr;
}
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
- u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
+ u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
- if (!apic_hw_enabled(vcpu->arch.apic))
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
r = 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1334,7 +1366,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
+ if (!kvm_vcpu_has_lapic(vcpu))
+ return;
+
+ if (atomic_read(&apic->lapic_timer.pending) > 0) {
if (kvm_apic_local_deliver(apic, APIC_LVTT))
atomic_dec(&apic->lapic_timer.pending);
}
@@ -1354,12 +1389,15 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
return vector;
}
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- apic->base_address = vcpu->arch.apic_base &
- MSR_IA32_APICBASE_BASE;
+ kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
+ /* set SPIV separately to get count of SW disabled APICs right */
+ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
@@ -1374,13 +1412,12 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
struct hrtimer *timer;
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return;
- timer = &apic->lapic_timer.timer;
+ timer = &vcpu->arch.apic->lapic_timer.timer;
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
@@ -1478,7 +1515,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+ tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff;
max_irr = apic_find_highest_irr(apic);
if (max_irr < 0)
max_irr = 0;
@@ -1537,7 +1574,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!kvm_vcpu_has_lapic(vcpu))
return 1;
/* if this is ICR write vector before command */
@@ -1551,7 +1588,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 low, high = 0;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!kvm_vcpu_has_lapic(vcpu))
return 1;
if (apic_reg_read(apic, reg, 4, &low))
@@ -1576,3 +1613,10 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
addr);
}
+
+void kvm_lapic_init(void)
+{
+ /* do not patch jump label more than once per second */
+ jump_label_rate_limit(&apic_hw_disabled, HZ);
+ jump_label_rate_limit(&apic_sw_disabled, HZ);
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4af5405ae1e2..615a8b030168 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,10 +2,17 @@
#define __KVM_X86_LAPIC_H
#include "iodev.h"
-#include "kvm_timer.h"
#include <linux/kvm_host.h>
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ u32 timer_mode_mask;
+ u64 tscdeadline;
+ atomic_t pending; /* accumulated triggered timers */
+};
+
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
@@ -47,9 +54,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-bool kvm_apic_present(struct kvm_vcpu *vcpu);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -71,4 +77,48 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
}
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
+void kvm_lapic_init(void);
+
+static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+ return *((u32 *) (apic->regs + reg_off));
+}
+
+extern struct static_key kvm_no_apic_vcpu;
+
+static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
+{
+ if (static_key_false(&kvm_no_apic_vcpu))
+ return vcpu->arch.apic;
+ return true;
+}
+
+extern struct static_key_deferred apic_hw_disabled;
+
+static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_false(&apic_hw_disabled.key))
+ return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return MSR_IA32_APICBASE_ENABLE;
+}
+
+extern struct static_key_deferred apic_sw_disabled;
+
+static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_false(&apic_sw_disabled.key))
+ return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+ return APIC_SPIV_APIC_ENABLED;
+}
+
+static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+}
+
+static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
+}
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7fbd0d273ea8..399c177212b5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
return 0;
pfn = spte_to_pfn(old_spte);
+
+ /*
+ * KVM does not hold the refcount of the page used by
+ * kvm mmu, before reclaiming the page, we should
+ * unmap it from mmu first.
+ */
+ WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+
if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
@@ -960,13 +968,10 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
- struct kvm_lpage_info *linfo;
-
- if (likely(level == PT_PAGE_TABLE_LEVEL))
- return &slot->rmap[gfn - slot->base_gfn];
+ unsigned long idx;
- linfo = lpage_info_slot(gfn, slot, level);
- return &linfo->rmap_pde;
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
}
/*
@@ -1173,7 +1178,8 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
unsigned long *rmapp;
while (mask) {
- rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
+ rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
/* clear the first set bit */
@@ -1200,7 +1206,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1218,7 +1224,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1259,43 +1265,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
return 0;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data))
+static int kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ unsigned long *rmapp,
+ struct kvm_memory_slot *slot,
+ unsigned long data))
{
int j;
- int ret;
- int retval = 0;
+ int ret = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
- unsigned long start = memslot->userspace_addr;
- unsigned long end;
+ unsigned long hva_start, hva_end;
+ gfn_t gfn_start, gfn_end;
+
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+ */
+ gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- end = start + (memslot->npages << PAGE_SHIFT);
- if (hva >= start && hva < end) {
- gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- gfn_t gfn = memslot->base_gfn + gfn_offset;
+ for (j = PT_PAGE_TABLE_LEVEL;
+ j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
+ unsigned long idx, idx_end;
+ unsigned long *rmapp;
- ret = handler(kvm, &memslot->rmap[gfn_offset], data);
+ /*
+ * {idx(page_j) | page_j intersects with
+ * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
+ */
+ idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
+ idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
- for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- struct kvm_lpage_info *linfo;
+ rmapp = __gfn_to_rmap(gfn_start, j, memslot);
- linfo = lpage_info_slot(gfn, memslot,
- PT_DIRECTORY_LEVEL + j);
- ret |= handler(kvm, &linfo->rmap_pde, data);
- }
- trace_kvm_age_page(hva, memslot, ret);
- retval |= ret;
+ for (; idx <= idx_end; ++idx)
+ ret |= handler(kvm, rmapp++, memslot, data);
}
}
- return retval;
+ return ret;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ struct kvm_memory_slot *slot,
+ unsigned long data))
+{
+ return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
@@ -1303,13 +1333,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+}
+
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator uninitialized_var(iter);
@@ -1323,8 +1358,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
* This has some overhead, but not as much as the cost of swapping
* out actively used pages or breaking up actively used hugepages.
*/
- if (!shadow_accessed_mask)
- return kvm_unmap_rmapp(kvm, rmapp, data);
+ if (!shadow_accessed_mask) {
+ young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
+ goto out;
+ }
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
@@ -1336,12 +1373,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
(unsigned long *)sptep);
}
}
-
+out:
+ /* @data has hva passed to kvm_age_hva(). */
+ trace_kvm_age_page(data, slot, young);
return young;
}
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1379,13 +1418,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
+ return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
}
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
@@ -2457,7 +2496,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
rmap_recycle(vcpu, sptep, gfn);
}
}
- kvm_release_pfn_clean(pfn);
+
+ if (!is_error_pfn(pfn))
+ kvm_release_pfn_clean(pfn);
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2469,17 +2510,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{
struct kvm_memory_slot *slot;
- unsigned long hva;
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
- if (!slot) {
- get_page(fault_page);
- return page_to_pfn(fault_page);
- }
-
- hva = gfn_to_hva_memslot(slot, gfn);
+ if (!slot)
+ return KVM_PFN_ERR_FAULT;
- return hva_to_pfn_atomic(vcpu->kvm, hva);
+ return gfn_to_pfn_memslot_atomic(slot, gfn);
}
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
@@ -2611,8 +2647,16 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
{
- kvm_release_pfn_clean(pfn);
- if (is_hwpoison_pfn(pfn)) {
+ /*
+ * Do not cache the mmio info caused by writing the readonly gfn
+ * into the spte, otherwise a read access on the readonly gfn can
+ * also cause an mmio page fault and be treated as an mmio access.
+ * Return 1 to tell kvm to emulate it.
+ */
+ if (pfn == KVM_PFN_ERR_RO_FAULT)
+ return 1;
+
+ if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
return 0;
}
@@ -3236,8 +3280,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
if (!async)
return false; /* *pfn has correct page already */
- put_page(pfn_to_page(*pfn));
-
if (!prefault && can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 7d7d0b9e23eb..daff69e21150 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -116,10 +116,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+ if (is_error_pfn(pfn))
return;
- }
hpa = pfn << PAGE_SHIFT;
if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
@@ -190,7 +188,6 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- struct kvm_memory_slot *slot;
unsigned long *rmapp;
u64 *sptep;
struct rmap_iterator iter;
@@ -198,8 +195,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
if (sp->role.direct || sp->unsync || sp->role.invalid)
return;
- slot = gfn_to_memslot(kvm, sp->gfn);
- rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+ rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index bb7cf01cae76..bf8c42bf50fe 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -370,10 +370,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
- if (mmu_invalid_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+ if (mmu_invalid_pfn(pfn))
return;
- }
/*
* we call mmu_set_spte() with host_writable = true because that
@@ -448,10 +446,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
gfn = gpte_to_gfn(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
pte_access & ACC_WRITE_MASK);
- if (mmu_invalid_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+ if (mmu_invalid_pfn(pfn))
break;
- }
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
NULL, PT_PAGE_TABLE_LEVEL, gfn,
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 9b7ec1150ab0..cfc258a6bf97 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1,5 +1,5 @@
/*
- * Kernel-based Virtual Machine -- Performane Monitoring Unit support
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
*
* Copyright 2011 Red Hat, Inc. and/or its affiliates.
*
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index baead950d6c8..31be4a557447 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2063,7 +2063,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
if (svm->nested.intercept & 1ULL) {
/*
* The #vmexit can't be emulated here directly because this
- * code path runs with irqs and preemtion disabled. A
+ * code path runs with irqs and preemption disabled. A
* #vmexit emulation might sleep. Only signal request for
* the #vmexit here.
*/
@@ -2105,7 +2105,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
return kmap(page);
error:
- kvm_release_page_clean(page);
kvm_inject_gp(&svm->vcpu, 0);
return NULL;
@@ -2409,7 +2408,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
/*
* This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is omptimized in that it only merges the parts where
+ * nested vmcb. It is optimized in that it only merges the parts where
* the kvm msr permission bitmap may contain zero bits
*/
int i;
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
deleted file mode 100644
index 6b85cc647f34..000000000000
--- a/arch/x86/kvm/timer.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * timer support
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/hrtimer.h>
-#include <linux/atomic.h>
-#include "kvm_timer.h"
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_vcpu *vcpu = ktimer->vcpu;
- wait_queue_head_t *q = &vcpu->wq;
-
- /*
- * There is a race window between reading and incrementing, but we do
- * not care about potentially losing timer events in the !reinject
- * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
- * in vcpu_enter_guest.
- */
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
- /* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
- }
-
- if (waitqueue_active(q))
- wake_up_interruptible(q);
-
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
- return HRTIMER_RESTART;
- } else
- return HRTIMER_NORESTART;
-}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c00f03de1b79..13e0296cea46 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -596,10 +596,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
{
struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
+ if (is_error_page(page))
return NULL;
- }
+
return page;
}
@@ -1343,7 +1342,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
guest_efer = vmx->vcpu.arch.efer;
/*
- * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+ * NX is emulated; LMA and LME handled by hardware; SCE meaningless
* outside long mode
*/
ignore_bits = EFER_NX | EFER_SCE;
@@ -1991,7 +1990,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
#endif
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
- CPU_BASED_RDPMC_EXITING |
+ CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/*
* We can allow some features even when not supported by the
@@ -3254,7 +3253,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
* qemu binaries.
* IA32 arch specifies that at the time of processor reset the
* "Accessed" bit in the AR field of segment registers is 1. And qemu
- * is setting it to 0 in the usedland code. This causes invalid guest
+ * is setting it to 0 in the userland code. This causes invalid guest
* state vmexit when "unrestricted guest" mode is turned on.
* Fix for this setup issue in cpu_reset is being pushed in the qemu
* tree. Newer qemu binaries with that qemu fix would not need this
@@ -4439,7 +4438,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
-/* called to set cr0 as approriate for a mov-to-cr0 exit. */
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
if (to_vmx(vcpu)->nested.vmxon &&
@@ -6223,6 +6222,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long debugctlmsr;
if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -6262,6 +6262,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
atomic_switch_perf_msrs(vmx);
+ debugctlmsr = get_debugctlmsr();
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
@@ -6363,6 +6364,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+ if (debugctlmsr)
+ update_debugctlmsr(debugctlmsr);
+
#ifndef CONFIG_X86_64
/*
* The sysexit path does not restore ds/es, so we must set them to
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dce75b760312..42bbf4187d20 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -246,20 +246,14 @@ static void drop_user_return_notifiers(void *ignore)
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
- if (irqchip_in_kernel(vcpu->kvm))
- return vcpu->arch.apic_base;
- else
- return vcpu->arch.apic_base;
+ return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
/* TODO: reserve bits check */
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_base(vcpu, data);
- else
- vcpu->arch.apic_base = data;
+ kvm_lapic_set_base(vcpu, data);
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
@@ -1097,7 +1091,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
* For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match
- * exact software computaion in compute_guest_tsc()
+ * exact software computation in compute_guest_tsc()
*
* These values are tracked in kvm->arch.cur_xxx variables.
*/
@@ -1140,6 +1134,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
unsigned long this_tsc_khz;
s64 kernel_ns, max_kernel_ns;
u64 tsc_timestamp;
+ u8 pvclock_flags;
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -1221,7 +1216,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_kernel_ns = kernel_ns;
vcpu->last_guest_tsc = tsc_timestamp;
- vcpu->hv_clock.flags = 0;
+
+ pvclock_flags = 0;
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ vcpu->hv_clock.flags = pvclock_flags;
/*
* The interface expects us to write an even number signaling that the
@@ -1504,7 +1506,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
- /* Bits 2:5 are resrved, Should be zero */
+ /* Bits 2:5 are reserved, Should be zero */
if (data & 0x3c)
return 1;
@@ -1639,10 +1641,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
- if (is_error_page(vcpu->arch.time_page)) {
- kvm_release_page_clean(vcpu->arch.time_page);
+ if (is_error_page(vcpu->arch.time_page))
vcpu->arch.time_page = NULL;
- }
+
break;
}
case MSR_KVM_ASYNC_PF_EN:
@@ -1727,7 +1728,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
* Ignore all writes to this no longer documented MSR.
* Writes are only relevant for old K7 processors,
* all pre-dating SVM, but a recommended workaround from
- * AMD for these chips. It is possible to speicify the
+ * AMD for these chips. It is possible to specify the
* affected processor models on the command line, hence
* the need to ignore the workaround.
*/
@@ -2174,6 +2175,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_PCI_2_3:
case KVM_CAP_KVMCLOCK_CTRL:
+ case KVM_CAP_READONLY_MEM:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2355,8 +2357,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
- kvm_apic_post_state_restore(vcpu);
+ kvm_apic_post_state_restore(vcpu, s);
update_cr8_intercept(vcpu);
return 0;
@@ -2632,11 +2633,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
*/
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
{
- struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
if (!vcpu->arch.time_page)
return -EINVAL;
- src->flags |= PVCLOCK_GUEST_STOPPED;
- mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
+ vcpu->arch.pvclock_set_guest_stopped_request = true;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
return 0;
}
@@ -3087,7 +3086,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
if (!kvm->arch.vpit)
return -ENXIO;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
- kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
@@ -3170,6 +3169,16 @@ out:
return r;
}
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -3276,29 +3285,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
create_pit_unlock:
mutex_unlock(&kvm->slots_lock);
break;
- case KVM_IRQ_LINE_STATUS:
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- r = -ENXIO;
- if (irqchip_in_kernel(kvm)) {
- __s32 status;
- status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
- irq_event.status = status;
- if (copy_to_user(argp, &irq_event,
- sizeof irq_event))
- goto out;
- }
- r = 0;
- }
- break;
- }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip *chip;
@@ -3959,10 +3945,8 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
goto emul_write;
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
+ if (is_error_page(page))
goto emul_write;
- }
kaddr = kmap_atomic(page);
kaddr += offset_in_page(gpa);
@@ -4490,13 +4474,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa;
+ pfn_t pfn;
if (tdp_enabled)
return false;
/*
* if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-entetr the
+ * and it failed try to unshadow page and re-enter the
* guest to let CPU execute the instruction.
*/
if (kvm_mmu_unprotect_page_virt(vcpu, gva))
@@ -4507,8 +4492,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
if (gpa == UNMAPPED_GVA)
return true; /* let cpu generate fault */
- if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+ /*
+ * Do not retry the unhandleable instruction if it faults on the
+ * readonly host memory, otherwise it will go into an infinite loop:
+ * retry instruction -> write #PF -> emulation fail -> retry
+ * instruction -> ...
+ */
+ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+ if (!is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
return true;
+ }
return false;
}
@@ -4926,6 +4920,7 @@ int kvm_arch_init(void *opaque)
if (cpu_has_xsave)
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ kvm_lapic_init();
return 0;
out:
@@ -5592,7 +5587,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
/*
* We are here if userspace calls get_regs() in the middle of
* instruction emulation. Registers state needs to be copied
- * back from emulation context to vcpu. Usrapace shouldn't do
+ * back from emulation context to vcpu. Userspace shouldn't do
* that usually, but some bad designed PV devices (vmware
* backdoor interface) need this to work
*/
@@ -6121,7 +6116,7 @@ int kvm_arch_hardware_enable(void *garbage)
* as we reset last_host_tsc on all VCPUs to stop this from being
* called multiple times (one for each physical CPU bringup).
*
- * Platforms with unnreliable TSCs don't have to deal with this, they
+ * Platforms with unreliable TSCs don't have to deal with this, they
* will be compensated by the logic in vcpu_load, which sets the TSC to
* catchup mode. This will catchup all VCPUs to real time, but cannot
* guarantee that they stay in perfect synchronization.
@@ -6174,6 +6169,8 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
}
+struct static_key kvm_no_apic_vcpu __read_mostly;
+
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
@@ -6206,7 +6203,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
- }
+ } else
+ static_key_slow_inc(&kvm_no_apic_vcpu);
vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
GFP_KERNEL);
@@ -6246,6 +6244,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kvm_mmu_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
+ if (!irqchip_in_kernel(vcpu->kvm))
+ static_key_slow_dec(&kvm_no_apic_vcpu);
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
@@ -6317,10 +6317,18 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
{
int i;
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
- kvm_kvfree(free->arch.lpage_info[i]);
- free->arch.lpage_info[i] = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
+ kvm_kvfree(free->arch.rmap[i]);
+ free->arch.rmap[i] = NULL;
+ }
+ if (i == 0)
+ continue;
+
+ if (!dont || free->arch.lpage_info[i - 1] !=
+ dont->arch.lpage_info[i - 1]) {
+ kvm_kvfree(free->arch.lpage_info[i - 1]);
+ free->arch.lpage_info[i - 1] = NULL;
}
}
}
@@ -6329,23 +6337,30 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
{
int i;
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
unsigned long ugfn;
int lpages;
- int level = i + 2;
+ int level = i + 1;
lpages = gfn_to_index(slot->base_gfn + npages - 1,
slot->base_gfn, level) + 1;
- slot->arch.lpage_info[i] =
- kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
- if (!slot->arch.lpage_info[i])
+ slot->arch.rmap[i] =
+ kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
+ if (!slot->arch.rmap[i])
+ goto out_free;
+ if (i == 0)
+ continue;
+
+ slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
+ sizeof(*slot->arch.lpage_info[i - 1]));
+ if (!slot->arch.lpage_info[i - 1])
goto out_free;
if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i][0].write_count = 1;
+ slot->arch.lpage_info[i - 1][0].write_count = 1;
if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+ slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
ugfn = slot->userspace_addr >> PAGE_SHIFT;
/*
* If the gfn and userspace address are not aligned wrt each
@@ -6357,16 +6372,21 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
unsigned long j;
for (j = 0; j < lpages; ++j)
- slot->arch.lpage_info[i][j].write_count = 1;
+ slot->arch.lpage_info[i - 1][j].write_count = 1;
}
}
return 0;
out_free:
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- kvm_kvfree(slot->arch.lpage_info[i]);
- slot->arch.lpage_info[i] = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ kvm_kvfree(slot->arch.rmap[i]);
+ slot->arch.rmap[i] = NULL;
+ if (i == 0)
+ continue;
+
+ kvm_kvfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
}
return -ENOMEM;
}
@@ -6385,10 +6405,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
map_flags = MAP_SHARED | MAP_ANONYMOUS;
/*To keep backward compatibility with older userspace,
- *x86 needs to hanlde !user_alloc case.
+ *x86 needs to handle !user_alloc case.
*/
if (!user_alloc) {
- if (npages && !old.rmap) {
+ if (npages && !old.npages) {
unsigned long userspace_addr;
userspace_addr = vm_mmap(NULL, 0,
@@ -6416,7 +6436,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
- if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+ if (!user_alloc && !old.user_alloc && old.npages && !npages) {
int ret;
ret = vm_munmap(old.userspace_addr,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 3d1134ddb885..2b5219c12ac8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -124,4 +124,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
extern u64 host_xcr0;
+extern struct static_key kvm_no_apic_vcpu;
#endif