summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/api.txt3
-rw-r--r--arch/arm/kvm/arm.c4
-rw-r--r--arch/mips/kvm/mips.c6
-rw-r--r--arch/powerpc/kvm/booke.c6
-rw-r--r--arch/powerpc/kvm/powerpc.c2
-rw-r--r--arch/s390/include/asm/kvm_host.h4
-rw-r--r--arch/s390/kernel/entry.S2
-rw-r--r--arch/s390/kvm/intercept.c16
-rw-r--r--arch/s390/kvm/interrupt.c87
-rw-r--r--arch/s390/kvm/kvm-s390.c61
-rw-r--r--arch/s390/kvm/kvm-s390.h25
-rw-r--r--arch/s390/kvm/priv.c8
-rw-r--r--arch/x86/include/asm/kvm_host.h17
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/uapi/asm/kvm.h3
-rw-r--r--arch/x86/kernel/kvm.c4
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/emulate.c41
-rw-r--r--arch/x86/kvm/ioapic.c9
-rw-r--r--arch/x86/kvm/irq_comm.c14
-rw-r--r--arch/x86/kvm/lapic.c26
-rw-r--r--arch/x86/kvm/lapic.h8
-rw-r--r--arch/x86/kvm/mmu.c418
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmu_audit.c4
-rw-r--r--arch/x86/kvm/svm.c32
-rw-r--r--arch/x86/kvm/vmx.c224
-rw-r--r--arch/x86/kvm/x86.c131
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--include/linux/kvm_host.h32
-rw-r--r--include/uapi/linux/kvm.h1
-rw-r--r--virt/kvm/kvm_main.c38
32 files changed, 796 insertions, 437 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 9fa2bf8c3f6f..695544420ff2 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -959,7 +959,8 @@ documentation when it pops into existence).
4.37 KVM_ENABLE_CAP
Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM
-Architectures: ppc, s390
+Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM),
+ mips (only KVM_CAP_ENABLE_CAP), ppc, s390
Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM)
Parameters: struct kvm_enable_cap (in)
Returns: 0 on success; -1 on error
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index d9631ecddd56..e41cb11f71b2 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -553,13 +553,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
* Enter the guest
*/
trace_kvm_entry(*vcpu_pc(vcpu));
- kvm_guest_enter();
+ __kvm_guest_enter();
vcpu->mode = IN_GUEST_MODE;
ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
vcpu->mode = OUTSIDE_GUEST_MODE;
- kvm_guest_exit();
+ __kvm_guest_exit();
trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
/*
* We may have taken a host interrupt in HYP mode (ie
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index bb68e8d520e8..a8e660a44474 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -393,7 +393,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
kvm_mips_deliver_interrupts(vcpu,
kvm_read_c0_guest_cause(vcpu->arch.cop0));
- kvm_guest_enter();
+ __kvm_guest_enter();
/* Disable hardware page table walking while in guest */
htw_stop();
@@ -403,7 +403,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
/* Re-enable HTW before enabling interrupts */
htw_start();
- kvm_guest_exit();
+ __kvm_guest_exit();
local_irq_enable();
if (vcpu->sigset_active)
@@ -982,7 +982,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
- memslot = &kvm->memslots->memslots[log->slot];
+ memslot = id_to_memslot(kvm->memslots, log->slot);
ga = memslot->base_gfn << PAGE_SHIFT;
ga_end = ga + (memslot->npages << PAGE_SHIFT);
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 6c1316a15a27..3872ab31c80a 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1004,10 +1004,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
}
- local_irq_enable();
-
trace_kvm_exit(exit_nr, vcpu);
- kvm_guest_exit();
+ __kvm_guest_exit();
+
+ local_irq_enable();
run->exit_reason = KVM_EXIT_UNKNOWN;
run->ready_for_interrupt_injection = 1;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index ac3ddf115f3d..8cd1f80fdc70 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -115,7 +115,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
continue;
}
- kvm_guest_enter();
+ __kvm_guest_enter();
return 1;
}
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d01fc588b5c3..444c412307cb 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -80,6 +80,7 @@ struct sca_block {
#define CPUSTAT_MCDS 0x00000100
#define CPUSTAT_SM 0x00000080
#define CPUSTAT_IBS 0x00000040
+#define CPUSTAT_GED2 0x00000010
#define CPUSTAT_G 0x00000008
#define CPUSTAT_GED 0x00000004
#define CPUSTAT_J 0x00000002
@@ -95,7 +96,8 @@ struct kvm_s390_sie_block {
#define PROG_IN_SIE (1<<0)
__u32 prog0c; /* 0x000c */
__u8 reserved10[16]; /* 0x0010 */
-#define PROG_BLOCK_SIE 0x00000001
+#define PROG_BLOCK_SIE (1<<0)
+#define PROG_REQUEST (1<<1)
atomic_t prog20; /* 0x0020 */
__u8 reserved24[4]; /* 0x0024 */
__u64 cputm; /* 0x0028 */
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 99b44acbfcc7..3238893c9d4f 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -1005,7 +1005,7 @@ ENTRY(sie64a)
.Lsie_gmap:
lg %r14,__SF_EMPTY(%r15) # get control block pointer
oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now
- tm __SIE_PROG20+3(%r14),1 # last exit...
+ tm __SIE_PROG20+3(%r14),3 # last exit...
jnz .Lsie_done
LPP __SF_EMPTY(%r15) # set guest id
sie 0(%r14)
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 9e3779e3e496..7365e8a46032 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -241,21 +241,6 @@ static int handle_prog(struct kvm_vcpu *vcpu)
return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
}
-static int handle_instruction_and_prog(struct kvm_vcpu *vcpu)
-{
- int rc, rc2;
-
- vcpu->stat.exit_instr_and_program++;
- rc = handle_instruction(vcpu);
- rc2 = handle_prog(vcpu);
-
- if (rc == -EOPNOTSUPP)
- vcpu->arch.sie_block->icptcode = 0x04;
- if (rc)
- return rc;
- return rc2;
-}
-
/**
* handle_external_interrupt - used for external interruption interceptions
*
@@ -355,7 +340,6 @@ static const intercept_handler_t intercept_funcs[] = {
[0x00 >> 2] = handle_noop,
[0x04 >> 2] = handle_instruction,
[0x08 >> 2] = handle_prog,
- [0x0C >> 2] = handle_instruction_and_prog,
[0x10 >> 2] = handle_noop,
[0x14 >> 2] = handle_external_interrupt,
[0x18 >> 2] = handle_noop,
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 9de47265ef73..322ef9cfdc80 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -134,6 +134,8 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
active_mask = pending_local_irqs(vcpu);
active_mask |= pending_floating_irqs(vcpu);
+ if (!active_mask)
+ return 0;
if (psw_extint_disabled(vcpu))
active_mask &= ~IRQ_PEND_EXT_MASK;
@@ -941,12 +943,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
if (cpu_timer_irq_pending(vcpu))
set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
- do {
- irqs = deliverable_irqs(vcpu);
+ while ((irqs = deliverable_irqs(vcpu)) && !rc) {
/* bits are in the order of interrupt priority */
irq_type = find_first_bit(&irqs, IRQ_PEND_COUNT);
- if (irq_type == IRQ_PEND_COUNT)
- break;
if (is_ioirq(irq_type)) {
rc = __deliver_io(vcpu, irq_type);
} else {
@@ -958,9 +957,7 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
}
rc = func(vcpu);
}
- if (rc)
- break;
- } while (!rc);
+ }
set_intercept_indicators(vcpu);
@@ -1061,7 +1058,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
if (sclp_has_sigpif())
return __inject_extcall_sigpif(vcpu, src_id);
- if (!test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
+ if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
return -EBUSY;
*extcall = irq->u.extcall;
atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
@@ -1340,12 +1337,54 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
return 0;
}
-static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
+/*
+ * Find a destination VCPU for a floating irq and kick it.
+ */
+static void __floating_irq_kick(struct kvm *kvm, u64 type)
{
+ struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
struct kvm_s390_local_interrupt *li;
+ struct kvm_vcpu *dst_vcpu;
+ int sigcpu, online_vcpus, nr_tries = 0;
+
+ online_vcpus = atomic_read(&kvm->online_vcpus);
+ if (!online_vcpus)
+ return;
+
+ /* find idle VCPUs first, then round robin */
+ sigcpu = find_first_bit(fi->idle_mask, online_vcpus);
+ if (sigcpu == online_vcpus) {
+ do {
+ sigcpu = fi->next_rr_cpu;
+ fi->next_rr_cpu = (fi->next_rr_cpu + 1) % online_vcpus;
+ /* avoid endless loops if all vcpus are stopped */
+ if (nr_tries++ >= online_vcpus)
+ return;
+ } while (is_vcpu_stopped(kvm_get_vcpu(kvm, sigcpu)));
+ }
+ dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
+
+ /* make the VCPU drop out of the SIE, or wake it up if sleeping */
+ li = &dst_vcpu->arch.local_int;
+ spin_lock(&li->lock);
+ switch (type) {
+ case KVM_S390_MCHK:
+ atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
+ break;
+ case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+ atomic_set_mask(CPUSTAT_IO_INT, li->cpuflags);
+ break;
+ default:
+ atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+ break;
+ }
+ spin_unlock(&li->lock);
+ kvm_s390_vcpu_wakeup(dst_vcpu);
+}
+
+static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
+{
struct kvm_s390_float_interrupt *fi;
- struct kvm_vcpu *dst_vcpu = NULL;
- int sigcpu;
u64 type = READ_ONCE(inti->type);
int rc;
@@ -1373,32 +1412,8 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
if (rc)
return rc;
- sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
- if (sigcpu == KVM_MAX_VCPUS) {
- do {
- sigcpu = fi->next_rr_cpu++;
- if (sigcpu == KVM_MAX_VCPUS)
- sigcpu = fi->next_rr_cpu = 0;
- } while (kvm_get_vcpu(kvm, sigcpu) == NULL);
- }
- dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
- li = &dst_vcpu->arch.local_int;
- spin_lock(&li->lock);
- switch (type) {
- case KVM_S390_MCHK:
- atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
- break;
- case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
- atomic_set_mask(CPUSTAT_IO_INT, li->cpuflags);
- break;
- default:
- atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
- break;
- }
- spin_unlock(&li->lock);
- kvm_s390_vcpu_wakeup(kvm_get_vcpu(kvm, sigcpu));
+ __floating_irq_kick(kvm, type);
return 0;
-
}
int kvm_s390_inject_vm(struct kvm *kvm,
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8cd8e7b288c5..d461f8a15c07 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -110,7 +110,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
/* upper facilities limit for kvm */
unsigned long kvm_s390_fac_list_mask[] = {
0xffe6fffbfcfdfc40UL,
- 0x005c800000000000UL,
+ 0x005e800000000000UL,
};
unsigned long kvm_s390_fac_list_mask_size(void)
@@ -454,10 +454,10 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
mutex_lock(&kvm->lock);
kvm->arch.epoch = gtod - host_tod;
- kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) {
+ kvm_s390_vcpu_block_all(kvm);
+ kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm)
cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch;
- exit_sie(cur_vcpu);
- }
+ kvm_s390_vcpu_unblock_all(kvm);
mutex_unlock(&kvm->lock);
return 0;
}
@@ -1311,8 +1311,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
CPUSTAT_SM |
- CPUSTAT_STOPPED |
- CPUSTAT_GED);
+ CPUSTAT_STOPPED);
+
+ if (test_kvm_facility(vcpu->kvm, 78))
+ atomic_set_mask(CPUSTAT_GED2, &vcpu->arch.sie_block->cpuflags);
+ else if (test_kvm_facility(vcpu->kvm, 8))
+ atomic_set_mask(CPUSTAT_GED, &vcpu->arch.sie_block->cpuflags);
+
kvm_s390_vcpu_setup_model(vcpu);
vcpu->arch.sie_block->ecb = 6;
@@ -1409,16 +1414,26 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
return kvm_s390_vcpu_has_irq(vcpu, 0);
}
-void s390_vcpu_block(struct kvm_vcpu *vcpu)
+void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu)
{
atomic_set_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
}
-void s390_vcpu_unblock(struct kvm_vcpu *vcpu)
+void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu)
{
atomic_clear_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
}
+static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu)
+{
+ atomic_set_mask(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
+}
+
+static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu)
+{
+ atomic_clear_mask(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
+}
+
/*
* Kick a guest cpu out of SIE and wait until SIE is not running.
* If the CPU is not running (e.g. waiting as idle) the function will
@@ -1430,10 +1445,11 @@ void exit_sie(struct kvm_vcpu *vcpu)
cpu_relax();
}
-/* Kick a guest cpu out of SIE and prevent SIE-reentry */
-void exit_sie_sync(struct kvm_vcpu *vcpu)
+/* Kick a guest cpu out of SIE to process a request synchronously */
+void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
{
- s390_vcpu_block(vcpu);
+ kvm_make_request(req, vcpu);
+ kvm_s390_vcpu_request(vcpu);
exit_sie(vcpu);
}
@@ -1447,8 +1463,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
/* match against both prefix pages */
if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
- kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
- exit_sie_sync(vcpu);
+ kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
}
}
}
@@ -1720,8 +1735,10 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu)
static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
{
+ if (!vcpu->requests)
+ return 0;
retry:
- s390_vcpu_unblock(vcpu);
+ kvm_s390_vcpu_request_handled(vcpu);
/*
* We use MMU_RELOAD just to re-arm the ipte notifier for the
* guest prefix page. gmap_ipte_notify will wait on the ptl lock.
@@ -1993,12 +2010,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
* As PF_VCPU will be used in fault handler, between
* guest_enter and guest_exit should be no uaccess.
*/
- preempt_disable();
- kvm_guest_enter();
- preempt_enable();
+ local_irq_disable();
+ __kvm_guest_enter();
+ local_irq_enable();
exit_reason = sie64a(vcpu->arch.sie_block,
vcpu->run->s.regs.gprs);
- kvm_guest_exit();
+ local_irq_disable();
+ __kvm_guest_exit();
+ local_irq_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
rc = vcpu_post_run(vcpu, exit_reason);
@@ -2206,8 +2225,7 @@ int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
{
kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
- kvm_make_request(KVM_REQ_DISABLE_IBS, vcpu);
- exit_sie_sync(vcpu);
+ kvm_s390_sync_request(KVM_REQ_DISABLE_IBS, vcpu);
}
static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
@@ -2223,8 +2241,7 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
{
kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
- kvm_make_request(KVM_REQ_ENABLE_IBS, vcpu);
- exit_sie_sync(vcpu);
+ kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
}
void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index ca108b90ae56..c5704786e473 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -211,10 +211,10 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
-void s390_vcpu_block(struct kvm_vcpu *vcpu);
-void s390_vcpu_unblock(struct kvm_vcpu *vcpu);
+void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu);
void exit_sie(struct kvm_vcpu *vcpu);
-void exit_sie_sync(struct kvm_vcpu *vcpu);
+void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu);
int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
/* is cmma enabled */
@@ -228,6 +228,25 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
struct kvm_s390_pgm_info *pgm_info);
+static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+
+ WARN_ON(!mutex_is_locked(&kvm->lock));
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_s390_vcpu_block(vcpu);
+}
+
+static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_s390_vcpu_unblock(vcpu);
+}
+
/**
* kvm_s390_inject_prog_cond - conditionally inject a program check
* @vcpu: virtual cpu
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index d22d8ee1ff9d..ad4242245771 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -698,10 +698,14 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
case 0x00001000:
end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
break;
- /* We dont support EDAT2
case 0x00002000:
+ /* only support 2G frame size if EDAT2 is available and we are
+ not in 24-bit addressing mode */
+ if (!test_kvm_facility(vcpu->kvm, 78) ||
+ psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_24BIT)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
end = (start + (1UL << 31)) & ~((1UL << 31) - 1);
- break;*/
+ break;
default:
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f4a555beef19..1a4d6a054749 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -637,6 +637,8 @@ struct kvm_arch {
#endif
bool boot_vcpu_runs_old_kvmclock;
+
+ u64 disabled_quirks;
};
struct kvm_vm_stat {
@@ -689,12 +691,13 @@ struct msr_data {
struct kvm_lapic_irq {
u32 vector;
- u32 delivery_mode;
- u32 dest_mode;
- u32 level;
- u32 trig_mode;
+ u16 delivery_mode;
+ u16 dest_mode;
+ bool level;
+ u16 trig_mode;
u32 shorthand;
u32 dest_id;
+ bool msi_redir_hint;
};
struct kvm_x86_ops {
@@ -711,7 +714,7 @@ struct kvm_x86_ops {
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
void (*vcpu_free)(struct kvm_vcpu *vcpu);
- void (*vcpu_reset)(struct kvm_vcpu *vcpu);
+ void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -1002,7 +1005,7 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
-int fx_init(struct kvm_vcpu *vcpu);
+int fx_init(struct kvm_vcpu *vcpu, bool init_event);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes);
@@ -1146,7 +1149,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
unsigned long address);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d6b078e9fa28..628954ceede1 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -86,7 +86,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
offset = pvclock_get_nsec_offset(src);
ret = src->system_time + offset;
ret_flags = src->flags;
- rdtsc_barrier();
*cycles = ret;
*flags = ret_flags;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d7dcef58aefa..2fec75e4b1e1 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -345,4 +345,7 @@ struct kvm_xcrs {
struct kvm_sync_regs {
};
+#define KVM_QUIRK_LINT0_REENABLED (1 << 0)
+#define KVM_QUIRK_CD_NW_CLEARED (1 << 1)
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9435620062df..cc34cece853e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -331,7 +331,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
apic_write(APIC_EOI, APIC_EOI_ACK);
}
-void kvm_guest_cpu_init(void)
+static void kvm_guest_cpu_init(void)
{
if (!kvm_para_available())
return;
@@ -655,7 +655,7 @@ static inline void spin_time_accum_blocked(u64 start)
static struct dentry *d_spin_debug;
static struct dentry *d_kvm_debug;
-struct dentry *kvm_init_debugfs(void)
+static struct dentry *kvm_init_debugfs(void)
{
d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
if (!d_kvm_debug)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 1d08ad3582d0..92a74a0428aa 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -97,7 +97,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu);
+ vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu);
/*
* The existing code assumes virtual address is 48-bit in the canonical
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 630bcb0d7a04..9b655d113fc6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -25,6 +25,7 @@
#include <linux/module.h>
#include <asm/kvm_emulate.h>
#include <linux/stringify.h>
+#include <asm/debugreg.h>
#include "x86.h"
#include "tss.h"
@@ -523,13 +524,9 @@ static void masked_increment(ulong *reg, ulong mask, int inc)
static inline void
register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
{
- ulong mask;
+ ulong *preg = reg_rmw(ctxt, reg);
- if (ctxt->ad_bytes == sizeof(unsigned long))
- mask = ~0UL;
- else
- mask = ad_mask(ctxt);
- masked_increment(reg_rmw(ctxt, reg), mask, inc);
+ assign_register(preg, *preg + inc, ctxt->ad_bytes);
}
static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
@@ -2573,6 +2570,30 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
return true;
}
+static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
+{
+ /*
+ * Intel CPUs mask the counter and pointers in quite strange
+ * manner when ECX is zero due to REP-string optimizations.
+ */
+#ifdef CONFIG_X86_64
+ if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+ return;
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = 0;
+
+ switch (ctxt->b) {
+ case 0xa4: /* movsb */
+ case 0xa5: /* movsd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
+ /* fall through */
+ case 0xaa: /* stosb */
+ case 0xab: /* stosd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
+ }
+#endif
+}
+
static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
struct tss_segment_16 *tss)
{
@@ -2849,7 +2870,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ulong old_tss_base =
ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
u32 desc_limit;
- ulong desc_addr;
+ ulong desc_addr, dr7;
/* FIXME: old_tss_base == ~0 ? */
@@ -2934,6 +2955,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ret = em_push(ctxt);
}
+ ops->get_dr(ctxt, 7, &dr7);
+ ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
+
return ret;
}
@@ -3840,7 +3864,7 @@ static const struct opcode group5[] = {
F(DstMem | SrcNone | Lock, em_inc),
F(DstMem | SrcNone | Lock, em_dec),
I(SrcMem | NearBranch, em_call_near_abs),
- I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
+ I(SrcMemFAddr | ImplicitOps, em_call_far),
I(SrcMem | NearBranch, em_jmp_abs),
I(SrcMemFAddr | ImplicitOps, em_jmp_far),
I(SrcMem | Stack, em_push), D(Undefined),
@@ -4910,6 +4934,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
if (ctxt->rep_prefix && (ctxt->d & String)) {
/* All REP prefixes have the same first termination condition */
if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ string_registers_quirk(ctxt);
ctxt->eip = ctxt->_eip;
ctxt->eflags &= ~X86_EFLAGS_RF;
goto done;
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 28146f03c514..856f79105bb5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -349,6 +349,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
irqe.delivery_mode = entry->fields.delivery_mode << 8;
irqe.level = 1;
irqe.shorthand = 0;
+ irqe.msi_redir_hint = false;
if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
ioapic->irr_delivered |= 1 << irq;
@@ -637,11 +638,9 @@ void kvm_ioapic_destroy(struct kvm *kvm)
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
cancel_delayed_work_sync(&ioapic->eoi_inject);
- if (ioapic) {
- kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
- kvm->arch.vioapic = NULL;
- kfree(ioapic);
- }
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
}
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 72298b3ac025..9efff9e5b58c 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -31,6 +31,8 @@
#include "ioapic.h"
+#include "lapic.h"
+
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
@@ -48,11 +50,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
line_status);
}
-inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
-{
- return irq->delivery_mode == APIC_DM_LOWEST;
-}
-
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, unsigned long *dest_map)
{
@@ -60,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_vcpu *vcpu, *lowest = NULL;
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
- kvm_is_dm_lowest_prio(irq)) {
+ kvm_lowest_prio_delivery(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
@@ -76,7 +73,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
irq->dest_id, irq->dest_mode))
continue;
- if (!kvm_is_dm_lowest_prio(irq)) {
+ if (!kvm_lowest_prio_delivery(irq)) {
if (r < 0)
r = 0;
r += kvm_apic_set_irq(vcpu, irq, dest_map);
@@ -106,9 +103,10 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
irq->delivery_mode = e->msi.data & 0x700;
+ irq->msi_redir_hint = ((e->msi.address_lo
+ & MSI_ADDR_REDIRECTION_LOWPRI) > 0);
irq->level = 1;
irq->shorthand = 0;
- /* TODO Deal with RH bit of MSI message address */
}
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 629af0f1c5c4..dc5b57bb1f76 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -728,7 +728,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
dst = map->logical_map[cid];
- if (irq->delivery_mode == APIC_DM_LOWEST) {
+ if (kvm_lowest_prio_delivery(irq)) {
int l = -1;
for_each_set_bit(i, &bitmap, 16) {
if (!dst[i])
@@ -914,9 +914,10 @@ static void apic_send_ipi(struct kvm_lapic *apic)
irq.vector = icr_low & APIC_VECTOR_MASK;
irq.delivery_mode = icr_low & APIC_MODE_MASK;
irq.dest_mode = icr_low & APIC_DEST_MASK;
- irq.level = icr_low & APIC_INT_ASSERT;
+ irq.level = (icr_low & APIC_INT_ASSERT) != 0;
irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
irq.shorthand = icr_low & APIC_SHORT_MASK;
+ irq.msi_redir_hint = false;
if (apic_x2apic_mode(apic))
irq.dest_id = icr_high;
else
@@ -926,10 +927,11 @@ static void apic_send_ipi(struct kvm_lapic *apic)
apic_debug("icr_high 0x%x, icr_low 0x%x, "
"short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
- "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+ "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, "
+ "msi_redir_hint 0x%x\n",
icr_high, icr_low, irq.shorthand, irq.dest_id,
irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
- irq.vector);
+ irq.vector, irq.msi_redir_hint);
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
}
@@ -1557,7 +1559,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct kvm_lapic *apic;
int i;
@@ -1571,14 +1573,16 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- kvm_apic_set_id(apic, vcpu->vcpu_id);
+ if (!init_event)
+ kvm_apic_set_id(apic, vcpu->vcpu_id);
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < APIC_LVT_NUM; i++)
apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
apic->lapic_timer.timer_mode = 0;
- apic_set_reg(apic, APIC_LVT0,
- SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+ if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED))
+ apic_set_reg(apic, APIC_LVT0,
+ SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_set_reg(apic, APIC_DFR, 0xffffffffU);
apic_set_spiv(apic, 0xff);
@@ -1712,7 +1716,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
- kvm_lapic_reset(vcpu);
+ kvm_lapic_reset(vcpu, false);
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
return 0;
@@ -2046,8 +2050,8 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
pe = xchg(&apic->pending_events, 0);
if (test_bit(KVM_APIC_INIT, &pe)) {
- kvm_lapic_reset(vcpu);
- kvm_vcpu_reset(vcpu);
+ kvm_lapic_reset(vcpu, true);
+ kvm_vcpu_reset(vcpu, true);
if (kvm_vcpu_is_bsp(apic->vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 9d28383fc1e7..71b150cae5f9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,7 +48,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
@@ -153,6 +153,12 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
return vcpu->arch.apic->pending_events;
}
+static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
+{
+ return (irq->delivery_mode == APIC_DM_LOWEST ||
+ irq->msi_redir_hint);
+}
+
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void wait_lapic_expire(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 44a7d2515497..49c34e632b91 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -811,8 +811,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
int i;
slot = gfn_to_memslot(kvm, gfn);
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count += 1;
}
@@ -826,8 +825,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
int i;
slot = gfn_to_memslot(kvm, gfn);
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count -= 1;
WARN_ON(linfo->write_count < 0);
@@ -858,8 +856,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
page_size = kvm_host_page_size(kvm, gfn);
- for (i = PT_PAGE_TABLE_LEVEL;
- i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
+ for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
if (page_size >= KVM_HPAGE_SIZE(i))
ret = i;
else
@@ -1142,6 +1139,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
return NULL;
}
+#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \
+ for (_spte_ = rmap_get_first(*_rmap_, _iter_); \
+ _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \
+ _spte_ = rmap_get_next(_iter_))
+
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
if (mmu_spte_clear_track_bits(sptep))
@@ -1205,12 +1207,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
struct rmap_iterator iter;
bool flush = false;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
flush |= spte_write_protect(kvm, sptep, pt_protect);
- sptep = rmap_get_next(&iter);
- }
return flush;
}
@@ -1232,12 +1230,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
struct rmap_iterator iter;
bool flush = false;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
flush |= spte_clear_dirty(kvm, sptep);
- sptep = rmap_get_next(&iter);
- }
return flush;
}
@@ -1259,12 +1253,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
struct rmap_iterator iter;
bool flush = false;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
flush |= spte_set_dirty(kvm, sptep);
- sptep = rmap_get_next(&iter);
- }
return flush;
}
@@ -1351,8 +1341,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
slot = gfn_to_memslot(kvm, gfn);
- for (i = PT_PAGE_TABLE_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
write_protected |= __rmap_write_protect(kvm, rmapp, true);
}
@@ -1360,24 +1349,28 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
return write_protected;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
{
u64 *sptep;
struct rmap_iterator iter;
- int need_tlb_flush = 0;
+ bool flush = false;
while ((sptep = rmap_get_first(*rmapp, &iter))) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n",
- sptep, *sptep, gfn, level);
+ rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
drop_spte(kvm, sptep);
- need_tlb_flush = 1;
+ flush = true;
}
- return need_tlb_flush;
+ return flush;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
+{
+ return kvm_zap_rmapp(kvm, rmapp);
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
@@ -1394,8 +1387,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!is_shadow_present_pte(*sptep));
+restart:
+ for_each_rmap_spte(rmapp, &iter, sptep) {
rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
@@ -1403,7 +1396,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (pte_write(*ptep)) {
drop_spte(kvm, sptep);
- sptep = rmap_get_first(*rmapp, &iter);
+ goto restart;
} else {
new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1414,7 +1407,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
mmu_spte_clear_track_bits(sptep);
mmu_spte_set(sptep, new_spte);
- sptep = rmap_get_next(&iter);
}
}
@@ -1424,6 +1416,74 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
return 0;
}
+struct slot_rmap_walk_iterator {
+ /* input fields. */
+ struct kvm_memory_slot *slot;
+ gfn_t start_gfn;
+ gfn_t end_gfn;
+ int start_level;
+ int end_level;
+
+ /* output fields. */
+ gfn_t gfn;
+ unsigned long *rmap;
+ int level;
+
+ /* private field. */
+ unsigned long *end_rmap;
+};
+
+static void
+rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+{
+ iterator->level = level;
+ iterator->gfn = iterator->start_gfn;
+ iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
+ iterator->slot);
+}
+
+static void
+slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ struct kvm_memory_slot *slot, int start_level,
+ int end_level, gfn_t start_gfn, gfn_t end_gfn)
+{
+ iterator->slot = slot;
+ iterator->start_level = start_level;
+ iterator->end_level = end_level;
+ iterator->start_gfn = start_gfn;
+ iterator->end_gfn = end_gfn;
+
+ rmap_walk_init_level(iterator, iterator->start_level);
+}
+
+static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
+{
+ return !!iterator->rmap;
+}
+
+static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+{
+ if (++iterator->rmap <= iterator->end_rmap) {
+ iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+ return;
+ }
+
+ if (++iterator->level > iterator->end_level) {
+ iterator->rmap = NULL;
+ return;
+ }
+
+ rmap_walk_init_level(iterator, iterator->level);
+}
+
+#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
+ _start_gfn, _end_gfn, _iter_) \
+ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
+ _end_level_, _start_gfn, _end_gfn); \
+ slot_rmap_walk_okay(_iter_); \
+ slot_rmap_walk_next(_iter_))
+
static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long start,
unsigned long end,
@@ -1435,10 +1495,10 @@ static int kvm_handle_hva_range(struct kvm *kvm,
int level,
unsigned long data))
{
- int j;
- int ret = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
+ struct slot_rmap_walk_iterator iterator;
+ int ret = 0;
slots = kvm_memslots(kvm);
@@ -1458,26 +1518,11 @@ static int kvm_handle_hva_range(struct kvm *kvm,
gfn_start = hva_to_gfn_memslot(hva_start, memslot);
gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- for (j = PT_PAGE_TABLE_LEVEL;
- j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
- unsigned long idx, idx_end;
- unsigned long *rmapp;
- gfn_t gfn = gfn_start;
-
- /*
- * {idx(page_j) | page_j intersects with
- * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
- */
- idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
- idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
-
- rmapp = __gfn_to_rmap(gfn_start, j, memslot);
-
- for (; idx <= idx_end;
- ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j)))
- ret |= handler(kvm, rmapp++, memslot,
- gfn, j, data);
- }
+ for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1,
+ &iterator)
+ ret |= handler(kvm, iterator.rmap, memslot,
+ iterator.gfn, iterator.level, data);
}
return ret;
@@ -1518,16 +1563,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
BUG_ON(!shadow_accessed_mask);
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;
- sptep = rmap_get_next(&iter)) {
- BUG_ON(!is_shadow_present_pte(*sptep));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
if (*sptep & shadow_accessed_mask) {
young = 1;
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
}
- }
+
trace_kvm_age_page(gfn, level, slot, young);
return young;
}
@@ -1548,15 +1590,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (!shadow_accessed_mask)
goto out;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;
- sptep = rmap_get_next(&iter)) {
- BUG_ON(!is_shadow_present_pte(*sptep));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
if (*sptep & shadow_accessed_mask) {
young = 1;
break;
}
- }
out:
return young;
}
@@ -2393,19 +2431,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
u64 start, u64 end)
{
- int i;
u64 base, mask;
u8 prev_match, curr_match;
- int num_var_ranges = KVM_NR_VAR_MTRR;
+ int i, num_var_ranges = KVM_NR_VAR_MTRR;
- if (!mtrr_state->enabled)
- return 0xFF;
+ /* MTRR is completely disabled, use UC for all of physical memory. */
+ if (!(mtrr_state->enabled & 0x2))
+ return MTRR_TYPE_UNCACHABLE;
/* Make end inclusive end, instead of exclusive */
end--;
/* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state->have_fixed && (start < 0x100000)) {
+ if (mtrr_state->have_fixed && (mtrr_state->enabled & 0x1) &&
+ (start < 0x100000)) {
int idx;
if (start < 0x80000) {
@@ -2428,9 +2467,6 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
* Look of multiple ranges matching this address and pick type
* as per MTRR precedence
*/
- if (!(mtrr_state->enabled & 2))
- return mtrr_state->def_type;
-
prev_match = 0xFF;
for (i = 0; i < num_var_ranges; ++i) {
unsigned short start_state, end_state;
@@ -3475,10 +3511,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable)
{
+ struct kvm_memory_slot *slot;
bool async;
- *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
-
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ async = false;
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
if (!async)
return false; /* *pfn has correct page already */
@@ -3492,8 +3530,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return true;
}
- *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
-
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
return false;
}
@@ -4420,36 +4457,113 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
init_kvm_mmu(vcpu);
}
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap);
+
+/* The caller should hold mmu-lock before calling this function. */
+static bool
+slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
{
- gfn_t last_gfn;
- int i;
+ struct slot_rmap_walk_iterator iterator;
bool flush = false;
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap);
- spin_lock(&kvm->mmu_lock);
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
- for (i = PT_PAGE_TABLE_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- unsigned long *rmapp;
- unsigned long last_index, index;
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
- rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+ return flush;
+}
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_write_protect(kvm, rmapp,
- false);
+static bool
+slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ bool lock_flush_tlb)
+{
+ return slot_handle_level_range(kvm, memslot, fn, start_level,
+ end_level, memslot->base_gfn,
+ memslot->base_gfn + memslot->npages - 1,
+ lock_flush_tlb);
+}
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
- }
+static bool
+slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static bool
+slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static bool
+slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+}
+
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+
+ slots = kvm_memslots(kvm);
+
+ spin_lock(&kvm->mmu_lock);
+ kvm_for_each_memslot(memslot, slots) {
+ gfn_t start, end;
+
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (start >= end)
+ continue;
+
+ slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true);
}
spin_unlock(&kvm->mmu_lock);
+}
+
+static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp)
+{
+ return __rmap_write_protect(kvm, rmapp, false);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
+ false);
+ spin_unlock(&kvm->mmu_lock);
/*
* kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
@@ -4482,9 +4596,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
pfn_t pfn;
struct kvm_mmu_page *sp;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+restart:
+ for_each_rmap_spte(rmapp, &iter, sptep) {
sp = page_header(__pa(sptep));
pfn = spte_to_pfn(*sptep);
@@ -4499,10 +4612,9 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
!kvm_is_reserved_pfn(pfn) &&
PageTransCompound(pfn_to_page(pfn))) {
drop_spte(kvm, sptep);
- sptep = rmap_get_first(*rmapp, &iter);
need_tlb_flush = 1;
- } else
- sptep = rmap_get_next(&iter);
+ goto restart;
+ }
}
return need_tlb_flush;
@@ -4511,59 +4623,18 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- bool flush = false;
- unsigned long *rmapp;
- unsigned long last_index, index;
-
spin_lock(&kvm->mmu_lock);
-
- rmapp = memslot->arch.rmap[0];
- last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
- memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- if (flush) {
- kvm_flush_remote_tlbs(kvm);
- flush = false;
- }
- cond_resched_lock(&kvm->mmu_lock);
- }
- }
-
- if (flush)
- kvm_flush_remote_tlbs(kvm);
-
+ slot_handle_leaf(kvm, memslot, kvm_mmu_zap_collapsible_spte, true);
spin_unlock(&kvm->mmu_lock);
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- gfn_t last_gfn;
- unsigned long *rmapp;
- unsigned long last_index, index;
- bool flush = false;
-
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ bool flush;
spin_lock(&kvm->mmu_lock);
-
- rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn,
- PT_PAGE_TABLE_LEVEL);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_clear_dirty(kvm, rmapp);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
- }
-
+ flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
spin_unlock(&kvm->mmu_lock);
lockdep_assert_held(&kvm->slots_lock);
@@ -4582,31 +4653,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- gfn_t last_gfn;
- int i;
- bool flush = false;
-
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ bool flush;
spin_lock(&kvm->mmu_lock);
-
- for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- unsigned long *rmapp;
- unsigned long last_index, index;
-
- rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_write_protect(kvm, rmapp,
- false);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
- }
- }
+ flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
+ false);
spin_unlock(&kvm->mmu_lock);
/* see kvm_mmu_slot_remove_write_access */
@@ -4620,31 +4671,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
void kvm_mmu_slot_set_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- gfn_t last_gfn;
- int i;
- bool flush = false;
-
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ bool flush;
spin_lock(&kvm->mmu_lock);
-
- for (i = PT_PAGE_TABLE_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- unsigned long *rmapp;
- unsigned long last_index, index;
-
- rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_set_dirty(kvm, rmapp);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
- }
- }
-
+ flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
spin_unlock(&kvm->mmu_lock);
lockdep_assert_held(&kvm->slots_lock);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 0ada65ecddcf..398d21c0f6dd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -43,6 +43,7 @@
#define PT_PDPE_LEVEL 3
#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
+#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
static inline u64 rsvd_bits(int s, int e)
{
@@ -170,4 +171,5 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
#endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 9ade5cfb5a4c..368d53497314 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -197,13 +197,11 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;
- sptep = rmap_get_next(&iter)) {
+ for_each_rmap_spte(rmapp, &iter, sptep)
if (is_writable_pte(*sptep))
audit_printk(kvm, "shadow page has writable "
"mappings: gfn %llx role %x\n",
sp->gfn, sp->role.word);
- }
}
static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9afa233b5482..b9f9e1073e50 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1082,7 +1082,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
return target_tsc - tsc;
}
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct vcpu_svm *svm, bool init_event)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1153,17 +1153,17 @@ static void init_vmcb(struct vcpu_svm *svm)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- svm_set_efer(&svm->vcpu, 0);
+ if (!init_event)
+ svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
/*
- * This is the guest-visible cr0 value.
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
+ * It also updates the guest-visible cr0 value.
*/
- svm->vcpu.arch.cr0 = 0;
(void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
save->cr4 = X86_CR4_PAE;
@@ -1176,7 +1176,7 @@ static void init_vmcb(struct vcpu_svm *svm)
clr_exception_intercept(svm, PF_VECTOR);
clr_cr_intercept(svm, INTERCEPT_CR3_READ);
clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
- save->g_pat = 0x0007040600070406ULL;
+ save->g_pat = svm->vcpu.arch.pat;
save->cr3 = 0;
save->cr4 = 0;
}
@@ -1195,13 +1195,19 @@ static void init_vmcb(struct vcpu_svm *svm)
enable_gif(svm);
}
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 dummy;
u32 eax = 1;
- init_vmcb(svm);
+ if (!init_event) {
+ svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
+ svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ }
+ init_vmcb(svm, init_event);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1257,12 +1263,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
- init_vmcb(svm);
-
- svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
- svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ init_vmcb(svm, false);
svm_init_osvw(&svm->vcpu);
@@ -1575,7 +1576,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
* does not do it - this results in some delay at
* reboot
*/
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+ if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED))
+ cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
mark_dirty(svm->vmcb, VMCB_CR);
update_cr0_intercept(svm);
@@ -1883,7 +1885,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
* so reinitialize it.
*/
clear_page(svm->vmcb);
- init_vmcb(svm);
+ init_vmcb(svm, false);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2d73807f0d31..9cf5030927d5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2170,8 +2170,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_nested;
- else if (irqchip_in_kernel(vcpu->kvm) &&
- apic_x2apic_mode(vcpu->arch.apic)) {
+ else if (vcpu->arch.apic_base & X2APIC_ENABLE) {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
else
@@ -4667,16 +4666,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- u32 msr_low, msr_high;
- u64 host_pat;
- rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
- host_pat = msr_low | ((u64) msr_high << 32);
- /* Write the default value follow host pat */
- vmcs_write64(GUEST_IA32_PAT, host_pat);
- /* Keep arch.pat sync with GUEST_IA32_PAT */
- vmx->vcpu.arch.pat = host_pat;
- }
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
u32 index = vmx_msr_index[i];
@@ -4708,22 +4699,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
return 0;
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct msr_data apic_base_msr;
+ u64 cr0;
vmx->rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
- kvm_set_cr8(&vmx->vcpu, 0);
- apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
- apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
- apic_base_msr.host_initiated = true;
- kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
+ kvm_set_cr8(vcpu, 0);
+
+ if (!init_event) {
+ apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+ apic_base_msr.host_initiated = true;
+ kvm_set_apic_base(vcpu, &apic_base_msr);
+ }
vmx_segment_cache_clear(vmx);
@@ -4747,9 +4743,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ if (!init_event) {
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ }
vmcs_writel(GUEST_RFLAGS, 0x02);
kvm_rip_write(vcpu, 0xfff0);
@@ -4764,18 +4763,15 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
- /* Special registers */
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
setup_msrs(vmx);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
- if (cpu_has_vmx_tpr_shadow()) {
+ if (cpu_has_vmx_tpr_shadow() && !init_event) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+ if (vm_need_tpr_shadow(vcpu->kvm))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- __pa(vmx->vcpu.arch.apic->regs));
+ __pa(vcpu->arch.apic->regs));
vmcs_write32(TPR_THRESHOLD, 0);
}
@@ -4787,12 +4783,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
if (vmx->vpid != 0)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
- vmx_set_cr4(&vmx->vcpu, 0);
- vmx_set_efer(&vmx->vcpu, 0);
- vmx_fpu_activate(&vmx->vcpu);
- update_exception_bitmap(&vmx->vcpu);
+ cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+ vmx_set_cr0(vcpu, cr0); /* enter rmode */
+ vmx->vcpu.arch.cr0 = cr0;
+ vmx_set_cr4(vcpu, 0);
+ if (!init_event)
+ vmx_set_efer(vcpu, 0);
+ vmx_fpu_activate(vcpu);
+ update_exception_bitmap(vcpu);
vpid_sync_context(vmx);
}
@@ -5710,9 +5708,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
return 0;
}
- /* clear all local breakpoint enable flags */
- vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
-
/*
* TODO: What about debug traps on tss switch?
* Are we supposed to inject them and update dr6?
@@ -7691,6 +7686,158 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
kvm_vcpu_kick(vcpu);
}
+static void vmx_dump_sel(char *name, uint32_t sel)
+{
+ pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read32(sel),
+ vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
+ vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
+ vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
+}
+
+static void vmx_dump_dtsel(char *name, uint32_t limit)
+{
+ pr_err("%s limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read32(limit),
+ vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+}
+
+static void dump_vmcs(void)
+{
+ u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+ u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+ u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+ u32 secondary_exec_control = 0;
+ unsigned long cr4 = vmcs_readl(GUEST_CR4);
+ u64 efer = vmcs_readl(GUEST_IA32_EFER);
+ int i, n;
+
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ pr_err("*** Guest State ***\n");
+ pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
+ vmcs_readl(CR0_GUEST_HOST_MASK));
+ pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
+ pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+ (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
+ {
+ pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n",
+ vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1));
+ pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n",
+ vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3));
+ }
+ pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
+ vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
+ pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
+ vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(GUEST_SYSENTER_ESP),
+ vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
+ vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
+ vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
+ vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
+ vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
+ vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
+ vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
+ vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
+ vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
+ vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
+ vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
+ if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
+ (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
+ pr_err("EFER = 0x%016llx PAT = 0x%016lx\n",
+ efer, vmcs_readl(GUEST_IA32_PAT));
+ pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n",
+ vmcs_readl(GUEST_IA32_DEBUGCTL),
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016lx\n",
+ vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL));
+ if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
+ pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS));
+ pr_err("Interruptibility = %08x ActivityState = %08x\n",
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
+ vmcs_read32(GUEST_ACTIVITY_STATE));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ pr_err("InterruptStatus = %04x\n",
+ vmcs_read16(GUEST_INTR_STATUS));
+
+ pr_err("*** Host State ***\n");
+ pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
+ vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
+ pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+ vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
+ vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
+ vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
+ vmcs_read16(HOST_TR_SELECTOR));
+ pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+ vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
+ vmcs_readl(HOST_TR_BASE));
+ pr_err("GDTBase=%016lx IDTBase=%016lx\n",
+ vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
+ pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+ vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
+ vmcs_readl(HOST_CR4));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(HOST_IA32_SYSENTER_ESP),
+ vmcs_read32(HOST_IA32_SYSENTER_CS),
+ vmcs_readl(HOST_IA32_SYSENTER_EIP));
+ if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
+ pr_err("EFER = 0x%016lx PAT = 0x%016lx\n",
+ vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016lx\n",
+ vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL));
+
+ pr_err("*** Control State ***\n");
+ pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
+ pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
+ pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+ pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+ vmcs_read32(EXCEPTION_BITMAP),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
+ pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+ vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
+ pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+ pr_err(" reason=%08x qualification=%016lx\n",
+ vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
+ pr_err("IDTVectoring: info=%08x errcode=%08x\n",
+ vmcs_read32(IDT_VECTORING_INFO_FIELD),
+ vmcs_read32(IDT_VECTORING_ERROR_CODE));
+ pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+ if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
+ pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
+ pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
+ pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER));
+ n = vmcs_read32(CR3_TARGET_COUNT);
+ for (i = 0; i + 1 < n; i += 4)
+ pr_err("CR3 target%u=%016lx target%u=%016lx\n",
+ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
+ i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
+ if (i < n)
+ pr_err("CR3 target%u=%016lx\n",
+ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
+ if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+ pr_err("PLE Gap=%08x Window=%08x\n",
+ vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
+ if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+ pr_err("Virtual processor ID = 0x%04x\n",
+ vmcs_read16(VIRTUAL_PROCESSOR_ID));
+}
+
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@@ -7723,6 +7870,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ dump_vmcs();
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason;
@@ -8924,7 +9072,7 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
struct vmx_msr_entry *e)
{
/* x2APIC MSR accesses are not allowed */
- if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
+ if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
return -EINVAL;
if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
e->index == MSR_IA32_UCODE_REV)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ea306adbbc13..d42d8ace90f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -572,8 +572,7 @@ out:
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
- X86_CR0_CD | X86_CR0_NW;
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
@@ -1852,6 +1851,63 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state;
+ unsigned char mtrr_enabled = mtrr_state->enabled;
+ gfn_t start, end, mask;
+ int index;
+ bool is_fixed = true;
+
+ if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+ !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return;
+
+ if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType)
+ return;
+
+ switch (msr) {
+ case MSR_MTRRfix64K_00000:
+ start = 0x0;
+ end = 0x80000;
+ break;
+ case MSR_MTRRfix16K_80000:
+ start = 0x80000;
+ end = 0xa0000;
+ break;
+ case MSR_MTRRfix16K_A0000:
+ start = 0xa0000;
+ end = 0xc0000;
+ break;
+ case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+ index = msr - MSR_MTRRfix4K_C0000;
+ start = 0xc0000 + index * (32 << 10);
+ end = start + (32 << 10);
+ break;
+ case MSR_MTRRdefType:
+ is_fixed = false;
+ start = 0x0;
+ end = ~0ULL;
+ break;
+ default:
+ /* variable range MTRRs. */
+ is_fixed = false;
+ index = (msr - 0x200) / 2;
+ start = (((u64)mtrr_state->var_ranges[index].base_hi) << 32) +
+ (mtrr_state->var_ranges[index].base_lo & PAGE_MASK);
+ mask = (((u64)mtrr_state->var_ranges[index].mask_hi) << 32) +
+ (mtrr_state->var_ranges[index].mask_lo & PAGE_MASK);
+ mask |= ~0ULL << cpuid_maxphyaddr(vcpu);
+
+ end = ((start & mask) | ~mask) + 1;
+ }
+
+ if (is_fixed && !(mtrr_enabled & 0x1))
+ return;
+
+ kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
@@ -1885,7 +1941,7 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
*pt = data;
}
- kvm_mmu_reset_context(vcpu);
+ update_mtrr(vcpu, msr);
return 0;
}
@@ -2798,6 +2854,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
+ case KVM_CAP_ENABLE_CAP_VM:
+ case KVM_CAP_DISABLE_QUIRKS:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -3845,6 +3903,26 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
return 0;
}
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS:
+ kvm->arch.disabled_quirks = cap->args[0];
+ r = 0;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4097,7 +4175,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+ break;
+ }
default:
r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
}
@@ -5952,6 +6038,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
lapic_irq.shorthand = 0;
lapic_irq.dest_mode = 0;
lapic_irq.dest_id = apicid;
+ lapic_irq.msi_redir_hint = false;
lapic_irq.delivery_mode = APIC_DM_REMRD;
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
@@ -6347,7 +6434,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
- kvm_guest_enter();
+ __kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -7003,7 +7090,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
return 0;
}
-int fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu, bool init_event)
{
int err;
@@ -7011,7 +7098,9 @@ int fx_init(struct kvm_vcpu *vcpu)
if (err)
return err;
- fpu_finit(&vcpu->arch.guest_fpu);
+ if (!init_event)
+ fpu_finit(&vcpu->arch.guest_fpu);
+
if (cpu_has_xsaves)
vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
host_xcr0 | XSTATE_COMPACTION_ENABLED;
@@ -7053,16 +7142,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
kvm_put_guest_xcr0(vcpu);
- if (!vcpu->guest_fpu_loaded)
+ if (!vcpu->guest_fpu_loaded) {
+ vcpu->fpu_counter = 0;
return;
+ }
vcpu->guest_fpu_loaded = 0;
fpu_save_init(&vcpu->arch.guest_fpu);
__kernel_fpu_end();
++vcpu->stat.fpu_reload;
- if (!vcpu->arch.eager_fpu)
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
-
+ /*
+ * If using eager FPU mode, or if the guest is a frequent user
+ * of the FPU, just leave the FPU active for next time.
+ * Every 255 times fpu_counter rolls over to 0; a guest that uses
+ * the FPU in bursts will revert to loading it on demand.
+ */
+ if (!vcpu->arch.eager_fpu) {
+ if (++vcpu->fpu_counter < 5)
+ kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ }
trace_kvm_fpu(0);
}
@@ -7103,7 +7201,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
r = vcpu_load(vcpu);
if (r)
return r;
- kvm_vcpu_reset(vcpu);
+ kvm_vcpu_reset(vcpu, false);
kvm_mmu_setup(vcpu);
vcpu_put(vcpu);
@@ -7141,7 +7239,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_free(vcpu);
}
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
@@ -7168,13 +7266,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- kvm_pmu_reset(vcpu);
+ if (!init_event)
+ kvm_pmu_reset(vcpu);
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_x86_ops->vcpu_reset(vcpu);
+ kvm_x86_ops->vcpu_reset(vcpu, init_event);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -7363,7 +7462,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_free_mce_banks;
}
- r = fx_init(vcpu);
+ r = fx_init(vcpu, false);
if (r)
goto fail_free_wbinvd_dirty_mask;
@@ -7375,6 +7474,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f5fef1868096..01a1d011e073 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -4,6 +4,8 @@
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
+#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ad45054309a0..87fd74a04005 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -230,6 +230,7 @@ struct kvm_vcpu {
int fpu_active;
int guest_fpu_loaded, guest_xcr0_loaded;
+ unsigned char fpu_counter;
wait_queue_head_t wq;
struct pid *pid;
int sigset_active;
@@ -538,13 +539,13 @@ void kvm_release_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
- bool write_fault, bool *writable);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable);
pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+ bool *async, bool write_fault, bool *writable);
void kvm_release_pfn_clean(pfn_t pfn);
void kvm_set_pfn_dirty(pfn_t pfn);
@@ -762,16 +763,10 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
}
#endif
-static inline void kvm_guest_enter(void)
+/* must be called with irqs disabled */
+static inline void __kvm_guest_enter(void)
{
- unsigned long flags;
-
- BUG_ON(preemptible());
-
- local_irq_save(flags);
guest_enter();
- local_irq_restore(flags);
-
/* KVM does not hold any references to rcu protected data when it
* switches CPU into a guest mode. In fact switching to a guest mode
* is very similar to exiting to userspace from rcu point of view. In
@@ -783,12 +778,27 @@ static inline void kvm_guest_enter(void)
rcu_virt_note_context_switch(smp_processor_id());
}
+/* must be called with irqs disabled */
+static inline void __kvm_guest_exit(void)
+{
+ guest_exit();
+}
+
+static inline void kvm_guest_enter(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __kvm_guest_enter();
+ local_irq_restore(flags);
+}
+
static inline void kvm_guest_exit(void)
{
unsigned long flags;
local_irq_save(flags);
- guest_exit();
+ __kvm_guest_exit();
local_irq_restore(flags);
}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4b60056776d1..75bd9f7fd846 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -814,6 +814,7 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_S390_INJECT_IRQ 113
#define KVM_CAP_S390_IRQ_STATE 114
#define KVM_CAP_PPC_HWRNG 115
+#define KVM_CAP_DISABLE_QUIRKS 116
#ifdef KVM_CAP_IRQ_ROUTING
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 90977418aeb6..bd3c08a7c6c2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1355,9 +1355,8 @@ exit:
return pfn;
}
-static pfn_t
-__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
- bool *async, bool write_fault, bool *writable)
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+ bool *async, bool write_fault, bool *writable)
{
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
@@ -1376,44 +1375,35 @@ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
return hva_to_pfn(addr, atomic, async, write_fault,
writable);
}
+EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic,
bool write_fault, bool *writable)
{
struct kvm_memory_slot *slot;
- if (async)
- *async = false;
-
slot = gfn_to_memslot(kvm, gfn);
- return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+ return __gfn_to_pfn_memslot(slot, gfn, atomic, NULL, write_fault,
writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
{
- return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
+ return __gfn_to_pfn(kvm, gfn, true, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
- bool write_fault, bool *writable)
-{
- return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
-
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
- return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
+ return __gfn_to_pfn(kvm, gfn, false, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable)
{
- return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+ return __gfn_to_pfn(kvm, gfn, false, write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
@@ -1590,15 +1580,17 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
int offset, int len)
{
int r;
+ struct kvm_memory_slot *memslot;
unsigned long addr;
- addr = gfn_to_hva(kvm, gfn);
+ memslot = gfn_to_memslot(kvm, gfn);
+ addr = gfn_to_hva_memslot(memslot, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
r = __copy_to_user((void __user *)addr + offset, data, len);
if (r)
return -EFAULT;
- mark_page_dirty(kvm, gfn);
+ mark_page_dirty_in_slot(kvm, memslot, gfn);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);
@@ -2882,18 +2874,12 @@ static int hardware_enable_all(void)
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
void *v)
{
- int cpu = (long)v;
-
val &= ~CPU_TASKS_FROZEN;
switch (val) {
case CPU_DYING:
- pr_info("kvm: disabling virtualization on CPU%d\n",
- cpu);
hardware_disable();
break;
case CPU_STARTING:
- pr_info("kvm: enabling virtualization on CPU%d\n",
- cpu);
hardware_enable();
break;
}