summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/api.txt4
-rw-r--r--MAINTAINERS2
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h3
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h140
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_asm.h17
-rw-r--r--arch/powerpc/include/asm/kvm_host.h6
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h3
-rw-r--r--arch/powerpc/kernel/asm-offsets.c3
-rw-r--r--arch/powerpc/kernel/idle_book3s.S35
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c124
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c51
-rw-r--r--arch/powerpc/kvm/book3s_64_slb.S2
-rw-r--r--arch/powerpc/kvm/book3s_hv.c337
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c117
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c65
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S205
-rw-r--r--arch/powerpc/kvm/book3s_pr.c16
-rw-r--r--arch/powerpc/kvm/book3s_pr_papr.c2
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c2
-rw-r--r--arch/powerpc/kvm/powerpc.c7
-rw-r--r--arch/x86/include/asm/kvm_emulate.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h8
-rw-r--r--arch/x86/include/asm/vmx.h4
-rw-r--r--arch/x86/kvm/emulate.c9
-rw-r--r--arch/x86/kvm/lapic.c91
-rw-r--r--arch/x86/kvm/mmu.c130
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/paging_tmpl.h21
-rw-r--r--arch/x86/kvm/svm.c241
-rw-r--r--arch/x86/kvm/vmx.c210
-rw-r--r--arch/x86/kvm/x86.c94
-rwxr-xr-xtools/kvm/kvm_stat/kvm_stat30
-rw-r--r--virt/kvm/kvm_main.c2
33 files changed, 1355 insertions, 631 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e63a35fafef0..dd2dd96927b8 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1124,10 +1124,14 @@ guest physical address space and must not conflict with any memory slot
or any mmio address. The guest may malfunction if it accesses this memory
region.
+Setting the address to 0 will result in resetting the address to its default
+(0xfffbc000).
+
This ioctl is required on Intel-based hosts. This is needed on Intel hardware
because of a quirk in the virtualization implementation (see the internals
documentation when it pops into existence).
+Fails if any VCPU has already been created.
4.41 KVM_SET_BOOT_CPU_ID
diff --git a/MAINTAINERS b/MAINTAINERS
index 2d3d750b19c0..6457e5163f93 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7571,7 +7571,7 @@ F: arch/mips/include/asm/kvm*
F: arch/mips/kvm/
KERNEL VIRTUAL MACHINE FOR POWERPC (KVM/powerpc)
-M: Alexander Graf <agraf@suse.com>
+M: Paul Mackerras <paulus@ozlabs.org>
L: kvm-ppc@vger.kernel.org
W: http://www.linux-kvm.org/
T: git git://github.com/agraf/linux-2.6.git
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index b8d5b8e35244..9a667007bff8 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -216,7 +216,8 @@ extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
bool writing, bool *writable);
extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
unsigned long *rmap, long pte_index, int realmode);
-extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
+extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
+ unsigned long gfn, unsigned long psize);
extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
unsigned long pte_index);
void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d55c7f881ce7..735cfa35298a 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,6 +20,8 @@
#ifndef __ASM_KVM_BOOK3S_64_H__
#define __ASM_KVM_BOOK3S_64_H__
+#include <linux/string.h>
+#include <asm/bitops.h>
#include <asm/book3s/64/mmu-hash.h>
/* Power architecture requires HPT is at least 256kiB, at most 64TiB */
@@ -107,18 +109,96 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
hpte[0] = cpu_to_be64(hpte_v);
}
+/*
+ * These functions encode knowledge of the POWER7/8/9 hardware
+ * interpretations of the HPTE LP (large page size) field.
+ */
+static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l)
+{
+ unsigned int lphi;
+
+ if (!(h & HPTE_V_LARGE))
+ return 12; /* 4kB */
+ lphi = (l >> 16) & 0xf;
+ switch ((l >> 12) & 0xf) {
+ case 0:
+ return !lphi ? 24 : -1; /* 16MB */
+ break;
+ case 1:
+ return 16; /* 64kB */
+ break;
+ case 3:
+ return !lphi ? 34 : -1; /* 16GB */
+ break;
+ case 7:
+ return (16 << 8) + 12; /* 64kB in 4kB */
+ break;
+ case 8:
+ if (!lphi)
+ return (24 << 8) + 16; /* 16MB in 64kkB */
+ if (lphi == 3)
+ return (24 << 8) + 12; /* 16MB in 4kB */
+ break;
+ }
+ return -1;
+}
+
+static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l)
+{
+ return kvmppc_hpte_page_shifts(h, l) & 0xff;
+}
+
+static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l)
+{
+ int tmp = kvmppc_hpte_page_shifts(h, l);
+
+ if (tmp >= 0x100)
+ tmp >>= 8;
+ return tmp;
+}
+
+static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r)
+{
+ return 1ul << kvmppc_hpte_actual_page_shift(v, r);
+}
+
+static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift)
+{
+ switch (base_shift) {
+ case 12:
+ switch (actual_shift) {
+ case 12:
+ return 0;
+ case 16:
+ return 7;
+ case 24:
+ return 0x38;
+ }
+ break;
+ case 16:
+ switch (actual_shift) {
+ case 16:
+ return 1;
+ case 24:
+ return 8;
+ }
+ break;
+ case 24:
+ return 0;
+ }
+ return -1;
+}
+
static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
unsigned long pte_index)
{
- int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
- unsigned int penc;
+ int a_pgshift, b_pgshift;
unsigned long rb = 0, va_low, sllp;
- unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
- if (v & HPTE_V_LARGE) {
- i = hpte_page_sizes[lp];
- b_psize = i & 0xf;
- a_psize = i >> 4;
+ b_pgshift = a_pgshift = kvmppc_hpte_page_shifts(v, r);
+ if (a_pgshift >= 0x100) {
+ b_pgshift &= 0xff;
+ a_pgshift >>= 8;
}
/*
@@ -152,37 +232,33 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
va_low ^= v >> (SID_SHIFT_1T - 16);
va_low &= 0x7ff;
- switch (b_psize) {
- case MMU_PAGE_4K:
- sllp = get_sllp_encoding(a_psize);
- rb |= sllp << 5; /* AP field */
+ if (b_pgshift == 12) {
+ if (a_pgshift > 12) {
+ sllp = (a_pgshift == 16) ? 5 : 4;
+ rb |= sllp << 5; /* AP field */
+ }
rb |= (va_low & 0x7ff) << 12; /* remaining 11 bits of AVA */
- break;
- default:
- {
+ } else {
int aval_shift;
/*
* remaining bits of AVA/LP fields
* Also contain the rr bits of LP
*/
- rb |= (va_low << mmu_psize_defs[b_psize].shift) & 0x7ff000;
+ rb |= (va_low << b_pgshift) & 0x7ff000;
/*
* Now clear not needed LP bits based on actual psize
*/
- rb &= ~((1ul << mmu_psize_defs[a_psize].shift) - 1);
+ rb &= ~((1ul << a_pgshift) - 1);
/*
* AVAL field 58..77 - base_page_shift bits of va
* we have space for 58..64 bits, Missing bits should
* be zero filled. +1 is to take care of L bit shift
*/
- aval_shift = 64 - (77 - mmu_psize_defs[b_psize].shift) + 1;
+ aval_shift = 64 - (77 - b_pgshift) + 1;
rb |= ((va_low << aval_shift) & 0xfe);
rb |= 1; /* L field */
- penc = mmu_psize_defs[b_psize].penc[a_psize];
- rb |= penc << 12; /* LP field */
- break;
- }
+ rb |= r & 0xff000 & ((1ul << a_pgshift) - 1); /* LP field */
}
rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */
return rb;
@@ -370,6 +446,28 @@ static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt)
return (1UL << (hpt->order - 7)) - 1;
}
+/* Set bits in a dirty bitmap, which is in LE format */
+static inline void set_dirty_bits(unsigned long *map, unsigned long i,
+ unsigned long npages)
+{
+
+ if (npages >= 8)
+ memset((char *)map + i / 8, 0xff, npages / 8);
+ else
+ for (; npages; ++i, --npages)
+ __set_bit_le(i, map);
+}
+
+static inline void set_dirty_bits_atomic(unsigned long *map, unsigned long i,
+ unsigned long npages)
+{
+ if (npages >= 8)
+ memset((char *)map + i / 8, 0xff, npages / 8);
+ else
+ for (; npages; ++i, --npages)
+ set_bit_le(i, map);
+}
+
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 83596f32f50b..ab386af2904f 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -82,6 +82,16 @@ struct kvm_split_mode {
u8 do_nap;
u8 napped[MAX_SMT_THREADS];
struct kvmppc_vcore *vc[MAX_SUBCORES];
+ /* Bits for changing lpcr on P9 */
+ unsigned long lpcr_req;
+ unsigned long lpidr_req;
+ unsigned long host_lpcr;
+ u32 do_set;
+ u32 do_restore;
+ union {
+ u32 allphases;
+ u8 phase[4];
+ } lpcr_sync;
};
/*
@@ -104,14 +114,11 @@ struct kvmppc_host_state {
u8 napping;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- /*
- * hwthread_req/hwthread_state pair is used to pull sibling threads
- * out of guest on pre-ISAv3.0B CPUs where threads share MMU.
- */
u8 hwthread_req;
u8 hwthread_state;
u8 host_ipi;
- u8 ptid;
+ u8 ptid; /* thread number within subcore when split */
+ u8 tid; /* thread number within whole core */
struct kvm_vcpu *kvm_vcpu;
struct kvmppc_vcore *kvm_vcore;
void __iomem *xics_phys;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e372ed871c51..3aa5b577cd60 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -235,10 +235,7 @@ struct revmap_entry {
*/
#define KVMPPC_RMAP_LOCK_BIT 63
#define KVMPPC_RMAP_RC_SHIFT 32
-#define KVMPPC_RMAP_CHG_SHIFT 48
#define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
-#define KVMPPC_RMAP_CHANGED (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
-#define KVMPPC_RMAP_CHG_ORDER (0x3ful << KVMPPC_RMAP_CHG_SHIFT)
#define KVMPPC_RMAP_PRESENT 0x100000000ul
#define KVMPPC_RMAP_INDEX 0xfffffffful
@@ -276,7 +273,7 @@ struct kvm_arch {
int tlbie_lock;
unsigned long lpcr;
unsigned long vrma_slb_v;
- int hpte_setup_done;
+ int mmu_ready;
atomic_t vcpus_running;
u32 online_vcores;
atomic_t hpte_mod_interest;
@@ -284,6 +281,7 @@ struct kvm_arch {
cpumask_t cpu_in_guest;
u8 radix;
u8 fwnmi_enabled;
+ bool threads_indep;
pgd_t *pgtable;
u64 process_table;
struct dentry *debugfs_dir;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index ba5fadd6f3c9..96753f3aac6d 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -168,6 +168,7 @@ extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order);
extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info);
extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order);
extern void kvmppc_free_hpt(struct kvm_hpt_info *info);
+extern void kvmppc_rmap_reset(struct kvm *kvm);
extern long kvmppc_prepare_vrma(struct kvm *kvm,
struct kvm_userspace_memory_region *mem);
extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
@@ -177,6 +178,8 @@ extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
struct iommu_group *grp);
extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
struct iommu_group *grp);
+extern int kvmppc_switch_mmu_to_hpt(struct kvm *kvm);
+extern int kvmppc_switch_mmu_to_radix(struct kvm *kvm);
extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce_64 *args);
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8cfb20e38cfe..519fad556113 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -642,6 +642,7 @@ int main(void)
HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
HSTATE_FIELD(HSTATE_PTID, ptid);
+ HSTATE_FIELD(HSTATE_TID, tid);
HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
@@ -667,6 +668,8 @@ int main(void)
OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
+ OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
+ OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 1125c9be9e06..175d49f468af 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -319,20 +319,13 @@ enter_winkle:
/*
* r3 - PSSCR value corresponding to the requested stop state.
*/
+power_enter_stop:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-power_enter_stop_kvm_rm:
- /*
- * This is currently unused because POWER9 KVM does not have to
- * gather secondary threads into sibling mode, but the code is
- * here in case that function is required.
- *
- * Tell KVM we're entering idle.
- */
+ /* Tell KVM we're entering idle */
li r4,KVM_HWTHREAD_IN_IDLE
/* DO THIS IN REAL MODE! See comment above. */
stb r4,HSTATE_HWTHREAD_STATE(r13)
#endif
-power_enter_stop:
/*
* Check if we are executing the lite variant with ESL=EC=0
*/
@@ -496,18 +489,6 @@ pnv_powersave_wakeup_mce:
b pnv_powersave_wakeup
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-kvm_start_guest_check:
- li r0,KVM_HWTHREAD_IN_KERNEL
- stb r0,HSTATE_HWTHREAD_STATE(r13)
- /* Order setting hwthread_state vs. testing hwthread_req */
- sync
- lbz r0,HSTATE_HWTHREAD_REQ(r13)
- cmpwi r0,0
- beqlr
- b kvm_start_guest
-#endif
-
/*
* Called from reset vector for powersave wakeups.
* cr3 - set to gt if waking up with partial/complete hypervisor state loss
@@ -532,9 +513,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
mr r3,r12
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-BEGIN_FTR_SECTION
- bl kvm_start_guest_check
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+ li r0,KVM_HWTHREAD_IN_KERNEL
+ stb r0,HSTATE_HWTHREAD_STATE(r13)
+ /* Order setting hwthread_state vs. testing hwthread_req */
+ sync
+ lbz r0,HSTATE_HWTHREAD_REQ(r13)
+ cmpwi r0,0
+ beq 1f
+ b kvm_start_guest
+1:
#endif
/* Return SRR1 from power7_nap() */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 7c62967d672c..6aec8a22aeff 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -73,8 +73,6 @@ struct kvm_resize_hpt {
struct kvm_hpt_info hpt;
};
-static void kvmppc_rmap_reset(struct kvm *kvm);
-
int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
{
unsigned long hpt = 0;
@@ -106,7 +104,6 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
/* Allocate reverse map array */
rev = vmalloc(sizeof(struct revmap_entry) * npte);
if (!rev) {
- pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
if (cma)
kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
else
@@ -137,19 +134,22 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
long err = -EBUSY;
struct kvm_hpt_info info;
- if (kvm_is_radix(kvm))
- return -EINVAL;
-
mutex_lock(&kvm->lock);
- if (kvm->arch.hpte_setup_done) {
- kvm->arch.hpte_setup_done = 0;
- /* order hpte_setup_done vs. vcpus_running */
+ if (kvm->arch.mmu_ready) {
+ kvm->arch.mmu_ready = 0;
+ /* order mmu_ready vs. vcpus_running */
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
- kvm->arch.hpte_setup_done = 1;
+ kvm->arch.mmu_ready = 1;
goto out;
}
}
+ if (kvm_is_radix(kvm)) {
+ err = kvmppc_switch_mmu_to_hpt(kvm);
+ if (err)
+ goto out;
+ }
+
if (kvm->arch.hpt.order == order) {
/* We already have a suitable HPT */
@@ -183,6 +183,7 @@ out:
void kvmppc_free_hpt(struct kvm_hpt_info *info)
{
vfree(info->rev);
+ info->rev = NULL;
if (info->cma)
kvm_free_hpt_cma(virt_to_page(info->virt),
1 << (info->order - PAGE_SHIFT));
@@ -334,7 +335,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
{
unsigned long ra_mask;
- ra_mask = hpte_page_size(v, r) - 1;
+ ra_mask = kvmppc_actual_pgsz(v, r) - 1;
return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}
@@ -350,6 +351,9 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
int index;
int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
+ if (kvm_is_radix(vcpu->kvm))
+ return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite);
+
/* Get SLB entry */
if (virtmode) {
slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
@@ -505,7 +509,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
mmio_update = atomic64_read(&kvm->arch.mmio_update);
if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
r = vcpu->arch.pgfault_cache->rpte;
- psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
+ psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
+ r);
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
gfn_base = gpa_base >> PAGE_SHIFT;
gpa = gpa_base | (ea & (psize - 1));
@@ -534,7 +539,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
return RESUME_GUEST;
/* Translate the logical address and get the page */
- psize = hpte_page_size(hpte[0], r);
+ psize = kvmppc_actual_pgsz(hpte[0], r);
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
gfn_base = gpa_base >> PAGE_SHIFT;
gpa = gpa_base | (ea & (psize - 1));
@@ -710,7 +715,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_put;
}
-static void kvmppc_rmap_reset(struct kvm *kvm)
+void kvmppc_rmap_reset(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
@@ -776,6 +781,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
/* Must be called with both HPTE and rmap locked */
static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
+ struct kvm_memory_slot *memslot,
unsigned long *rmapp, unsigned long gfn)
{
__be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
@@ -798,7 +804,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
/* Now check and modify the HPTE */
ptel = rev[i].guest_rpte;
- psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
+ psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel);
if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
hpte_rpn(ptel, psize) == gfn) {
hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
@@ -807,8 +813,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
/* Harvest R and C */
rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
- if (rcbits & HPTE_R_C)
- kvmppc_update_rmap_change(rmapp, psize);
+ if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap)
+ kvmppc_update_dirty_map(memslot, gfn, psize);
if (rcbits & ~rev[i].guest_rpte) {
rev[i].guest_rpte = ptel | rcbits;
note_hpte_modification(kvm, &rev[i]);
@@ -846,7 +852,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
continue;
}
- kvmppc_unmap_hpte(kvm, i, rmapp, gfn);
+ kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn);
unlock_rmap(rmapp);
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
}
@@ -1029,14 +1035,6 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
retry:
lock_rmap(rmapp);
- if (*rmapp & KVMPPC_RMAP_CHANGED) {
- long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
- >> KVMPPC_RMAP_CHG_SHIFT;
- *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
- npages_dirty = 1;
- if (change_order > PAGE_SHIFT)
- npages_dirty = 1ul << (change_order - PAGE_SHIFT);
- }
if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
unlock_rmap(rmapp);
return npages_dirty;
@@ -1092,7 +1090,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
rev[i].guest_rpte |= HPTE_R_C;
note_hpte_modification(kvm, &rev[i]);
}
- n = hpte_page_size(v, r);
+ n = kvmppc_actual_pgsz(v, r);
n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (n > npages_dirty)
npages_dirty = n;
@@ -1128,7 +1126,7 @@ void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map)
{
- unsigned long i, j;
+ unsigned long i;
unsigned long *rmapp;
preempt_disable();
@@ -1140,9 +1138,8 @@ long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
* since we always put huge-page HPTEs in the rmap chain
* corresponding to their page base address.
*/
- if (npages && map)
- for (j = i; npages; ++j, --npages)
- __set_bit_le(j, map);
+ if (npages)
+ set_dirty_bits(map, i, npages);
++rmapp;
}
preempt_enable();
@@ -1186,7 +1183,6 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
struct page *page = virt_to_page(va);
struct kvm_memory_slot *memslot;
unsigned long gfn;
- unsigned long *rmap;
int srcu_idx;
put_page(page);
@@ -1194,20 +1190,12 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
if (!dirty)
return;
- /* We need to mark this page dirty in the rmap chain */
+ /* We need to mark this page dirty in the memslot dirty_bitmap, if any */
gfn = gpa >> PAGE_SHIFT;
srcu_idx = srcu_read_lock(&kvm->srcu);
memslot = gfn_to_memslot(kvm, gfn);
- if (memslot) {
- if (!kvm_is_radix(kvm)) {
- rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
- lock_rmap(rmap);
- *rmap |= KVMPPC_RMAP_CHANGED;
- unlock_rmap(rmap);
- } else if (memslot->dirty_bitmap) {
- mark_page_dirty(kvm, gfn);
- }
- }
+ if (memslot && memslot->dirty_bitmap)
+ set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap);
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
@@ -1267,7 +1255,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
guest_rpte = rev->guest_rpte;
ret = -EIO;
- apsize = hpte_page_size(vpte, guest_rpte);
+ apsize = kvmppc_actual_pgsz(vpte, guest_rpte);
if (!apsize)
goto out;
@@ -1282,7 +1270,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
lock_rmap(rmapp);
- kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
+ kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn);
unlock_rmap(rmapp);
}
@@ -1455,7 +1443,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
struct kvm_resize_hpt *resize;
int ret;
- if (flags != 0)
+ if (flags != 0 || kvm_is_radix(kvm))
return -EINVAL;
if (shift && ((shift < 18) || (shift > 46)))
@@ -1521,7 +1509,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
struct kvm_resize_hpt *resize;
long ret;
- if (flags != 0)
+ if (flags != 0 || kvm_is_radix(kvm))
return -EINVAL;
if (shift && ((shift < 18) || (shift > 46)))
@@ -1533,15 +1521,15 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
/* This shouldn't be possible */
ret = -EIO;
- if (WARN_ON(!kvm->arch.hpte_setup_done))
+ if (WARN_ON(!kvm->arch.mmu_ready))
goto out_no_hpt;
/* Stop VCPUs from running while we mess with the HPT */
- kvm->arch.hpte_setup_done = 0;
+ kvm->arch.mmu_ready = 0;
smp_mb();
/* Boot all CPUs out of the guest so they re-read
- * hpte_setup_done */
+ * mmu_ready */
on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
ret = -ENXIO;
@@ -1564,7 +1552,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
out:
/* Let VCPUs run again */
- kvm->arch.hpte_setup_done = 1;
+ kvm->arch.mmu_ready = 1;
smp_mb();
out_no_hpt:
resize_hpt_release(kvm, resize);
@@ -1707,6 +1695,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
if (!access_ok(VERIFY_WRITE, buf, count))
return -EFAULT;
+ if (kvm_is_radix(kvm))
+ return 0;
first_pass = ctx->first_pass;
flags = ctx->flags;
@@ -1800,20 +1790,22 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
unsigned long tmp[2];
ssize_t nb;
long int err, ret;
- int hpte_setup;
+ int mmu_ready;
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
+ if (kvm_is_radix(kvm))
+ return -EINVAL;
/* lock out vcpus from running while we're doing this */
mutex_lock(&kvm->lock);
- hpte_setup = kvm->arch.hpte_setup_done;
- if (hpte_setup) {
- kvm->arch.hpte_setup_done = 0; /* temporarily */
- /* order hpte_setup_done vs. vcpus_running */
+ mmu_ready = kvm->arch.mmu_ready;
+ if (mmu_ready) {
+ kvm->arch.mmu_ready = 0; /* temporarily */
+ /* order mmu_ready vs. vcpus_running */
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
- kvm->arch.hpte_setup_done = 1;
+ kvm->arch.mmu_ready = 1;
mutex_unlock(&kvm->lock);
return -EBUSY;
}
@@ -1866,7 +1858,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
"r=%lx\n", ret, i, v, r);
goto out;
}
- if (!hpte_setup && is_vrma_hpte(v)) {
+ if (!mmu_ready && is_vrma_hpte(v)) {
unsigned long psize = hpte_base_page_size(v, r);
unsigned long senc = slb_pgsize_encoding(psize);
unsigned long lpcr;
@@ -1875,7 +1867,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
(VRMA_VSID << SLB_VSID_SHIFT_1T);
lpcr = senc << (LPCR_VRMASD_SH - 4);
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
- hpte_setup = 1;
+ mmu_ready = 1;
}
++i;
hptp += 2;
@@ -1891,9 +1883,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
}
out:
- /* Order HPTE updates vs. hpte_setup_done */
+ /* Order HPTE updates vs. mmu_ready */
smp_wmb();
- kvm->arch.hpte_setup_done = hpte_setup;
+ kvm->arch.mmu_ready = mmu_ready;
mutex_unlock(&kvm->lock);
if (err)
@@ -2002,6 +1994,10 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
struct kvm *kvm;
__be64 *hptp;
+ kvm = p->kvm;
+ if (kvm_is_radix(kvm))
+ return 0;
+
ret = mutex_lock_interruptible(&p->mutex);
if (ret)
return ret;
@@ -2024,7 +2020,6 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
}
}
- kvm = p->kvm;
i = p->hpt_index;
hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
@@ -2099,10 +2094,7 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */
- if (kvm_is_radix(vcpu->kvm))
- mmu->xlate = kvmppc_mmu_radix_xlate;
- else
- mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+ mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index c5d7435455f1..58618f644c56 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -474,26 +474,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
return ret;
}
-static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn, unsigned int order)
-{
- unsigned long i, limit;
- unsigned long *dp;
-
- if (!memslot->dirty_bitmap)
- return;
- limit = 1ul << order;
- if (limit < BITS_PER_LONG) {
- for (i = 0; i < limit; ++i)
- mark_page_dirty(kvm, gfn + i);
- return;
- }
- dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn);
- limit /= BITS_PER_LONG;
- for (i = 0; i < limit; ++i)
- *dp++ = ~0ul;
-}
-
/* Called with kvm->lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
@@ -508,12 +488,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift);
- if (old & _PAGE_DIRTY) {
- if (!shift)
- mark_page_dirty(kvm, gfn);
- else
- mark_pages_dirty(kvm, memslot,
- gfn, shift - PAGE_SHIFT);
+ if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
+ unsigned long npages = 1;
+ if (shift)
+ npages = 1ul << (shift - PAGE_SHIFT);
+ kvmppc_update_dirty_map(memslot, gfn, npages);
}
}
return 0;
@@ -579,20 +558,8 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map)
{
unsigned long i, j;
- unsigned long n, *p;
int npages;
- /*
- * Radix accumulates dirty bits in the first half of the
- * memslot's dirty_bitmap area, for when pages are paged
- * out or modified by the host directly. Pick up these
- * bits and add them to the map.
- */
- n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long);
- p = memslot->dirty_bitmap;
- for (i = 0; i < n; ++i)
- map[i] |= xchg(&p[i], 0);
-
for (i = 0; i < memslot->npages; i = j) {
npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
@@ -604,9 +571,10 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
* real address, if npages > 1 we can skip to i + npages.
*/
j = i + 1;
- if (npages)
- for (j = i; npages; ++j, --npages)
- __set_bit_le(j, map);
+ if (npages) {
+ set_dirty_bits(map, i, npages);
+ i = j + npages;
+ }
}
return 0;
}
@@ -694,6 +662,7 @@ void kvmppc_free_radix(struct kvm *kvm)
pgd_clear(pgd);
}
pgd_free(kvm->mm, kvm->arch.pgtable);
+ kvm->arch.pgtable = NULL;
}
static void pte_ctor(void *addr)
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 3589c4e3d49b..688722acd692 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -113,7 +113,7 @@ slb_do_enter:
/* Remove all SLB entries that are in use. */
- li r0, r0
+ li r0, 0
slbmte r0, r0
slbia
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 73bf1ebfa78f..fff62fdf1464 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -19,6 +19,7 @@
*/
#include <linux/kvm_host.h>
+#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
@@ -97,6 +98,10 @@ static int target_smt_mode;
module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
+static bool indep_threads_mode = true;
+module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
+
#ifdef CONFIG_KVM_XICS
static struct kernel_param_ops module_param_ops = {
.set = param_set_int,
@@ -114,6 +119,7 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+static void kvmppc_setup_partition_table(struct kvm *kvm);
static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
int *ip)
@@ -1732,9 +1738,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
* MMU mode (radix or HPT), unfortunately, but since we only support
* HPT guests on a HPT host so far, that isn't an impediment yet.
*/
-static int threads_per_vcore(void)
+static int threads_per_vcore(struct kvm *kvm)
{
- if (cpu_has_feature(CPU_FTR_ARCH_300))
+ if (kvm->arch.threads_indep)
return 1;
return threads_per_subcore;
}
@@ -1772,7 +1778,7 @@ static struct debugfs_timings_element {
{"cede", offsetof(struct kvm_vcpu, arch.cede_time)},
};
-#define N_TIMINGS (sizeof(timings) / sizeof(timings[0]))
+#define N_TIMINGS (ARRAY_SIZE(timings))
struct debugfs_timings_state {
struct kvm_vcpu *vcpu;
@@ -2117,15 +2123,6 @@ static int kvmppc_grab_hwthread(int cpu)
struct paca_struct *tpaca;
long timeout = 10000;
- /*
- * ISA v3.0 idle routines do not set hwthread_state or test
- * hwthread_req, so they can not grab idle threads.
- */
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- WARN(1, "KVM: can not control sibling threads\n");
- return -EBUSY;
- }
-
tpaca = &paca[cpu];
/* Ensure the thread won't go into the kernel if it wakes */
@@ -2160,12 +2157,10 @@ static void kvmppc_release_hwthread(int cpu)
struct paca_struct *tpaca;
tpaca = &paca[cpu];
+ tpaca->kvm_hstate.hwthread_req = 0;
tpaca->kvm_hstate.kvm_vcpu = NULL;
tpaca->kvm_hstate.kvm_vcore = NULL;
tpaca->kvm_hstate.kvm_split_mode = NULL;
- if (!cpu_has_feature(CPU_FTR_ARCH_300))
- tpaca->kvm_hstate.hwthread_req = 0;
-
}
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
@@ -2237,11 +2232,10 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
kvmppc_ipi_thread(cpu);
}
-static void kvmppc_wait_for_nap(void)
+static void kvmppc_wait_for_nap(int n_threads)
{
int cpu = smp_processor_id();
int i, loops;
- int n_threads = threads_per_vcore();
if (n_threads <= 1)
return;
@@ -2328,7 +2322,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
vc->vcore_state = VCORE_PREEMPT;
vc->pcpu = smp_processor_id();
- if (vc->num_threads < threads_per_vcore()) {
+ if (vc->num_threads < threads_per_vcore(vc->kvm)) {
spin_lock(&lp->lock);
list_add_tail(&vc->preempt_list, &lp->list);
spin_unlock(&lp->lock);
@@ -2366,7 +2360,7 @@ struct core_info {
/*
* This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
- * respectively in 2-way micro-threading (split-core) mode.
+ * respectively in 2-way micro-threading (split-core) mode on POWER8.
*/
static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
@@ -2382,7 +2376,14 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
static bool subcore_config_ok(int n_subcores, int n_threads)
{
- /* Can only dynamically split if unsplit to begin with */
+ /*
+ * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core
+ * mode, with one thread per subcore.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return n_subcores <= 4 && n_threads == 1;
+
+ /* On POWER8, can only dynamically split if unsplit to begin with */
if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
return false;
if (n_subcores > MAX_SUBCORES)
@@ -2413,6 +2414,11 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
if (!cpu_has_feature(CPU_FTR_ARCH_207S))
return false;
+ /* POWER9 currently requires all threads to be in the same MMU mode */
+ if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+ kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
+ return false;
+
if (n_threads < cip->max_subcore_threads)
n_threads = cip->max_subcore_threads;
if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
@@ -2638,6 +2644,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
int target_threads;
int controlled_threads;
int trap;
+ bool is_power8;
+ bool hpt_on_radix;
/*
* Remove from the list any threads that have a signal pending
@@ -2660,15 +2668,19 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* the number of threads per subcore, except on POWER9,
* where it's 1 because the threads are (mostly) independent.
*/
- controlled_threads = threads_per_vcore();
+ controlled_threads = threads_per_vcore(vc->kvm);
/*
* Make sure we are running on primary threads, and that secondary
* threads are offline. Also check if the number of threads in this
* guest are greater than the current system threads per guest.
+ * On POWER9, we need to be not in independent-threads mode if
+ * this is a HPT guest on a radix host.
*/
- if ((controlled_threads > 1) &&
- ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
+ hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm);
+ if (((controlled_threads > 1) &&
+ ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
+ (hpt_on_radix && vc->kvm->arch.threads_indep)) {
for_each_runnable_thread(i, vcpu, vc) {
vcpu->arch.ret = -EBUSY;
kvmppc_remove_runnable(vc, vcpu);
@@ -2731,32 +2743,51 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cmd_bit = stat_bit = 0;
split = core_info.n_subcores;
sip = NULL;
- if (split > 1) {
- /* threads_per_subcore must be MAX_SMT_THREADS (8) here */
- if (split == 2 && (dynamic_mt_modes & 2)) {
- cmd_bit = HID0_POWER8_1TO2LPAR;
- stat_bit = HID0_POWER8_2LPARMODE;
- } else {
- split = 4;
- cmd_bit = HID0_POWER8_1TO4LPAR;
- stat_bit = HID0_POWER8_4LPARMODE;
- }
- subcore_size = MAX_SMT_THREADS / split;
+ is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
+ && !cpu_has_feature(CPU_FTR_ARCH_300);
+
+ if (split > 1 || hpt_on_radix) {
sip = &split_info;
memset(&split_info, 0, sizeof(split_info));
- split_info.rpr = mfspr(SPRN_RPR);
- split_info.pmmar = mfspr(SPRN_PMMAR);
- split_info.ldbar = mfspr(SPRN_LDBAR);
- split_info.subcore_size = subcore_size;
for (sub = 0; sub < core_info.n_subcores; ++sub)
split_info.vc[sub] = core_info.vc[sub];
+
+ if (is_power8) {
+ if (split == 2 && (dynamic_mt_modes & 2)) {
+ cmd_bit = HID0_POWER8_1TO2LPAR;
+ stat_bit = HID0_POWER8_2LPARMODE;
+ } else {
+ split = 4;
+ cmd_bit = HID0_POWER8_1TO4LPAR;
+ stat_bit = HID0_POWER8_4LPARMODE;
+ }
+ subcore_size = MAX_SMT_THREADS / split;
+ split_info.rpr = mfspr(SPRN_RPR);
+ split_info.pmmar = mfspr(SPRN_PMMAR);
+ split_info.ldbar = mfspr(SPRN_LDBAR);
+ split_info.subcore_size = subcore_size;
+ } else {
+ split_info.subcore_size = 1;
+ if (hpt_on_radix) {
+ /* Use the split_info for LPCR/LPIDR changes */
+ split_info.lpcr_req = vc->lpcr;
+ split_info.lpidr_req = vc->kvm->arch.lpid;
+ split_info.host_lpcr = vc->kvm->arch.host_lpcr;
+ split_info.do_set = 1;
+ }
+ }
+
/* order writes to split_info before kvm_split_mode pointer */
smp_wmb();
}
- for (thr = 0; thr < controlled_threads; ++thr)
+
+ for (thr = 0; thr < controlled_threads; ++thr) {
+ paca[pcpu + thr].kvm_hstate.tid = thr;
+ paca[pcpu + thr].kvm_hstate.napping = 0;
paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+ }
- /* Initiate micro-threading (split-core) if required */
+ /* Initiate micro-threading (split-core) on POWER8 if required */
if (cmd_bit) {
unsigned long hid0 = mfspr(SPRN_HID0);
@@ -2775,7 +2806,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
/* Start all the threads */
active = 0;
for (sub = 0; sub < core_info.n_subcores; ++sub) {
- thr = subcore_thread_map[sub];
+ thr = is_power8 ? subcore_thread_map[sub] : sub;
thr0_done = false;
active |= 1 << thr;
pvc = core_info.vc[sub];
@@ -2802,18 +2833,20 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* the vcore pointer in the PACA of the secondaries.
*/
smp_mb();
- if (cmd_bit)
- split_info.do_nap = 1; /* ask secondaries to nap when done */
/*
* When doing micro-threading, poke the inactive threads as well.
* This gets them to the nap instruction after kvm_do_nap,
* which reduces the time taken to unsplit later.
+ * For POWER9 HPT guest on radix host, we need all the secondary
+ * threads woken up so they can do the LPCR/LPIDR change.
*/
- if (split > 1)
+ if (cmd_bit || hpt_on_radix) {
+ split_info.do_nap = 1; /* ask secondaries to nap when done */
for (thr = 1; thr < threads_per_subcore; ++thr)
if (!(active & (1 << thr)))
kvmppc_ipi_thread(pcpu + thr);
+ }
vc->vcore_state = VCORE_RUNNING;
preempt_disable();
@@ -2847,10 +2880,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
vc->vcore_state = VCORE_EXITING;
/* wait for secondary threads to finish writing their state to memory */
- kvmppc_wait_for_nap();
+ kvmppc_wait_for_nap(controlled_threads);
/* Return to whole-core mode if we split the core earlier */
- if (split > 1) {
+ if (cmd_bit) {
unsigned long hid0 = mfspr(SPRN_HID0);
unsigned long loops = 0;
@@ -2866,8 +2899,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cpu_relax();
++loops;
}
- split_info.do_nap = 0;
+ } else if (hpt_on_radix) {
+ /* Wait for all threads to have seen final sync */
+ for (thr = 1; thr < controlled_threads; ++thr) {
+ while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) {
+ HMT_low();
+ barrier();
+ }
+ HMT_medium();
+ }
}
+ split_info.do_nap = 0;
kvmppc_set_host_core(pcpu);
@@ -3208,6 +3250,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
unsigned long ebb_regs[3] = {}; /* shut up GCC */
unsigned long user_tar = 0;
unsigned int user_vrsave;
+ struct kvm *kvm;
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -3245,13 +3288,25 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
return -EINTR;
}
- atomic_inc(&vcpu->kvm->arch.vcpus_running);
- /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
+ kvm = vcpu->kvm;
+ atomic_inc(&kvm->arch.vcpus_running);
+ /* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
smp_mb();
- /* On the first time here, set up HTAB and VRMA */
- if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) {
- r = kvmppc_hv_setup_htab_rma(vcpu);
+ /* On the first time here, set up MMU if necessary */
+ if (!vcpu->kvm->arch.mmu_ready) {
+ mutex_lock(&kvm->lock);
+ r = 0;
+ if (!kvm->arch.mmu_ready) {
+ if (!kvm_is_radix(vcpu->kvm))
+ r = kvmppc_hv_setup_htab_rma(vcpu);
+ if (!r) {
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ kvmppc_setup_partition_table(kvm);
+ kvm->arch.mmu_ready = 1;
+ }
+ }
+ mutex_unlock(&kvm->lock);
if (r)
goto out;
}
@@ -3310,22 +3365,21 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
}
static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
- int linux_psize)
+ int shift, int sllp)
{
- struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
-
- if (!def->shift)
- return;
- (*sps)->page_shift = def->shift;
- (*sps)->slb_enc = def->sllp;
- (*sps)->enc[0].page_shift = def->shift;
- (*sps)->enc[0].pte_enc = def->penc[linux_psize];
+ (*sps)->page_shift = shift;
+ (*sps)->slb_enc = sllp;
+ (*sps)->enc[0].page_shift = shift;
+ (*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
/*
- * Add 16MB MPSS support if host supports it
+ * Add 16MB MPSS support (may get filtered out by userspace)
*/
- if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
- (*sps)->enc[1].page_shift = 24;
- (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+ if (shift != 24) {
+ int penc = kvmppc_pgsize_lp_encoding(shift, 24);
+ if (penc != -1) {
+ (*sps)->enc[1].page_shift = 24;
+ (*sps)->enc[1].pte_enc = penc;
+ }
}
(*sps)++;
}
@@ -3336,13 +3390,6 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
struct kvm_ppc_one_seg_page_size *sps;
/*
- * Since we don't yet support HPT guests on a radix host,
- * return an error if the host uses radix.
- */
- if (radix_enabled())
- return -EINVAL;
-
- /*
* POWER7, POWER8 and POWER9 all support 32 storage keys for data.
* POWER7 doesn't support keys for instruction accesses,
* POWER8 and POWER9 do.
@@ -3350,16 +3397,15 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
info->data_keys = 32;
info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
- info->flags = KVM_PPC_PAGE_SIZES_REAL;
- if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
- info->flags |= KVM_PPC_1T_SEGMENTS;
- info->slb_size = mmu_slb_size;
+ /* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
+ info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
+ info->slb_size = 32;
/* We only support these sizes for now, and no muti-size segments */
sps = &info->sps[0];
- kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
- kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
- kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
+ kvmppc_add_seg_page_size(&sps, 12, 0);
+ kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
+ kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
return 0;
}
@@ -3374,7 +3420,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int i, r;
unsigned long n;
- unsigned long *buf;
+ unsigned long *buf, *p;
struct kvm_vcpu *vcpu;
mutex_lock(&kvm->slots_lock);
@@ -3390,8 +3436,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
goto out;
/*
- * Use second half of bitmap area because radix accumulates
- * bits in the first half.
+ * Use second half of bitmap area because both HPT and radix
+ * accumulate bits in the first half.
*/
n = kvm_dirty_bitmap_bytes(memslot);
buf = memslot->dirty_bitmap + n / sizeof(long);
@@ -3404,6 +3450,16 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
if (r)
goto out;
+ /*
+ * We accumulate dirty bits in the first half of the
+ * memslot's dirty_bitmap area, for when pages are paged
+ * out or modified by the host directly. Pick up these
+ * bits and add them to the map.
+ */
+ p = memslot->dirty_bitmap;
+ for (i = 0; i < n / sizeof(long); ++i)
+ buf[i] |= xchg(&p[i], 0);
+
/* Harvest dirty bits from VPA and DTL updates */
/* Note: we never modify the SLB shadow buffer areas */
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -3435,15 +3491,6 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
unsigned long npages)
{
- /*
- * For now, if radix_enabled() then we only support radix guests,
- * and in that case we don't need the rmap array.
- */
- if (radix_enabled()) {
- slot->arch.rmap = NULL;
- return 0;
- }
-
slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
if (!slot->arch.rmap)
return -ENOMEM;
@@ -3464,8 +3511,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
const struct kvm_memory_slot *new)
{
unsigned long npages = mem->memory_size >> PAGE_SHIFT;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
/*
* If we are making a new memslot, it might make
@@ -3475,18 +3520,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
*/
if (npages)
atomic64_inc(&kvm->arch.mmio_update);
-
- if (npages && old->npages && !kvm_is_radix(kvm)) {
- /*
- * If modifying a memslot, reset all the rmap dirty bits.
- * If this is a new memslot, we don't need to do anything
- * since the rmap array starts out as all zeroes,
- * i.e. no pages are dirty.
- */
- slots = kvm_memslots(kvm);
- memslot = id_to_memslot(slots, mem->slot);
- kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
- }
}
/*
@@ -3542,6 +3575,10 @@ static void kvmppc_setup_partition_table(struct kvm *kvm)
mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
}
+/*
+ * Set up HPT (hashed page table) and RMA (real-mode area).
+ * Must be called with kvm->lock held.
+ */
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
{
int err = 0;
@@ -3553,10 +3590,6 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
unsigned long psize, porder;
int srcu_idx;
- mutex_lock(&kvm->lock);
- if (kvm->arch.hpte_setup_done)
- goto out; /* another vcpu beat us to it */
-
/* Allocate hashed page table (if not done already) and reset it */
if (!kvm->arch.hpt.virt) {
int order = KVM_DEFAULT_HPT_ORDER;
@@ -3615,18 +3648,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
/* the -4 is to account for senc values starting at 0x10 */
lpcr = senc << (LPCR_VRMASD_SH - 4);
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
- } else {
- kvmppc_setup_partition_table(kvm);
}
- /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
+ /* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
smp_wmb();
- kvm->arch.hpte_setup_done = 1;
err = 0;
out_srcu:
srcu_read_unlock(&kvm->srcu, srcu_idx);
out:
- mutex_unlock(&kvm->lock);
return err;
up_out:
@@ -3634,6 +3663,34 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
goto out_srcu;
}
+/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
+{
+ kvmppc_free_radix(kvm);
+ kvmppc_update_lpcr(kvm, LPCR_VPM1,
+ LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+ kvmppc_rmap_reset(kvm);
+ kvm->arch.radix = 0;
+ kvm->arch.process_table = 0;
+ return 0;
+}
+
+/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
+{
+ int err;
+
+ err = kvmppc_init_vm_radix(kvm);
+ if (err)
+ return err;
+
+ kvmppc_free_hpt(&kvm->arch.hpt);
+ kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
+ LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+ kvm->arch.radix = 1;
+ return 0;
+}
+
#ifdef CONFIG_KVM_XICS
/*
* Allocate a per-core structure for managing state about which cores are
@@ -3777,10 +3834,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
}
/*
- * For now, if the host uses radix, the guest must be radix.
+ * If the host uses radix, the guest starts out as radix.
*/
if (radix_enabled()) {
kvm->arch.radix = 1;
+ kvm->arch.mmu_ready = 1;
lpcr &= ~LPCR_VPM1;
lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
ret = kvmppc_init_vm_radix(kvm);
@@ -3800,7 +3858,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
* Work out how many sets the TLB has, for the use of
* the TLB invalidation loop in book3s_hv_rmhandlers.S.
*/
- if (kvm_is_radix(kvm))
+ if (radix_enabled())
kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */
else if (cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */
@@ -3812,10 +3870,12 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
/*
* Track that we now have a HV mode VM active. This blocks secondary
* CPU threads from coming online.
- * On POWER9, we only need to do this for HPT guests on a radix
- * host, which is not yet supported.
+ * On POWER9, we only need to do this if the "indep_threads_mode"
+ * module parameter has been set to N.
*/
- if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ kvm->arch.threads_indep = indep_threads_mode;
+ if (!kvm->arch.threads_indep)
kvm_hv_vm_activated();
/*
@@ -3855,7 +3915,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{
debugfs_remove_recursive(kvm->arch.debugfs_dir);
- if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ if (!kvm->arch.threads_indep)
kvm_hv_vm_deactivated();
kvmppc_free_vcores(kvm);
@@ -4190,6 +4250,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
{
unsigned long lpcr;
int radix;
+ int err;
/* If not on a POWER9, reject it */
if (!cpu_has_feature(CPU_FTR_ARCH_300))
@@ -4199,12 +4260,8 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
return -EINVAL;
- /* We can't change a guest to/from radix yet */
- radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
- if (radix != kvm_is_radix(kvm))
- return -EINVAL;
-
/* GR (guest radix) bit in process_table field must match */
+ radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
if (!!(cfg->process_table & PATB_GR) != radix)
return -EINVAL;
@@ -4212,15 +4269,40 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
if ((cfg->process_table & PRTS_MASK) > 24)
return -EINVAL;
+ /* We can change a guest to/from radix now, if the host is radix */
+ if (radix && !radix_enabled())
+ return -EINVAL;
+
mutex_lock(&kvm->lock);
+ if (radix != kvm_is_radix(kvm)) {
+ if (kvm->arch.mmu_ready) {
+ kvm->arch.mmu_ready = 0;
+ /* order mmu_ready vs. vcpus_running */
+ smp_mb();
+ if (atomic_read(&kvm->arch.vcpus_running)) {
+ kvm->arch.mmu_ready = 1;
+ err = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ if (radix)
+ err = kvmppc_switch_mmu_to_radix(kvm);
+ else
+ err = kvmppc_switch_mmu_to_hpt(kvm);
+ if (err)
+ goto out_unlock;
+ }
+
kvm->arch.process_table = cfg->process_table;
kvmppc_setup_partition_table(kvm);
lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
- mutex_unlock(&kvm->lock);
+ err = 0;
- return 0;
+ out_unlock:
+ mutex_unlock(&kvm->lock);
+ return err;
}
static struct kvmppc_ops kvm_ops_hv = {
@@ -4362,4 +4444,3 @@ module_exit(kvmppc_book3s_exit_hv);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");
-
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 90644db9d38e..49a2c7825e04 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -278,7 +278,8 @@ void kvmhv_commence_exit(int trap)
struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
int ptid = local_paca->kvm_hstate.ptid;
struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
- int me, ee, i;
+ int me, ee, i, t;
+ int cpu0;
/* Set our bit in the threads-exiting-guest map in the 0xff00
bits of vcore->entry_exit_map */
@@ -320,6 +321,22 @@ void kvmhv_commence_exit(int trap)
if ((ee >> 8) == 0)
kvmhv_interrupt_vcore(vc, ee);
}
+
+ /*
+ * On POWER9 when running a HPT guest on a radix host (sip != NULL),
+ * we have to interrupt inactive CPU threads to get them to
+ * restore the host LPCR value.
+ */
+ if (sip->lpcr_req) {
+ if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
+ vc = local_paca->kvm_hstate.kvm_vcore;
+ cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
+ for (t = 1; t < threads_per_core; ++t) {
+ if (sip->napped[t])
+ kvmhv_rm_send_ipi(cpu0 + t);
+ }
+ }
+ }
}
struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
@@ -529,6 +546,8 @@ static inline bool is_rm(void)
unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_xirr(vcpu);
@@ -541,6 +560,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
vcpu->arch.gpr[5] = get_tb();
if (xive_enabled()) {
if (is_rm())
@@ -554,6 +575,8 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_ipoll(vcpu, server);
@@ -567,6 +590,8 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
unsigned long mfrr)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_ipi(vcpu, server, mfrr);
@@ -579,6 +604,8 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_cppr(vcpu, cppr);
@@ -591,6 +618,8 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_eoi(vcpu, xirr);
@@ -601,3 +630,89 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
return xics_rm_h_eoi(vcpu, xirr);
}
#endif /* CONFIG_KVM_XICS */
+
+void kvmppc_bad_interrupt(struct pt_regs *regs)
+{
+ die("Bad interrupt in KVM entry/exit code", regs, SIGABRT);
+ panic("Bad KVM trap");
+}
+
+/*
+ * Functions used to switch LPCR HR and UPRT bits on all threads
+ * when entering and exiting HPT guests on a radix host.
+ */
+
+#define PHASE_REALMODE 1 /* in real mode */
+#define PHASE_SET_LPCR 2 /* have set LPCR */
+#define PHASE_OUT_OF_GUEST 4 /* have finished executing in guest */
+#define PHASE_RESET_LPCR 8 /* have reset LPCR to host value */
+
+#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
+
+static void wait_for_sync(struct kvm_split_mode *sip, int phase)
+{
+ int thr = local_paca->kvm_hstate.tid;
+
+ sip->lpcr_sync.phase[thr] |= phase;
+ phase = ALL(phase);
+ while ((sip->lpcr_sync.allphases & phase) != phase) {
+ HMT_low();
+ barrier();
+ }
+ HMT_medium();
+}
+
+void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
+{
+ unsigned long rb, set;
+
+ /* wait for every other thread to get to real mode */
+ wait_for_sync(sip, PHASE_REALMODE);
+
+ /* Set LPCR and LPIDR */
+ mtspr(SPRN_LPCR, sip->lpcr_req);
+ mtspr(SPRN_LPID, sip->lpidr_req);
+ isync();
+
+ /* Invalidate the TLB on thread 0 */
+ if (local_paca->kvm_hstate.tid == 0) {
+ sip->do_set = 0;
+ asm volatile("ptesync" : : : "memory");
+ for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) {
+ rb = TLBIEL_INVAL_SET_LPID +
+ (set << TLBIEL_INVAL_SET_SHIFT);
+ asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
+ "r" (rb), "r" (0));
+ }
+ asm volatile("ptesync" : : : "memory");
+ }
+
+ /* indicate that we have done so and wait for others */
+ wait_for_sync(sip, PHASE_SET_LPCR);
+ /* order read of sip->lpcr_sync.allphases vs. sip->do_set */
+ smp_rmb();
+}
+
+/*
+ * Called when a thread that has been in the guest needs
+ * to reload the host LPCR value - but only on POWER9 when
+ * running a HPT guest on a radix host.
+ */
+void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
+{
+ /* we're out of the guest... */
+ wait_for_sync(sip, PHASE_OUT_OF_GUEST);
+
+ mtspr(SPRN_LPID, 0);
+ mtspr(SPRN_LPCR, sip->host_lpcr);
+ isync();
+
+ if (local_paca->kvm_hstate.tid == 0) {
+ sip->do_restore = 0;
+ smp_wmb(); /* order store of do_restore vs. phase */
+ }
+
+ wait_for_sync(sip, PHASE_RESET_LPCR);
+ smp_mb();
+ local_paca->kvm_hstate.kvm_split_mode = NULL;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 4efe364f1188..26c11f678fbf 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -107,30 +107,50 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
-/* Update the changed page order field of an rmap entry */
-void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize)
+/* Update the dirty bitmap of a memslot */
+void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
+ unsigned long gfn, unsigned long psize)
{
- unsigned long order;
+ unsigned long npages;
- if (!psize)
+ if (!psize || !memslot->dirty_bitmap)
return;
- order = ilog2(psize);
- order <<= KVMPPC_RMAP_CHG_SHIFT;
- if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER))
- *rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order;
+ npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE;
+ gfn -= memslot->base_gfn;
+ set_dirty_bits_atomic(memslot->dirty_bitmap, gfn, npages);
+}
+EXPORT_SYMBOL_GPL(kvmppc_update_dirty_map);
+
+static void kvmppc_set_dirty_from_hpte(struct kvm *kvm,
+ unsigned long hpte_v, unsigned long hpte_gr)
+{
+ struct kvm_memory_slot *memslot;
+ unsigned long gfn;
+ unsigned long psize;
+
+ psize = kvmppc_actual_pgsz(hpte_v, hpte_gr);
+ gfn = hpte_rpn(hpte_gr, psize);
+ memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+ if (memslot && memslot->dirty_bitmap)
+ kvmppc_update_dirty_map(memslot, gfn, psize);
}
-EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
/* Returns a pointer to the revmap entry for the page mapped by a HPTE */
static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
- unsigned long hpte_gr)
+ unsigned long hpte_gr,
+ struct kvm_memory_slot **memslotp,
+ unsigned long *gfnp)
{
struct kvm_memory_slot *memslot;
unsigned long *rmap;
unsigned long gfn;
- gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr));
+ gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr));
memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+ if (memslotp)
+ *memslotp = memslot;
+ if (gfnp)
+ *gfnp = gfn;
if (!memslot)
return NULL;
@@ -147,10 +167,12 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
unsigned long ptel, head;
unsigned long *rmap;
unsigned long rcbits;
+ struct kvm_memory_slot *memslot;
+ unsigned long gfn;
rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
ptel = rev->guest_rpte |= rcbits;
- rmap = revmap_for_hpte(kvm, hpte_v, ptel);
+ rmap = revmap_for_hpte(kvm, hpte_v, ptel, &memslot, &gfn);
if (!rmap)
return;
lock_rmap(rmap);
@@ -169,7 +191,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
}
*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
if (rcbits & HPTE_R_C)
- kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r));
+ kvmppc_update_dirty_map(memslot, gfn,
+ kvmppc_actual_pgsz(hpte_v, hpte_r));
unlock_rmap(rmap);
}
@@ -193,7 +216,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
if (kvm_is_radix(kvm))
return H_FUNCTION;
- psize = hpte_page_size(pteh, ptel);
+ psize = kvmppc_actual_pgsz(pteh, ptel);
if (!psize)
return H_PARAMETER;
writing = hpte_is_writable(ptel);
@@ -797,7 +820,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
gr |= r & (HPTE_R_R | HPTE_R_C);
if (r & HPTE_R_R) {
kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
- rmap = revmap_for_hpte(kvm, v, gr);
+ rmap = revmap_for_hpte(kvm, v, gr, NULL, NULL);
if (rmap) {
lock_rmap(rmap);
*rmap |= KVMPPC_RMAP_REFERENCED;
@@ -819,7 +842,6 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
__be64 *hpte;
unsigned long v, r, gr;
struct revmap_entry *rev;
- unsigned long *rmap;
long ret = H_NOT_FOUND;
if (kvm_is_radix(kvm))
@@ -848,16 +870,9 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
r = be64_to_cpu(hpte[1]);
gr |= r & (HPTE_R_R | HPTE_R_C);
if (r & HPTE_R_C) {
- unsigned long psize = hpte_page_size(v, r);
hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
eieio();
- rmap = revmap_for_hpte(kvm, v, gr);
- if (rmap) {
- lock_rmap(rmap);
- *rmap |= KVMPPC_RMAP_CHANGED;
- kvmppc_update_rmap_change(rmap, psize);
- unlock_rmap(rmap);
- }
+ kvmppc_set_dirty_from_hpte(kvm, v, gr);
}
}
vcpu->arch.gpr[4] = gr;
@@ -1014,7 +1029,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
* Check the HPTE again, including base page size
*/
if ((v & valid) && (v & mask) == val &&
- hpte_base_page_size(v, r) == (1ul << pshift))
+ kvmppc_hpte_base_page_shift(v, r) == pshift)
/* Return with the HPTE still locked */
return (hash << 3) + (i >> 1);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index ec69fa45d5a2..7add18930e6d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -31,6 +31,7 @@
#include <asm/tm.h>
#include <asm/opal.h>
#include <asm/xive-regs.h>
+#include <asm/thread_info.h>
/* Sign-extend HDEC if not on POWER9 */
#define EXTEND_HDEC(reg) \
@@ -81,6 +82,19 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
RFI
kvmppc_call_hv_entry:
+BEGIN_FTR_SECTION
+ /* On P9, do LPCR setting, if necessary */
+ ld r3, HSTATE_SPLIT_MODE(r13)
+ cmpdi r3, 0
+ beq 46f
+ lwz r4, KVM_SPLIT_DO_SET(r3)
+ cmpwi r4, 0
+ beq 46f
+ bl kvmhv_p9_set_lpcr
+ nop
+46:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
ld r4, HSTATE_KVM_VCPU(r13)
bl kvmppc_hv_entry
@@ -149,11 +163,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
subf r4, r4, r3
mtspr SPRN_DEC, r4
-BEGIN_FTR_SECTION
/* hwthread_req may have got set by cede or no vcpu, so clear it */
li r0, 0
stb r0, HSTATE_HWTHREAD_REQ(r13)
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
/*
* For external interrupts we need to call the Linux
@@ -316,7 +328,6 @@ kvm_novcpu_exit:
* Relocation is off and most register values are lost.
* r13 points to the PACA.
* r3 contains the SRR1 wakeup value, SRR1 is trashed.
- * This is not used by ISAv3.0B processors.
*/
.globl kvm_start_guest
kvm_start_guest:
@@ -390,6 +401,7 @@ kvm_secondary_got_guest:
ld r6, HSTATE_SPLIT_MODE(r13)
cmpdi r6, 0
beq 63f
+BEGIN_FTR_SECTION
ld r0, KVM_SPLIT_RPR(r6)
mtspr SPRN_RPR, r0
ld r0, KVM_SPLIT_PMMAR(r6)
@@ -397,6 +409,15 @@ kvm_secondary_got_guest:
ld r0, KVM_SPLIT_LDBAR(r6)
mtspr SPRN_LDBAR, r0
isync
+FTR_SECTION_ELSE
+ /* On P9 we use the split_info for coordinating LPCR changes */
+ lwz r4, KVM_SPLIT_DO_SET(r6)
+ cmpwi r4, 0
+ beq 63f
+ mr r3, r6
+ bl kvmhv_p9_set_lpcr
+ nop
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
63:
/* Order load of vcpu after load of vcore */
lwsync
@@ -435,9 +456,6 @@ kvm_secondary_got_guest:
* While waiting we also need to check if we get given a vcpu to run.
*/
kvm_no_guest:
-BEGIN_FTR_SECTION
- twi 31,0,0
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lbz r3, HSTATE_HWTHREAD_REQ(r13)
cmpwi r3, 0
bne 53f
@@ -470,6 +488,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
ld r3, HSTATE_SPLIT_MODE(r13)
cmpdi r3, 0
beq kvm_no_guest
+ lwz r0, KVM_SPLIT_DO_SET(r3)
+ cmpwi r0, 0
+ bne kvmhv_do_set
+ lwz r0, KVM_SPLIT_DO_RESTORE(r3)
+ cmpwi r0, 0
+ bne kvmhv_do_restore
lbz r0, KVM_SPLIT_DO_NAP(r3)
cmpwi r0, 0
beq kvm_no_guest
@@ -482,6 +506,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
stb r0, HSTATE_HWTHREAD_STATE(r13)
b kvm_no_guest
+kvmhv_do_set:
+ /* Set LPCR, LPIDR etc. on P9 */
+ HMT_MEDIUM
+ bl kvmhv_p9_set_lpcr
+ nop
+ b kvm_no_guest
+
+kvmhv_do_restore:
+ HMT_MEDIUM
+ bl kvmhv_p9_restore_lpcr
+ nop
+ b kvm_no_guest
+
/*
* Here the primary thread is trying to return the core to
* whole-core mode, so we need to nap.
@@ -519,8 +556,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* Set kvm_split_mode.napped[tid] = 1 */
ld r3, HSTATE_SPLIT_MODE(r13)
li r0, 1
- lhz r4, PACAPACAINDEX(r13)
- clrldi r4, r4, 61 /* micro-threading => P8 => 8 threads/core */
+ lbz r4, HSTATE_TID(r13)
addi r4, r4, KVM_SPLIT_NAPPED
stbx r0, r3, r4
/* Check the do_nap flag again after setting napped[] */
@@ -1914,10 +1950,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
19: lis r8,0x7fff /* MAX_INT@h */
mtspr SPRN_HDEC,r8
-16: ld r8,KVM_HOST_LPCR(r4)
+16:
+BEGIN_FTR_SECTION
+ /* On POWER9 with HPT-on-radix we need to wait for all other threads */
+ ld r3, HSTATE_SPLIT_MODE(r13)
+ cmpdi r3, 0
+ beq 47f
+ lwz r8, KVM_SPLIT_DO_RESTORE(r3)
+ cmpwi r8, 0
+ beq 47f
+ stw r12, STACK_SLOT_TRAP(r1)
+ bl kvmhv_p9_restore_lpcr
+ nop
+ lwz r12, STACK_SLOT_TRAP(r1)
+ b 48f
+47:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+ ld r8,KVM_HOST_LPCR(r4)
mtspr SPRN_LPCR,r8
isync
-
+48:
/* load host SLB entries */
BEGIN_MMU_FTR_SECTION
b 0f
@@ -2543,10 +2595,8 @@ kvm_do_nap:
clrrdi r0, r0, 1
mtspr SPRN_CTRLT, r0
-BEGIN_FTR_SECTION
li r0,1
stb r0,HSTATE_HWTHREAD_REQ(r13)
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
mfspr r5,SPRN_LPCR
ori r5,r5,LPCR_PECE0 | LPCR_PECE1
BEGIN_FTR_SECTION
@@ -3134,10 +3184,139 @@ kvmppc_restore_tm:
/*
* We come here if we get any exception or interrupt while we are
* executing host real mode code while in guest MMU context.
- * For now just spin, but we should do something better.
+ * r12 is (CR << 32) | vector
+ * r13 points to our PACA
+ * r12 is saved in HSTATE_SCRATCH0(r13)
+ * ctr is saved in HSTATE_SCRATCH1(r13) if RELOCATABLE
+ * r9 is saved in HSTATE_SCRATCH2(r13)
+ * r13 is saved in HSPRG1
+ * cfar is saved in HSTATE_CFAR(r13)
+ * ppr is saved in HSTATE_PPR(r13)
*/
kvmppc_bad_host_intr:
+ /*
+ * Switch to the emergency stack, but start half-way down in
+ * case we were already on it.
+ */
+ mr r9, r1
+ std r1, PACAR1(r13)
+ ld r1, PACAEMERGSP(r13)
+ subi r1, r1, THREAD_SIZE/2 + INT_FRAME_SIZE
+ std r9, 0(r1)
+ std r0, GPR0(r1)
+ std r9, GPR1(r1)
+ std r2, GPR2(r1)
+ SAVE_4GPRS(3, r1)
+ SAVE_2GPRS(7, r1)
+ srdi r0, r12, 32
+ clrldi r12, r12, 32
+ std r0, _CCR(r1)
+ std r12, _TRAP(r1)
+ andi. r0, r12, 2
+ beq 1f
+ mfspr r3, SPRN_HSRR0
+ mfspr r4, SPRN_HSRR1
+ mfspr r5, SPRN_HDAR
+ mfspr r6, SPRN_HDSISR
+ b 2f
+1: mfspr r3, SPRN_SRR0
+ mfspr r4, SPRN_SRR1
+ mfspr r5, SPRN_DAR
+ mfspr r6, SPRN_DSISR
+2: std r3, _NIP(r1)
+ std r4, _MSR(r1)
+ std r5, _DAR(r1)
+ std r6, _DSISR(r1)
+ ld r9, HSTATE_SCRATCH2(r13)
+ ld r12, HSTATE_SCRATCH0(r13)
+ GET_SCRATCH0(r0)
+ SAVE_4GPRS(9, r1)
+ std r0, GPR13(r1)
+ SAVE_NVGPRS(r1)
+ ld r5, HSTATE_CFAR(r13)
+ std r5, ORIG_GPR3(r1)
+ mflr r3
+#ifdef CONFIG_RELOCATABLE
+ ld r4, HSTATE_SCRATCH1(r13)
+#else
+ mfctr r4
+#endif
+ mfxer r5
+ lbz r6, PACASOFTIRQEN(r13)
+ std r3, _LINK(r1)
+ std r4, _CTR(r1)
+ std r5, _XER(r1)
+ std r6, SOFTE(r1)
+ ld r2, PACATOC(r13)
+ LOAD_REG_IMMEDIATE(3, 0x7265677368657265)
+ std r3, STACK_FRAME_OVERHEAD-16(r1)
+
+ /*
+ * On POWER9 do a minimal restore of the MMU and call C code,
+ * which will print a message and panic.
+ * XXX On POWER7 and POWER8, we just spin here since we don't
+ * know what the other threads are doing (and we don't want to
+ * coordinate with them) - but at least we now have register state
+ * in memory that we might be able to look at from another CPU.
+ */
+BEGIN_FTR_SECTION
b .
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+ ld r9, HSTATE_KVM_VCPU(r13)
+ ld r10, VCPU_KVM(r9)
+
+ li r0, 0
+ mtspr SPRN_AMR, r0
+ mtspr SPRN_IAMR, r0
+ mtspr SPRN_CIABR, r0
+ mtspr SPRN_DAWRX, r0
+
+ /* Flush the ERAT on radix P9 DD1 guest exit */
+BEGIN_FTR_SECTION
+ PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
+
+BEGIN_MMU_FTR_SECTION
+ b 4f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
+
+ slbmte r0, r0
+ slbia
+ ptesync
+ ld r8, PACA_SLBSHADOWPTR(r13)
+ .rept SLB_NUM_BOLTED
+ li r3, SLBSHADOW_SAVEAREA
+ LDX_BE r5, r8, r3
+ addi r3, r3, 8
+ LDX_BE r6, r8, r3
+ andis. r7, r5, SLB_ESID_V@h
+ beq 3f
+ slbmte r6, r5
+3: addi r8, r8, 16
+ .endr
+
+4: lwz r7, KVM_HOST_LPID(r10)
+ mtspr SPRN_LPID, r7
+ mtspr SPRN_PID, r0
+ ld r8, KVM_HOST_LPCR(r10)
+ mtspr SPRN_LPCR, r8
+ isync
+ li r0, KVM_GUEST_MODE_NONE
+ stb r0, HSTATE_IN_GUEST(r13)
+
+ /*
+ * Turn on the MMU and jump to C code
+ */
+ bcl 20, 31, .+4
+5: mflr r3
+ addi r3, r3, 9f - 5b
+ ld r4, PACAKMSR(r13)
+ mtspr SPRN_SRR0, r3
+ mtspr SPRN_SRR1, r4
+ rfid
+9: addi r3, r1, STACK_FRAME_OVERHEAD
+ bl kvmppc_bad_interrupt
+ b 9b
/*
* This mimics the MSR transition on IRQ delivery. The new guest MSR is taken
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 69a09444d46e..d0dc8624198f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1326,12 +1326,22 @@ static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu,
kvmppc_set_pvr_pr(vcpu, sregs->pvr);
vcpu3s->sdr1 = sregs->u.s.sdr1;
+#ifdef CONFIG_PPC_BOOK3S_64
if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+ /* Flush all SLB entries */
+ vcpu->arch.mmu.slbmte(vcpu, 0, 0);
+ vcpu->arch.mmu.slbia(vcpu);
+
for (i = 0; i < 64; i++) {
- vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
- sregs->u.s.ppc64.slb[i].slbe);
+ u64 rb = sregs->u.s.ppc64.slb[i].slbe;
+ u64 rs = sregs->u.s.ppc64.slb[i].slbv;
+
+ if (rb & SLB_ESID_V)
+ vcpu->arch.mmu.slbmte(vcpu, rs, rb);
}
- } else {
+ } else
+#endif
+ {
for (i = 0; i < 16; i++) {
vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
}
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 8a4205fa774f..dae3be5ff42b 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -419,6 +419,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
case H_PROTECT:
case H_BULK_REMOVE:
case H_PUT_TCE:
+ case H_PUT_TCE_INDIRECT:
+ case H_STUFF_TCE:
case H_CEDE:
case H_LOGICAL_CI_LOAD:
case H_LOGICAL_CI_STORE:
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index c6c734424c70..423b21393bc9 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -377,7 +377,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
start = vma->vm_pgoff;
end = start +
- ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
+ vma_pages(vma);
pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 3480faaf1ef8..a0b7f094de78 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -590,8 +590,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!(hv_enabled && radix_enabled());
break;
case KVM_CAP_PPC_MMU_HASH_V3:
- r = !!(hv_enabled && !radix_enabled() &&
- cpu_has_feature(CPU_FTR_ARCH_300));
+ r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
break;
#endif
case KVM_CAP_SYNC_MMU:
@@ -644,8 +643,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
#endif
case KVM_CAP_PPC_HTM:
- r = cpu_has_feature(CPU_FTR_TM_COMP) &&
- is_kvmppc_hv_enabled(kvm);
+ r = is_kvmppc_hv_enabled(kvm) &&
+ (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM_COMP);
break;
default:
r = 0;
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index fa2558e12024..ad38c5e918ec 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -225,6 +225,8 @@ struct x86_emulate_ops {
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
+ int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+
};
typedef u32 __attribute__((vector_size(16))) sse128_t;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c73e493adf07..7233445a20bd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1061,6 +1061,11 @@ struct kvm_x86_ops {
void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+ int (*smi_allowed)(struct kvm_vcpu *vcpu);
+ int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
+ int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
+ int (*enable_smi_window)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
@@ -1426,4 +1431,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
#endif
}
+#define put_smstate(type, buf, offset, val) \
+ *(type *)((buf) + (offset) - 0x7e00) = val
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index caec8417539f..8b6780751132 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -70,11 +70,11 @@
#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
-#define SECONDARY_EXEC_RDRAND 0x00000800
+#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
-#define SECONDARY_EXEC_RDSEED 0x00010000
+#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
#define SECONDARY_EXEC_TSC_SCALING 0x02000000
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d90cdc77e077..8079d141792a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2591,6 +2591,15 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
smbase = ctxt->ops->get_smbase(ctxt);
+
+ /*
+ * Give pre_leave_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. enter guest mode) before loading state from the SMM
+ * state-save area.
+ */
+ if (ctxt->ops->pre_leave_smm(ctxt, smbase))
+ return X86EMUL_UNHANDLEABLE;
+
if (emulator_has_longmode(ctxt))
ret = rsm_load_state_64(ctxt, smbase + 0x8000);
else
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 69c5612be786..a778f1ae2927 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1301,14 +1301,42 @@ static void update_divide_count(struct kvm_lapic *apic)
apic->divide_count);
}
+static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
+{
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_ratelimited(
+ "kvm: vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+}
+
static void apic_update_lvtt(struct kvm_lapic *apic)
{
u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask;
if (apic->lapic_timer.timer_mode != timer_mode) {
+ if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+ APIC_LVT_TIMER_TSCDEADLINE)) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ apic->lapic_timer.period = 0;
+ apic->lapic_timer.tscdeadline = 0;
+ }
apic->lapic_timer.timer_mode = timer_mode;
- hrtimer_cancel(&apic->lapic_timer.timer);
+ limit_periodic_timer_frequency(apic);
}
}
@@ -1430,6 +1458,30 @@ static void start_sw_period(struct kvm_lapic *apic)
HRTIMER_MODE_ABS_PINNED);
}
+static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+{
+ ktime_t now, remaining;
+ u64 ns_remaining_old, ns_remaining_new;
+
+ apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+ limit_periodic_timer_frequency(apic);
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns_remaining_old = ktime_to_ns(remaining);
+ ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+ apic->divide_count, old_divisor);
+
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+ nsec_to_cycles(apic->vcpu, ns_remaining_old);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+}
+
static bool set_target_expiration(struct kvm_lapic *apic)
{
ktime_t now;
@@ -1439,27 +1491,13 @@ static bool set_target_expiration(struct kvm_lapic *apic)
apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count;
- if (!apic->lapic_timer.period)
+ if (!apic->lapic_timer.period) {
+ apic->lapic_timer.tscdeadline = 0;
return false;
-
- /*
- * Do not allow the guest to program periodic timers with small
- * interval, since the hrtimers are not throttled by the host
- * scheduler.
- */
- if (apic_lvtt_period(apic)) {
- s64 min_period = min_timer_period_us * 1000LL;
-
- if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
- "lapic timer period limited to %lld ns\n",
- apic->vcpu->vcpu_id,
- apic->lapic_timer.period, min_period);
- apic->lapic_timer.period = min_period;
- }
}
+ limit_periodic_timer_frequency(apic);
+
apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
PRIx64 ", "
"timer initial count 0x%x, period %lldns, "
@@ -1515,6 +1553,9 @@ static bool start_hv_timer(struct kvm_lapic *apic)
if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
return false;
+ if (!ktimer->tscdeadline)
+ return false;
+
r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
if (r < 0)
return false;
@@ -1738,13 +1779,21 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
start_apic_timer(apic);
break;
- case APIC_TDCR:
+ case APIC_TDCR: {
+ uint32_t old_divisor = apic->divide_count;
+
if (val & 4)
apic_debug("KVM_WRITE:TDCR %x\n", val);
kvm_lapic_set_reg(apic, APIC_TDCR, val);
update_divide_count(apic);
+ if (apic->divide_count != old_divisor &&
+ apic->lapic_timer.period) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_target_expiration(apic, old_divisor);
+ restart_apic_timer(apic);
+ }
break;
-
+ }
case APIC_ESR:
if (apic_x2apic_mode(apic) && val != 0) {
apic_debug("KVM_WRITE:ESR not zero %x\n", val);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 106d4a029a8a..0b481cc9c725 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -150,6 +150,20 @@ module_param(dbg, bool, 0644);
/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3
+/*
+ * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ *
+ * For handle_mmio_page_fault only:
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ */
+enum {
+ RET_PF_RETRY = 0,
+ RET_PF_EMULATE = 1,
+ RET_PF_INVALID = 2,
+};
+
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
@@ -2424,7 +2438,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
- return __shadow_walk_next(iterator, *iterator->sptep);
+ __shadow_walk_next(iterator, *iterator->sptep);
}
static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
@@ -2794,13 +2808,13 @@ done:
return ret;
}
-static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
- bool speculative, bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
- bool emulate = false;
+ int ret = RET_PF_RETRY;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2830,12 +2844,12 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
- emulate = true;
+ ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (unlikely(is_mmio_spte(*sptep)))
- emulate = true;
+ ret = RET_PF_EMULATE;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
@@ -2855,7 +2869,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
kvm_release_pfn_clean(pfn);
- return emulate;
+ return ret;
}
static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2994,14 +3008,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
* caused mmio page fault and treat it as mmio access.
- * Return 1 to tell kvm to emulate it.
*/
if (pfn == KVM_PFN_ERR_RO_FAULT)
- return 1;
+ return RET_PF_EMULATE;
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
- return 0;
+ return RET_PF_RETRY;
}
return -EFAULT;
@@ -3286,13 +3299,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
}
if (fast_page_fault(vcpu, v, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
return r;
@@ -3312,7 +3325,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
@@ -3659,54 +3672,38 @@ exit:
return reserved;
}
-/*
- * Return values of handle_mmio_page_fault:
- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
- * directly.
- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
- * fault path update the mmio spte.
- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
- */
-enum {
- RET_MMIO_PF_EMULATE = 1,
- RET_MMIO_PF_INVALID = 2,
- RET_MMIO_PF_RETRY = 0,
- RET_MMIO_PF_BUG = -1
-};
-
static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
if (mmio_info_in_cache(vcpu, addr, direct))
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
if (WARN_ON(reserved))
- return RET_MMIO_PF_BUG;
+ return -EINVAL;
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
if (!check_mmio_spte(vcpu, spte))
- return RET_MMIO_PF_INVALID;
+ return RET_PF_INVALID;
if (direct)
addr = 0;
trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
}
/*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
- return RET_MMIO_PF_RETRY;
+ return RET_PF_RETRY;
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
@@ -3756,7 +3753,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3820,8 +3817,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
- u64 fault_address, char *insn, int insn_len,
- bool need_unprotect)
+ u64 fault_address, char *insn, int insn_len)
{
int r = 1;
@@ -3829,7 +3825,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
default:
trace_kvm_page_fault(fault_address, error_code);
- if (need_unprotect && kvm_event_needs_reinjection(vcpu))
+ if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
@@ -3876,7 +3872,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3893,13 +3889,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
}
if (fast_page_fault(vcpu, gpa, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
return r;
@@ -3919,7 +3915,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -3974,19 +3970,19 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
unsigned level, unsigned gpte)
{
/*
- * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
- * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
- * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
- */
- gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
-
- /*
* The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
* If it is clear, there are no large pages at this level, so clear
* PT_PAGE_SIZE_MASK in gpte if that is the case.
*/
gpte &= level - mmu->last_nonleaf_level;
+ /*
+ * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
+ * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+ * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
return gpte & PT_PAGE_SIZE_MASK;
}
@@ -4555,6 +4551,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
+ update_last_nonleaf_level(vcpu, context);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
@@ -4917,25 +4914,25 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
vcpu->arch.gpa_val = cr2;
}
+ r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, cr2, direct);
- if (r == RET_MMIO_PF_EMULATE) {
+ if (r == RET_PF_EMULATE) {
emulation_type = 0;
goto emulate;
}
- if (r == RET_MMIO_PF_RETRY)
- return 1;
- if (r < 0)
- return r;
- /* Must be RET_MMIO_PF_INVALID. */
}
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
- false);
+ if (r == RET_PF_INVALID) {
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+ false);
+ WARN_ON(r == RET_PF_INVALID);
+ }
+
+ if (r == RET_PF_RETRY)
+ return 1;
if (r < 0)
return r;
- if (!r)
- return 1;
/*
* Before emulating the instruction, check if the error code
@@ -4992,8 +4989,7 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu.pae_root);
- if (vcpu->arch.mmu.lm_root != NULL)
- free_page((unsigned long)vcpu->arch.mmu.lm_root);
+ free_page((unsigned long)vcpu->arch.mmu.lm_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -5463,10 +5459,8 @@ static struct shrinker mmu_shrinker = {
static void mmu_destroy_caches(void)
{
- if (pte_list_desc_cache)
- kmem_cache_destroy(pte_list_desc_cache);
- if (mmu_page_header_cache)
- kmem_cache_destroy(mmu_page_header_cache);
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
}
int kvm_mmu_module_init(void)
@@ -5475,13 +5469,13 @@ int kvm_mmu_module_init(void)
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
- 0, 0, NULL);
+ 0, SLAB_ACCOUNT, NULL);
if (!pte_list_desc_cache)
goto nomem;
mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
+ 0, SLAB_ACCOUNT, NULL);
if (!mmu_page_header_cache)
goto nomem;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 64a2dbd2b1af..1092302aa16a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -65,8 +65,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
- u64 fault_address, char *insn, int insn_len,
- bool need_unprotect);
+ u64 fault_address, char *insn, int insn_len);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 86b68dc5a649..5abae72266b7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -334,10 +334,11 @@ retry_walk:
--walker->level;
index = PT_INDEX(addr, walker->level);
-
table_gfn = gpte_to_gfn(pte);
offset = index * sizeof(pt_element_t);
pte_gpa = gfn_to_gpa(table_gfn) + offset;
+
+ BUG_ON(walker->level < 1);
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
@@ -592,7 +593,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, emulate;
+ int top_level, ret;
direct_access = gw->pte_access;
@@ -658,15 +659,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
clear_sp_write_flooding_count(it.sptep);
- emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
- it.level, gw->gfn, pfn, prefault, map_writable);
+ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+ it.level, gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
- return emulate;
+ return ret;
out_gpte_changed:
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
/*
@@ -761,12 +762,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!prefault)
inject_page_fault(vcpu, &walker.fault);
- return 0;
+ return RET_PF_RETRY;
}
if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
shadow_page_table_clear_flood(vcpu, addr);
- return 1;
+ return RET_PF_EMULATE;
}
vcpu->arch.write_fault_to_shadow_pgtable = false;
@@ -788,7 +789,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
&map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
return r;
@@ -833,7 +834,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0e68f0b3cbf7..b71daed3cca2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1034,15 +1034,12 @@ static int avic_ga_log_notifier(u32 ga_tag)
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
- if (!vcpu)
- return 0;
-
/* Note:
* At this point, the IOMMU should have already set the pending
* bit in the vAPIC backing page. So, we just need to schedule
* in the vcpu.
*/
- if (vcpu->mode == OUTSIDE_GUEST_MODE)
+ if (vcpu)
kvm_vcpu_wake_up(vcpu);
return 0;
@@ -2144,7 +2141,18 @@ static int pf_interception(struct vcpu_svm *svm)
return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
svm->vmcb->control.insn_bytes,
- svm->vmcb->control.insn_len, !npt_enabled);
+ svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct vcpu_svm *svm)
+{
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ svm->vmcb->control.insn_bytes,
+ svm->vmcb->control.insn_len);
}
static int db_interception(struct vcpu_svm *svm)
@@ -2916,70 +2924,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
return true;
}
-static bool nested_svm_vmrun(struct vcpu_svm *svm)
+static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+ struct vmcb *nested_vmcb, struct page *page)
{
- struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
- struct page *page;
- u64 vmcb_gpa;
-
- vmcb_gpa = svm->vmcb->save.rax;
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return false;
-
- if (!nested_vmcb_checks(nested_vmcb)) {
- nested_vmcb->control.exit_code = SVM_EXIT_ERR;
- nested_vmcb->control.exit_code_hi = 0;
- nested_vmcb->control.exit_info_1 = 0;
- nested_vmcb->control.exit_info_2 = 0;
-
- nested_svm_unmap(page);
-
- return false;
- }
-
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
- nested_vmcb->save.rip,
- nested_vmcb->control.int_ctl,
- nested_vmcb->control.event_inj,
- nested_vmcb->control.nested_ctl);
-
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
- nested_vmcb->control.intercept_cr >> 16,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept);
-
- /* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- /*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
- */
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
- copy_vmcb_control_area(hsave, vmcb);
-
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
else
@@ -3072,6 +3019,73 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
enable_gif(svm);
mark_all_dirty(svm->vmcb);
+}
+
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+ u64 vmcb_gpa;
+
+ vmcb_gpa = svm->vmcb->save.rax;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return false;
+
+ if (!nested_vmcb_checks(nested_vmcb)) {
+ nested_vmcb->control.exit_code = SVM_EXIT_ERR;
+ nested_vmcb->control.exit_code_hi = 0;
+ nested_vmcb->control.exit_info_1 = 0;
+ nested_vmcb->control.exit_info_2 = 0;
+
+ nested_svm_unmap(page);
+
+ return false;
+ }
+
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+ nested_vmcb->save.rip,
+ nested_vmcb->control.int_ctl,
+ nested_vmcb->control.event_inj,
+ nested_vmcb->control.nested_ctl);
+
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept);
+
+ /* Clear internal status */
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ /*
+ * Save the old vmcb, so we don't need to pick what we save, but can
+ * restore everything when a VMEXIT occurs
+ */
+ hsave->save.es = vmcb->save.es;
+ hsave->save.cs = vmcb->save.cs;
+ hsave->save.ss = vmcb->save.ss;
+ hsave->save.ds = vmcb->save.ds;
+ hsave->save.gdtr = vmcb->save.gdtr;
+ hsave->save.idtr = vmcb->save.idtr;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
+ hsave->save.rip = kvm_rip_read(&svm->vcpu);
+ hsave->save.rsp = vmcb->save.rsp;
+ hsave->save.rax = vmcb->save.rax;
+ if (npt_enabled)
+ hsave->save.cr3 = vmcb->save.cr3;
+ else
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+
+ copy_vmcb_control_area(hsave, vmcb);
+
+ enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
return true;
}
@@ -3173,7 +3187,7 @@ static int stgi_interception(struct vcpu_svm *svm)
/*
* If VGIF is enabled, the STGI intercept is only added to
- * detect the opening of the NMI window; remove it now.
+ * detect the opening of the SMI/NMI window; remove it now.
*/
if (vgif_enabled(svm))
clr_intercept(svm, INTERCEPT_STGI);
@@ -4131,7 +4145,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MONITOR] = monitor_interception,
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
- [SVM_EXIT_NPF] = pf_interception,
+ [SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = emulate_on_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@@ -5393,6 +5407,88 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
vcpu->arch.mcg_cap &= 0x1ff;
}
+static int svm_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Per APM Vol.2 15.22.2 "Response to SMI" */
+ if (!gif_set(svm))
+ return 0;
+
+ if (is_guest_mode(&svm->vcpu) &&
+ svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
+ /* TODO: Might need to set exit_info_1 and exit_info_2 here */
+ svm->vmcb->control.exit_code = SVM_EXIT_SMI;
+ svm->nested.exit_required = true;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ /* FED8h - SVM Guest */
+ put_smstate(u64, smstate, 0x7ed8, 1);
+ /* FEE0h - SVM Guest VMCB Physical Address */
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ ret = nested_svm_vmexit(svm);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *nested_vmcb;
+ struct page *page;
+ struct {
+ u64 guest;
+ u64 vmcb;
+ } svm_state_save;
+ int ret;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
+ sizeof(svm_state_save));
+ if (ret)
+ return ret;
+
+ if (svm_state_save.guest) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
+ if (nested_vmcb)
+ enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
+ else
+ ret = 1;
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ }
+ return ret;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!gif_set(svm)) {
+ if (vgif_enabled(svm))
+ set_intercept(svm, INTERCEPT_STGI);
+ /* STGI will cause a vm exit */
+ return 1;
+ }
+ return 0;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -5503,6 +5599,11 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.deliver_posted_interrupt = svm_deliver_avic_intr,
.update_pi_irte = svm_update_pi_irte,
.setup_mce = svm_setup_mce,
+
+ .smi_allowed = svm_smi_allowed,
+ .pre_enter_smm = svm_pre_enter_smm,
+ .pre_leave_smm = svm_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a2b804e10c95..e6c8ffa84968 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -486,6 +486,14 @@ struct nested_vmx {
u64 nested_vmx_cr4_fixed1;
u64 nested_vmx_vmcs_enum;
u64 nested_vmx_vmfunc_controls;
+
+ /* SMM related state */
+ struct {
+ /* in VMX operation on SMM entry? */
+ bool vmxon;
+ /* in guest mode on SMM entry? */
+ bool guest_mode;
+ } smm;
};
#define POSTED_INTR_ON 0
@@ -900,16 +908,13 @@ static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
-static int alloc_identity_pagetable(struct kvm *kvm);
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -1598,18 +1603,15 @@ static inline void vpid_sync_context(int vpid)
static inline void ept_sync_global(void)
{
- if (cpu_has_vmx_invept_global())
- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}
static inline void ept_sync_context(u64 eptp)
{
- if (enable_ept) {
- if (cpu_has_vmx_invept_context())
- __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
- else
- ept_sync_global();
- }
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ else
+ ept_sync_global();
}
static __always_inline void vmcs_check16(unsigned long field)
@@ -2831,8 +2833,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_PML;
vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
}
- } else
- vmx->nested.nested_vmx_ept_caps = 0;
+ }
if (cpu_has_vmx_vmfunc()) {
vmx->nested.nested_vmx_secondary_ctls_high |=
@@ -2841,8 +2842,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* Advertise EPTP switching unconditionally
* since we emulate it
*/
- vmx->nested.nested_vmx_vmfunc_controls =
- VMX_VMFUNC_EPTP_SWITCHING;
+ if (enable_ept)
+ vmx->nested.nested_vmx_vmfunc_controls =
+ VMX_VMFUNC_EPTP_SWITCHING;
}
/*
@@ -2856,8 +2858,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_VPID;
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
VMX_VPID_EXTENT_SUPPORTED_MASK;
- } else
- vmx->nested.nested_vmx_vpid_caps = 0;
+ }
if (enable_unrestricted_guest)
vmx->nested.nested_vmx_secondary_ctls_high |=
@@ -3544,7 +3545,8 @@ static int hardware_enable(void)
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
kvm_cpu_vmxon(phys_addr);
- ept_sync_global();
+ if (enable_ept)
+ ept_sync_global();
return 0;
}
@@ -3657,8 +3659,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_RDSEED |
- SECONDARY_EXEC_RDRAND |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_ENABLE_VMFUNC;
@@ -3679,14 +3681,25 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+ &vmx_capability.ept, &vmx_capability.vpid);
+
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- vmx_capability.ept, vmx_capability.vpid);
+ } else if (vmx_capability.ept) {
+ vmx_capability.ept = 0;
+ pr_warn_once("EPT CAP should not exist if not support "
+ "1-setting enable EPT VM-execution control\n");
+ }
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
+ vmx_capability.vpid) {
+ vmx_capability.vpid = 0;
+ pr_warn_once("VPID CAP should not exist if not support "
+ "1-setting enable VPID VM-execution control\n");
}
min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
@@ -4781,18 +4794,18 @@ static int init_rmode_identity_map(struct kvm *kvm)
kvm_pfn_t identity_map_pfn;
u32 tmp;
- if (!enable_ept)
- return 0;
-
/* Protect kvm->arch.ept_identity_pagetable_done. */
mutex_lock(&kvm->slots_lock);
if (likely(kvm->arch.ept_identity_pagetable_done))
goto out2;
+ if (!kvm->arch.ept_identity_map_addr)
+ kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
- r = alloc_identity_pagetable(kvm);
+ r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm->arch.ept_identity_map_addr, PAGE_SIZE);
if (r < 0)
goto out2;
@@ -4864,20 +4877,6 @@ out:
return r;
}
-static int alloc_identity_pagetable(struct kvm *kvm)
-{
- /* Called with kvm->slots_lock held. */
-
- int r = 0;
-
- BUG_ON(kvm->arch.ept_identity_pagetable_done);
-
- r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
- kvm->arch.ept_identity_map_addr, PAGE_SIZE);
-
- return r;
-}
-
static int allocate_vpid(void)
{
int vpid;
@@ -5282,13 +5281,13 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
static bool vmx_rdrand_supported(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDRAND;
+ SECONDARY_EXEC_RDRAND_EXITING;
}
static bool vmx_rdseed_supported(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDSEED;
+ SECONDARY_EXEC_RDSEED_EXITING;
}
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
@@ -5382,30 +5381,30 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (vmx_rdrand_supported()) {
bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
if (rdrand_enabled)
- exec_control &= ~SECONDARY_EXEC_RDRAND;
+ exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
if (nested) {
if (rdrand_enabled)
vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDRAND;
+ SECONDARY_EXEC_RDRAND_EXITING;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDRAND;
+ ~SECONDARY_EXEC_RDRAND_EXITING;
}
}
if (vmx_rdseed_supported()) {
bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
if (rdseed_enabled)
- exec_control &= ~SECONDARY_EXEC_RDSEED;
+ exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
if (nested) {
if (rdseed_enabled)
vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDSEED;
+ SECONDARY_EXEC_RDSEED_EXITING;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDSEED;
+ ~SECONDARY_EXEC_RDSEED_EXITING;
}
}
@@ -5426,7 +5425,7 @@ static void ept_set_mmio_spte_mask(void)
/*
* Sets up the vmcs for emulated real mode.
*/
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
unsigned long a;
@@ -5539,8 +5538,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
-
- return 0;
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -5604,6 +5601,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ if (kvm_mpx_supported())
+ vmcs_write64(GUEST_BNDCFGS, 0);
setup_msrs(vmx);
@@ -5915,8 +5914,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
cr2 = vmcs_readl(EXIT_QUALIFICATION);
/* EPT won't cause page fault directly */
WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0,
- true);
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -6750,16 +6748,14 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ept() ||
!cpu_has_vmx_ept_4levels() ||
- !cpu_has_vmx_ept_mt_wb()) {
+ !cpu_has_vmx_ept_mt_wb() ||
+ !cpu_has_vmx_invept_global())
enable_ept = 0;
- enable_unrestricted_guest = 0;
- enable_ept_ad_bits = 0;
- }
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
- if (!cpu_has_vmx_unrestricted_guest())
+ if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
enable_unrestricted_guest = 0;
if (!cpu_has_vmx_flexpriority())
@@ -6779,8 +6775,13 @@ static __init int hardware_setup(void)
if (enable_ept && !cpu_has_vmx_ept_2m_page())
kvm_disable_largepages();
- if (!cpu_has_vmx_ple())
+ if (!cpu_has_vmx_ple()) {
ple_gap = 0;
+ ple_window = 0;
+ ple_window_grow = 0;
+ ple_window_max = 0;
+ ple_window_shrink = 0;
+ }
if (!cpu_has_vmx_apicv()) {
enable_apicv = 0;
@@ -8418,9 +8419,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_RDPMC:
return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
case EXIT_REASON_RDRAND:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
case EXIT_REASON_RDSEED:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
@@ -9478,7 +9479,6 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vmx->loaded_vmcs = vmcs;
vmx_vcpu_put(vcpu);
vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
put_cpu();
}
@@ -9559,11 +9559,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
vmx->vcpu.cpu = cpu;
- err = vmx_vcpu_setup(vmx);
+ vmx_vcpu_setup(vmx);
vmx_vcpu_put(&vmx->vcpu);
put_cpu();
- if (err)
- goto free_vmcs;
if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
err = alloc_apic_access_page(kvm);
if (err)
@@ -9571,9 +9569,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
}
if (enable_ept) {
- if (!kvm->arch.ept_identity_map_addr)
- kvm->arch.ept_identity_map_addr =
- VMX_EPT_IDENTITY_PAGETABLE_ADDR;
err = init_rmode_identity_map(kvm);
if (err)
goto free_vmcs;
@@ -11297,7 +11292,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
- kvm_set_cr4(vcpu, vmcs12->host_cr4);
+ vmx_set_cr4(vcpu, vmcs12->host_cr4);
nested_ept_uninit_mmu_context(vcpu);
@@ -11328,6 +11323,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
@@ -11424,8 +11421,11 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
leave_guest_mode(vcpu);
if (likely(!vmx->fail)) {
- prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
- exit_qualification);
+ if (exit_reason == -1)
+ sync_vmcs12(vcpu, vmcs12);
+ else
+ prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+ exit_qualification);
if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
vmcs12->vm_exit_msr_store_count))
@@ -11489,7 +11489,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
*/
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (enable_shadow_vmcs)
+ if (enable_shadow_vmcs && exit_reason != -1)
vmx->nested.sync_shadow_vmcs = true;
/* in case we halted in L2 */
@@ -11513,12 +11513,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
}
- trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
- vmcs12->exit_qualification,
- vmcs12->idt_vectoring_info_field,
- vmcs12->vm_exit_intr_info,
- vmcs12->vm_exit_intr_error_code,
- KVM_ISA_VMX);
+ if (exit_reason != -1)
+ trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+ vmcs12->exit_qualification,
+ vmcs12->idt_vectoring_info_field,
+ vmcs12->vm_exit_intr_info,
+ vmcs12->vm_exit_intr_error_code,
+ KVM_ISA_VMX);
load_vmcs12_host_state(vcpu, vmcs12);
@@ -11941,6 +11942,54 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEATURE_CONTROL_LMCE;
}
+static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ /* we need a nested vmexit to enter SMM, postpone if run is pending */
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
+ return 1;
+}
+
+static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+ if (vmx->nested.smm.guest_mode)
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+
+ vmx->nested.smm.vmxon = vmx->nested.vmxon;
+ vmx->nested.vmxon = false;
+ return 0;
+}
+
+static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ret;
+
+ if (vmx->nested.smm.vmxon) {
+ vmx->nested.vmxon = true;
+ vmx->nested.smm.vmxon = false;
+ }
+
+ if (vmx->nested.smm.guest_mode) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ ret = enter_vmx_non_root_mode(vcpu, false);
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ if (ret)
+ return ret;
+
+ vmx->nested.smm.guest_mode = false;
+ }
+ return 0;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -12066,6 +12115,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
#endif
.setup_mce = vmx_setup_mce,
+
+ .smi_allowed = vmx_smi_allowed,
+ .pre_enter_smm = vmx_pre_enter_smm,
+ .pre_leave_smm = vmx_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03869eb7fcd6..34c85aa2e2d1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2006,10 +2006,12 @@ static void kvmclock_sync_fn(struct work_struct *work)
KVMCLOCK_SYNC_PERIOD);
}
-static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 mcg_cap = vcpu->arch.mcg_cap;
unsigned bank_num = mcg_cap & 0xff;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
switch (msr) {
case MSR_IA32_MCG_STATUS:
@@ -2034,6 +2036,9 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if ((offset & 0x3) == 0 &&
data != 0 && (data | (1 << 10)) != ~(u64)0)
return -1;
+ if (!msr_info->host_initiated &&
+ (offset & 0x3) == 1 && data != 0)
+ return -1;
vcpu->arch.mce_banks[offset] = data;
break;
}
@@ -2283,7 +2288,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- return set_msr_mce(vcpu, msr, data);
+ return set_msr_mce(vcpu, msr_info);
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -4034,10 +4039,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_IDENTITY_MAP_ADDR: {
u64 ident_addr;
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto set_identity_unlock;
r = -EFAULT;
if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
- goto out;
+ goto set_identity_unlock;
r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+set_identity_unlock:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_SET_NR_MMU_PAGES:
@@ -5275,6 +5286,11 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla
kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
}
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -5316,6 +5332,7 @@ static const struct x86_emulate_ops emulate_ops = {
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
+ .pre_leave_smm = emulator_pre_leave_smm,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6426,7 +6443,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
kvm_x86_ops->queue_exception(vcpu);
- } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) {
vcpu->arch.smi_pending = false;
enter_smm(vcpu);
} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
@@ -6473,9 +6490,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-#define put_smstate(type, buf, offset, val) \
- *(type *)((buf) + (offset) - 0x7e00) = val
-
static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
{
u32 flags = 0;
@@ -6641,13 +6655,20 @@ static void enter_smm(struct kvm_vcpu *vcpu)
u32 cr0;
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
- vcpu->arch.hflags |= HF_SMM_MASK;
memset(buf, 0, 512);
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
enter_smm_save_state_32(vcpu, buf);
+ /*
+ * Give pre_enter_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. leave guest mode) after we've saved the state into
+ * the SMM state-save area.
+ */
+ kvm_x86_ops->pre_enter_smm(vcpu, buf);
+
+ vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
if (kvm_x86_ops->get_nmi_mask(vcpu))
@@ -6876,17 +6897,23 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
else {
- /* Enable NMI/IRQ window open exits if needed.
+ /* Enable SMI/NMI/IRQ window open exits if needed.
*
- * SMIs have two cases: 1) they can be nested, and
- * then there is nothing to do here because RSM will
- * cause a vmexit anyway; 2) or the SMI can be pending
- * because inject_pending_event has completed the
- * injection of an IRQ or NMI from the previous vmexit,
- * and then we request an immediate exit to inject the SMI.
+ * SMIs have three cases:
+ * 1) They can be nested, and then there is nothing to
+ * do here because RSM will cause a vmexit anyway.
+ * 2) There is an ISA-specific reason why SMI cannot be
+ * injected, and the moment when this changes can be
+ * intercepted.
+ * 3) Or the SMI can be pending because
+ * inject_pending_event has completed the injection
+ * of an IRQ or NMI from the previous vmexit, and
+ * then we request an immediate exit to inject the
+ * SMI.
*/
if (vcpu->arch.smi_pending && !is_smm(vcpu))
- req_immediate_exit = true;
+ if (!kvm_x86_ops->enable_smi_window(vcpu))
+ req_immediate_exit = true;
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -7798,18 +7825,40 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
+ if (kvm_mpx_supported()) {
+ void *mpx_state_buffer;
+
+ /*
+ * To avoid have the INIT path from kvm_apic_has_events() that be
+ * called with loaded FPU and does not let userspace fix the state.
+ */
+ kvm_put_guest_fpu(vcpu);
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDREGS);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDCSR);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+ }
+
if (!init_event) {
kvm_pmu_reset(vcpu);
vcpu->arch.smbase = 0x30000;
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
vcpu->arch.msr_misc_features_enables = 0;
+
+ vcpu->arch.xcr0 = XFEATURE_MASK_FP;
}
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
+ vcpu->arch.ia32_xss = 0;
+
kvm_x86_ops->vcpu_reset(vcpu, init_event);
}
@@ -7974,16 +8023,11 @@ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
- struct kvm *kvm;
int r;
- BUG_ON(vcpu->kvm == NULL);
- kvm = vcpu->kvm;
-
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -8001,7 +8045,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (r < 0)
goto fail_free_pio_data;
- if (irqchip_in_kernel(kvm)) {
+ if (irqchip_in_kernel(vcpu->kvm)) {
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
@@ -8023,10 +8067,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
fx_init(vcpu);
- vcpu->arch.ia32_tsc_adjust_msr = 0x0;
- vcpu->arch.pv_time_enabled = false;
-
- vcpu->arch.guest_supported_xcr0 = 0;
vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 32283d88701a..217cf6f95c36 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -19,9 +19,11 @@ Three different ways of output formatting are available:
The data is sampled from the KVM's debugfs entries and its perf events.
"""
+from __future__ import print_function
import curses
import sys
+import locale
import os
import time
import optparse
@@ -225,6 +227,8 @@ IOCTL_NUMBERS = {
'RESET': 0x00002403,
}
+ENCODING = locale.getpreferredencoding(False)
+
class Arch(object):
"""Encapsulates global architecture specific data.
@@ -666,7 +670,7 @@ class TracepointProvider(Provider):
"""Returns 'event name: current value' for all enabled events."""
ret = defaultdict(int)
for group in self.group_leaders:
- for name, val in group.read().iteritems():
+ for name, val in group.read().items():
if name in self._fields:
ret[name] += val
return ret
@@ -955,7 +959,7 @@ class Tui(object):
except:
raise Exception
for line in child.stdout:
- line = line.lstrip().split(' ', 1)
+ line = line.decode(ENCODING).lstrip().split(' ', 1)
# perform a sanity check before calling the more expensive
# function to possibly extract the guest name
if ' -name ' in line[1]:
@@ -1005,7 +1009,7 @@ class Tui(object):
name = ''
try:
line = open('/proc/{}/cmdline'
- .format(pid), 'rb').read().split('\0')
+ .format(pid), 'r').read().split('\0')
parms = line[line.index('-name') + 1].split(',')
while '' in parms:
# commas are escaped (i.e. ',,'), hence e.g. 'foo,bar' results
@@ -1170,7 +1174,7 @@ class Tui(object):
.format(self.stats.fields_filter))
self.screen.addstr(3, 0, "New regex: ")
curses.echo()
- regex = self.screen.getstr()
+ regex = self.screen.getstr().decode(ENCODING)
curses.noecho()
if len(regex) == 0:
self.stats.fields_filter = DEFAULT_REGEX
@@ -1204,7 +1208,7 @@ class Tui(object):
curses.echo()
self.screen.addstr(3, 0, "Pid [0 or pid]: ")
- pid = self.screen.getstr()
+ pid = self.screen.getstr().decode(ENCODING)
curses.noecho()
try:
@@ -1233,7 +1237,7 @@ class Tui(object):
self.screen.addstr(2, 0, 'Change delay from %.1fs to ' %
self._delay_regular)
curses.echo()
- val = self.screen.getstr()
+ val = self.screen.getstr().decode(ENCODING)
curses.noecho()
try:
@@ -1273,7 +1277,7 @@ class Tui(object):
self.print_all_gnames(7)
curses.echo()
self.screen.addstr(3, 0, "Guest [ENTER or guest]: ")
- gname = self.screen.getstr()
+ gname = self.screen.getstr().decode(ENCODING)
curses.noecho()
if not gname:
@@ -1369,25 +1373,25 @@ def batch(stats):
s = stats.get()
for key in sorted(s.keys()):
values = s[key]
- print '%-42s%10d%10d' % (key, values[0], values[1])
+ print('%-42s%10d%10d' % (key, values[0], values[1]))
except KeyboardInterrupt:
pass
def log(stats):
"""Prints statistics as reiterating key block, multiple value blocks."""
- keys = sorted(stats.get().iterkeys())
+ keys = sorted(stats.get().keys())
def banner():
for k in keys:
- print '%s' % k,
- print
+ print(k, end=' ')
+ print()
def statline():
s = stats.get()
for k in keys:
- print ' %9d' % s[k][1],
- print
+ print(' %9d' % s[k][1], end=' ')
+ print()
line = 0
banner_repeat = 20
while True:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9deb5a245b83..3d73299e05f2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4010,7 +4010,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
if (!vcpu_align)
vcpu_align = __alignof__(struct kvm_vcpu);
kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
- 0, NULL);
+ SLAB_ACCOUNT, NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;
goto out_free_3;