summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/compat.h1
-rw-r--r--arch/x86/include/asm/dma-mapping.h5
-rw-r--r--arch/x86/include/asm/iommu.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h50
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/uaccess.h1
-rw-r--r--arch/x86/include/asm/xen/hypercall.h13
-rw-r--r--arch/x86/kernel/amd_gart_64.c1
-rw-r--r--arch/x86/kernel/pci-calgary_64.c25
-rw-r--r--arch/x86/kernel/pci-dma.c8
-rw-r--r--arch/x86/kernel/pci-nommu.c11
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c84
-rw-r--r--arch/x86/kvm/lapic.c116
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu.c155
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmutrace.h6
-rw-r--r--arch/x86/kvm/svm.c95
-rw-r--r--arch/x86/kvm/vmx.c83
-rw-r--r--arch/x86/kvm/x86.c14
-rw-r--r--arch/x86/pci/sta2x11-fixup.c3
-rw-r--r--arch/x86/xen/enlighten.c154
-rw-r--r--arch/x86/xen/enlighten_hvm.c64
-rw-r--r--arch/x86/xen/enlighten_pv.c89
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c14
-rw-r--r--arch/x86/xen/setup.c7
-rw-r--r--arch/x86/xen/smp.c31
-rw-r--r--arch/x86/xen/smp.h2
-rw-r--r--arch/x86/xen/smp_hvm.c14
-rw-r--r--arch/x86/xen/smp_pv.c6
-rw-r--r--arch/x86/xen/suspend_hvm.c11
-rw-r--r--arch/x86/xen/xen-ops.h3
33 files changed, 672 insertions, 410 deletions
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 24118c0b4640..5343c19814b3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -116,7 +116,6 @@ struct compat_statfs {
int f_spare[4];
};
-#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff
#define COMPAT_RLIM_INFINITY 0xffffffff
typedef u32 compat_old_sigset_t; /* at least 32 bits */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 08a0838b83fb..398c79889f5c 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -19,8 +19,6 @@
# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
#endif
-#define DMA_ERROR_CODE 0
-
extern int iommu_merge;
extern struct device x86_dma_fallback_dev;
extern int panic_on_overflow;
@@ -35,9 +33,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
#define arch_dma_alloc_attrs arch_dma_alloc_attrs
-#define HAVE_ARCH_DMA_SUPPORTED 1
-extern int dma_supported(struct device *hwdev, u64 mask);
-
extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag,
unsigned long attrs);
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 793869879464..fca144a104e4 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -6,6 +6,8 @@ extern int force_iommu, no_iommu;
extern int iommu_detected;
extern int iommu_pass_through;
+int x86_dma_supported(struct device *dev, u64 mask);
+
/* 10 seconds */
#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 695605eb1dfb..1588e9e3dc01 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,28 +48,31 @@
#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
/* x86-specific vcpu->requests bit members */
-#define KVM_REQ_MIGRATE_TIMER 8
-#define KVM_REQ_REPORT_TPR_ACCESS 9
-#define KVM_REQ_TRIPLE_FAULT 10
-#define KVM_REQ_MMU_SYNC 11
-#define KVM_REQ_CLOCK_UPDATE 12
-#define KVM_REQ_EVENT 14
-#define KVM_REQ_APF_HALT 15
-#define KVM_REQ_STEAL_UPDATE 16
-#define KVM_REQ_NMI 17
-#define KVM_REQ_PMU 18
-#define KVM_REQ_PMI 19
-#define KVM_REQ_SMI 20
-#define KVM_REQ_MASTERCLOCK_UPDATE 21
-#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24
-#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_HV_CRASH 26
-#define KVM_REQ_IOAPIC_EOI_EXIT 27
-#define KVM_REQ_HV_RESET 28
-#define KVM_REQ_HV_EXIT 29
-#define KVM_REQ_HV_STIMER 30
+#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0)
+#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1)
+#define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2)
+#define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3)
+#define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4)
+#define KVM_REQ_EVENT KVM_ARCH_REQ(6)
+#define KVM_REQ_APF_HALT KVM_ARCH_REQ(7)
+#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8)
+#define KVM_REQ_NMI KVM_ARCH_REQ(9)
+#define KVM_REQ_PMU KVM_ARCH_REQ(10)
+#define KVM_REQ_PMI KVM_ARCH_REQ(11)
+#define KVM_REQ_SMI KVM_ARCH_REQ(12)
+#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13)
+#define KVM_REQ_MCLOCK_INPROGRESS \
+ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_SCAN_IOAPIC \
+ KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_GLOBAL_CLOCK_UPDATE KVM_ARCH_REQ(16)
+#define KVM_REQ_APIC_PAGE_RELOAD \
+ KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HV_CRASH KVM_ARCH_REQ(18)
+#define KVM_REQ_IOAPIC_EOI_EXIT KVM_ARCH_REQ(19)
+#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20)
+#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
+#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -254,7 +257,8 @@ union kvm_mmu_page_role {
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
- unsigned :8;
+ unsigned ad_disabled:1;
+ unsigned :7;
/*
* This is left at the top of the word so that
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d406894cd9a2..5573c75f8e4c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -426,6 +426,8 @@
#define MSR_IA32_TSC_ADJUST 0x0000003b
#define MSR_IA32_BNDCFGS 0x00000d90
+#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc
+
#define MSR_IA32_XSS 0x00000da0
#define FEATURE_CONTROL_LOCKED (1<<0)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a059aac9e937..476ea27f490b 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -565,7 +565,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
extern __must_check long
strncpy_from_user(char *dst, const char __user *src, long count);
-extern __must_check long strlen_user(const char __user *str);
extern __must_check long strnlen_user(const char __user *str, long n);
unsigned long __must_check clear_user(void __user *mem, unsigned long len);
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index f6d20f6cca12..11071fcd630e 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -43,6 +43,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/smap.h>
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
@@ -50,6 +51,8 @@
#include <xen/interface/platform.h>
#include <xen/interface/xen-mca.h>
+struct xen_dm_op_buf;
+
/*
* The hypercall asms have to meet several constraints:
* - Work on 32- and 64-bit.
@@ -214,10 +217,12 @@ privcmd_call(unsigned call,
__HYPERCALL_DECLS;
__HYPERCALL_5ARG(a1, a2, a3, a4, a5);
+ stac();
asm volatile("call *%[call]"
: __HYPERCALL_5PARAM
: [call] "a" (&hypercall_page[call])
: __HYPERCALL_CLOBBER5);
+ clac();
return (long)__res;
}
@@ -474,9 +479,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
static inline int
HYPERVISOR_dm_op(
- domid_t dom, unsigned int nr_bufs, void *bufs)
+ domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs)
{
- return _hypercall3(int, dm_op, dom, nr_bufs, bufs);
+ int ret;
+ stac();
+ ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs);
+ clac();
+ return ret;
}
static inline void
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 815dd63f49d0..cc0e8bc0ea3f 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -704,6 +704,7 @@ static const struct dma_map_ops gart_dma_ops = {
.alloc = gart_alloc_coherent,
.free = gart_free_coherent,
.mapping_error = gart_mapping_error,
+ .dma_supported = x86_dma_supported,
};
static void gart_iommu_shutdown(void)
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index fda7867046d0..5286a4a92cf7 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -50,6 +50,8 @@
#include <asm/x86_init.h>
#include <asm/iommu_table.h>
+#define CALGARY_MAPPING_ERROR 0
+
#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
int use_calgary __read_mostly = 1;
#else
@@ -252,7 +254,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
if (panic_on_overflow)
panic("Calgary: fix the allocator.\n");
else
- return DMA_ERROR_CODE;
+ return CALGARY_MAPPING_ERROR;
}
}
@@ -272,10 +274,10 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
entry = iommu_range_alloc(dev, tbl, npages);
- if (unlikely(entry == DMA_ERROR_CODE)) {
+ if (unlikely(entry == CALGARY_MAPPING_ERROR)) {
pr_warn("failed to allocate %u pages in iommu %p\n",
npages, tbl);
- return DMA_ERROR_CODE;
+ return CALGARY_MAPPING_ERROR;
}
/* set the return dma address */
@@ -295,7 +297,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned long flags;
/* were we called with bad_dma_address? */
- badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
+ badend = CALGARY_MAPPING_ERROR + (EMERGENCY_PAGES * PAGE_SIZE);
if (unlikely(dma_addr < badend)) {
WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
"address 0x%Lx\n", dma_addr);
@@ -380,7 +382,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
entry = iommu_range_alloc(dev, tbl, npages);
- if (entry == DMA_ERROR_CODE) {
+ if (entry == CALGARY_MAPPING_ERROR) {
/* makes sure unmap knows to stop */
s->dma_length = 0;
goto error;
@@ -398,7 +400,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
error:
calgary_unmap_sg(dev, sg, nelems, dir, 0);
for_each_sg(sg, s, nelems, i) {
- sg->dma_address = DMA_ERROR_CODE;
+ sg->dma_address = CALGARY_MAPPING_ERROR;
sg->dma_length = 0;
}
return 0;
@@ -453,7 +455,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
/* set up tces to cover the allocated range */
mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == DMA_ERROR_CODE)
+ if (mapping == CALGARY_MAPPING_ERROR)
goto free;
*dma_handle = mapping;
return ret;
@@ -478,6 +480,11 @@ static void calgary_free_coherent(struct device *dev, size_t size,
free_pages((unsigned long)vaddr, get_order(size));
}
+static int calgary_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return dma_addr == CALGARY_MAPPING_ERROR;
+}
+
static const struct dma_map_ops calgary_dma_ops = {
.alloc = calgary_alloc_coherent,
.free = calgary_free_coherent,
@@ -485,6 +492,8 @@ static const struct dma_map_ops calgary_dma_ops = {
.unmap_sg = calgary_unmap_sg,
.map_page = calgary_map_page,
.unmap_page = calgary_unmap_page,
+ .mapping_error = calgary_mapping_error,
+ .dma_supported = x86_dma_supported,
};
static inline void __iomem * busno_to_bbar(unsigned char num)
@@ -732,7 +741,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
struct iommu_table *tbl = pci_iommu(dev->bus);
/* reserve EMERGENCY_PAGES from bad_dma_address and up */
- iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
+ iommu_range_reserve(tbl, CALGARY_MAPPING_ERROR, EMERGENCY_PAGES);
/* avoid the BIOS/VGA first 640KB-1MB region */
/* for CalIOC2 - avoid the entire first MB */
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 3a216ec869cd..5e16d3f29594 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -213,10 +213,8 @@ static __init int iommu_setup(char *p)
}
early_param("iommu", iommu_setup);
-int dma_supported(struct device *dev, u64 mask)
+int x86_dma_supported(struct device *dev, u64 mask)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
dev_info(dev, "PCI: Disallowing DAC for device\n");
@@ -224,9 +222,6 @@ int dma_supported(struct device *dev, u64 mask)
}
#endif
- if (ops->dma_supported)
- return ops->dma_supported(dev, mask);
-
/* Copied from i386. Doesn't make much sense, because it will
only work for pci_alloc_coherent.
The caller just has to use GFP_DMA in this case. */
@@ -252,7 +247,6 @@ int dma_supported(struct device *dev, u64 mask)
return 1;
}
-EXPORT_SYMBOL(dma_supported);
static int __init pci_iommu_init(void)
{
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a88952ef371c..a6d404087fe3 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -11,6 +11,8 @@
#include <asm/iommu.h>
#include <asm/dma.h>
+#define NOMMU_MAPPING_ERROR 0
+
static int
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
{
@@ -33,7 +35,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
dma_addr_t bus = page_to_phys(page) + offset;
WARN_ON(size == 0);
if (!check_addr("map_single", dev, bus, size))
- return DMA_ERROR_CODE;
+ return NOMMU_MAPPING_ERROR;
flush_write_buffers();
return bus;
}
@@ -88,6 +90,11 @@ static void nommu_sync_sg_for_device(struct device *dev,
flush_write_buffers();
}
+static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return dma_addr == NOMMU_MAPPING_ERROR;
+}
+
const struct dma_map_ops nommu_dma_ops = {
.alloc = dma_generic_alloc_coherent,
.free = dma_generic_free_coherent,
@@ -96,4 +103,6 @@ const struct dma_map_ops nommu_dma_ops = {
.sync_single_for_device = nommu_sync_single_for_device,
.sync_sg_for_device = nommu_sync_sg_for_device,
.is_phys = 1,
+ .mapping_error = nommu_mapping_error,
+ .dma_supported = x86_dma_supported,
};
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index a6fd40aade7c..da6728383052 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_RTM));
}
+static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_MPX));
+}
+
static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 80890dee66ce..fb0055953fbc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
if (rc != X86EMUL_CONTINUE) \
goto done; \
ctxt->_eip += sizeof(_type); \
- _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \
+ memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \
ctxt->fetch.ptr += sizeof(_type); \
_x; \
})
@@ -3942,6 +3942,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
}
/*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
+ * and restore MXCSR.
+ */
+static size_t __fxstate_size(int nregs)
+{
+ return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
+}
+
+static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
+{
+ bool cr4_osfxsr;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return __fxstate_size(16);
+
+ cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
+ return __fxstate_size(cr4_osfxsr ? 8 : 0);
+}
+
+/*
* FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
* 1) 16 bit mode
* 2) 32 bit mode
@@ -3962,7 +3981,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
static int em_fxsave(struct x86_emulate_ctxt *ctxt)
{
struct fxregs_state fx_state;
- size_t size;
int rc;
rc = check_fxsr(ctxt);
@@ -3978,68 +3996,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)
- size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]);
- else
- size = offsetof(struct fxregs_state, xmm_space[0]);
-
- return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
-}
-
-static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
- struct fxregs_state *new)
-{
- int rc = X86EMUL_CONTINUE;
- struct fxregs_state old;
-
- rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old));
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- /*
- * 64 bit host will restore XMM 8-15, which is not correct on non-64
- * bit guests. Load the current values in order to preserve 64 bit
- * XMMs after fxrstor.
- */
-#ifdef CONFIG_X86_64
- /* XXX: accessing XMM 8-15 very awkwardly */
- memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16);
-#endif
-
- /*
- * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but
- * does save and restore MXCSR.
- */
- if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))
- memcpy(new->xmm_space, old.xmm_space, 8 * 16);
-
- return rc;
+ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
+ fxstate_size(ctxt));
}
static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
{
struct fxregs_state fx_state;
int rc;
+ size_t size;
rc = check_fxsr(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+ ctxt->ops->get_fpu(ctxt);
- if (fx_state.mxcsr >> 16)
- return emulate_gp(ctxt, 0);
+ size = fxstate_size(ctxt);
+ if (size < __fxstate_size(16)) {
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+ if (rc != X86EMUL_CONTINUE)
+ goto out;
+ }
- ctxt->ops->get_fpu(ctxt);
+ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ goto out;
- if (ctxt->mode < X86EMUL_MODE_PROT64)
- rc = fxrstor_fixup(ctxt, &fx_state);
+ if (fx_state.mxcsr >> 16) {
+ rc = emulate_gp(ctxt, 0);
+ goto out;
+ }
if (rc == X86EMUL_CONTINUE)
rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+out:
ctxt->ops->put_fpu(ctxt);
return rc;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d24c8742d9b0..2819d4c123eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1495,6 +1495,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
static void cancel_hv_timer(struct kvm_lapic *apic)
{
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
preempt_disable();
kvm_x86_ops->cancel_hv_timer(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
@@ -1503,25 +1504,56 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
static bool start_hv_timer(struct kvm_lapic *apic)
{
- u64 tscdeadline = apic->lapic_timer.tscdeadline;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ int r;
- if ((atomic_read(&apic->lapic_timer.pending) &&
- !apic_lvtt_period(apic)) ||
- kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
- if (apic->lapic_timer.hv_timer_in_use)
- cancel_hv_timer(apic);
- } else {
- apic->lapic_timer.hv_timer_in_use = true;
- hrtimer_cancel(&apic->lapic_timer.timer);
+ if (!kvm_x86_ops->set_hv_timer)
+ return false;
+
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return false;
- /* In case the sw timer triggered in the window */
- if (atomic_read(&apic->lapic_timer.pending) &&
- !apic_lvtt_period(apic))
- cancel_hv_timer(apic);
+ r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
+ if (r < 0)
+ return false;
+
+ ktimer->hv_timer_in_use = true;
+ hrtimer_cancel(&ktimer->timer);
+
+ /*
+ * Also recheck ktimer->pending, in case the sw timer triggered in
+ * the window. For periodic timer, leave the hv timer running for
+ * simplicity, and the deadline will be recomputed on the next vmexit.
+ */
+ if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) {
+ if (r)
+ apic_timer_expired(apic);
+ return false;
}
- trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
- apic->lapic_timer.hv_timer_in_use);
- return apic->lapic_timer.hv_timer_in_use;
+
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true);
+ return true;
+}
+
+static void start_sw_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ start_sw_period(apic);
+ else if (apic_lvtt_tscdeadline(apic))
+ start_sw_tscdeadline(apic);
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
+}
+
+static void restart_apic_timer(struct kvm_lapic *apic)
+{
+ if (!start_hv_timer(apic))
+ start_sw_timer(apic);
}
void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
@@ -1535,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
advance_periodic_target_expiration(apic);
- if (!start_hv_timer(apic))
- start_sw_period(apic);
+ restart_apic_timer(apic);
}
}
EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- WARN_ON(apic->lapic_timer.hv_timer_in_use);
-
- start_hv_timer(apic);
+ restart_apic_timer(vcpu->arch.apic);
}
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
@@ -1556,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
/* Possibly the TSC deadline timer is not enabled yet */
- if (!apic->lapic_timer.hv_timer_in_use)
- return;
-
- cancel_hv_timer(apic);
+ if (apic->lapic_timer.hv_timer_in_use)
+ start_sw_timer(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
- if (atomic_read(&apic->lapic_timer.pending))
- return;
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
- start_sw_period(apic);
- else if (apic_lvtt_tscdeadline(apic))
- start_sw_tscdeadline(apic);
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ restart_apic_timer(apic);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
static void start_apic_timer(struct kvm_lapic *apic)
{
atomic_set(&apic->lapic_timer.pending, 0);
- if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
- if (set_target_expiration(apic) &&
- !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
- start_sw_period(apic);
- } else if (apic_lvtt_tscdeadline(apic)) {
- if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
- start_sw_tscdeadline(apic);
- }
+ if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ && !set_target_expiration(apic))
+ return;
+
+ restart_apic_timer(apic);
}
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -1813,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
* LAPIC interface
*----------------------------------------------------------------------
*/
-u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (!lapic_in_kernel(vcpu))
- return 0;
-
- return apic->lapic_timer.tscdeadline;
-}
-
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index bcbe811f3b97..29caa2c3dff9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
@@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb8225969255..aafd399cf8c6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -183,13 +183,13 @@ static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_present_mask;
/*
- * The mask/value to distinguish a PTE that has been marked not-present for
- * access tracking purposes.
- * The mask would be either 0 if access tracking is disabled, or
- * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
+ * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
+ * Non-present SPTEs with shadow_acc_track_value set are in place for access
+ * tracking.
*/
static u64 __read_mostly shadow_acc_track_mask;
static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
@@ -207,16 +207,40 @@ static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIF
static void mmu_spte_set(u64 *sptep, u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
{
+ BUG_ON((mmio_mask & mmio_value) != mmio_value);
+ shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
+{
+ return sp->role.ad_disabled;
+}
+
+static inline bool spte_ad_enabled(u64 spte)
+{
+ MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ return !(spte & shadow_acc_track_value);
+}
+
+static inline u64 spte_shadow_accessed_mask(u64 spte)
+{
+ MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
+}
+
+static inline u64 spte_shadow_dirty_mask(u64 spte)
+{
+ MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
+ return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
+}
+
static inline bool is_access_track_spte(u64 spte)
{
- /* Always false if shadow_acc_track_mask is zero. */
- return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+ return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
/*
@@ -270,7 +294,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
u64 mask = generation_mmio_spte_mask(gen);
access &= ACC_WRITE_MASK | ACC_USER_MASK;
- mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
+ mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
@@ -278,7 +302,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
static bool is_mmio_spte(u64 spte)
{
- return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+ return (spte & shadow_mmio_mask) == shadow_mmio_value;
}
static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -315,12 +339,20 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
return likely(kvm_gen == spte_gen);
}
+/*
+ * Sets the shadow PTE masks used by the MMU.
+ *
+ * Assumptions:
+ * - Setting either @accessed_mask or @dirty_mask requires setting both
+ * - At least one of @accessed_mask or @acc_track_mask must be set
+ */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
u64 acc_track_mask)
{
- if (acc_track_mask != 0)
- acc_track_mask |= SPTE_SPECIAL_MASK;
+ BUG_ON(!dirty_mask != !accessed_mask);
+ BUG_ON(!accessed_mask && !acc_track_mask);
+ BUG_ON(acc_track_mask & shadow_acc_track_value);
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
@@ -329,7 +361,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
shadow_x_mask = x_mask;
shadow_present_mask = p_mask;
shadow_acc_track_mask = acc_track_mask;
- WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -549,7 +580,7 @@ static bool spte_has_volatile_bits(u64 spte)
is_access_track_spte(spte))
return true;
- if (shadow_accessed_mask) {
+ if (spte_ad_enabled(spte)) {
if ((spte & shadow_accessed_mask) == 0 ||
(is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
return true;
@@ -560,14 +591,17 @@ static bool spte_has_volatile_bits(u64 spte)
static bool is_accessed_spte(u64 spte)
{
- return shadow_accessed_mask ? spte & shadow_accessed_mask
- : !is_access_track_spte(spte);
+ u64 accessed_mask = spte_shadow_accessed_mask(spte);
+
+ return accessed_mask ? spte & accessed_mask
+ : !is_access_track_spte(spte);
}
static bool is_dirty_spte(u64 spte)
{
- return shadow_dirty_mask ? spte & shadow_dirty_mask
- : spte & PT_WRITABLE_MASK;
+ u64 dirty_mask = spte_shadow_dirty_mask(spte);
+
+ return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}
/* Rules for using mmu_spte_set:
@@ -707,10 +741,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
static u64 mark_spte_for_access_track(u64 spte)
{
- if (shadow_accessed_mask != 0)
+ if (spte_ad_enabled(spte))
return spte & ~shadow_accessed_mask;
- if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
+ if (is_access_track_spte(spte))
return spte;
/*
@@ -729,7 +763,6 @@ static u64 mark_spte_for_access_track(u64 spte)
spte |= (spte & shadow_acc_track_saved_bits_mask) <<
shadow_acc_track_saved_bits_shift;
spte &= ~shadow_acc_track_mask;
- spte |= shadow_acc_track_value;
return spte;
}
@@ -741,6 +774,7 @@ static u64 restore_acc_track_spte(u64 spte)
u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
& shadow_acc_track_saved_bits_mask;
+ WARN_ON_ONCE(spte_ad_enabled(spte));
WARN_ON_ONCE(!is_access_track_spte(spte));
new_spte &= ~shadow_acc_track_mask;
@@ -759,7 +793,7 @@ static bool mmu_spte_age(u64 *sptep)
if (!is_accessed_spte(spte))
return false;
- if (shadow_accessed_mask) {
+ if (spte_ad_enabled(spte)) {
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
} else {
@@ -1390,6 +1424,22 @@ static bool spte_clear_dirty(u64 *sptep)
return mmu_spte_update(sptep, spte);
}
+static bool wrprot_ad_disabled_spte(u64 *sptep)
+{
+ bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
+ (unsigned long *)sptep);
+ if (was_writable)
+ kvm_set_pfn_dirty(spte_to_pfn(*sptep));
+
+ return was_writable;
+}
+
+/*
+ * Gets the GFN ready for another round of dirty logging by clearing the
+ * - D bit on ad-enabled SPTEs, and
+ * - W bit on ad-disabled SPTEs.
+ * Returns true iff any D or W bits were cleared.
+ */
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
@@ -1397,7 +1447,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_clear_dirty(sptep);
+ if (spte_ad_enabled(*sptep))
+ flush |= spte_clear_dirty(sptep);
+ else
+ flush |= wrprot_ad_disabled_spte(sptep);
return flush;
}
@@ -1420,7 +1473,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_set_dirty(sptep);
+ if (spte_ad_enabled(*sptep))
+ flush |= spte_set_dirty(sptep);
return flush;
}
@@ -1452,7 +1506,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
/**
- * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
+ * protect the page if the D-bit isn't supported.
* @kvm: kvm instance
* @slot: slot to clear D-bit
* @gfn_offset: start of the BITS_PER_LONG pages we care about
@@ -1766,18 +1821,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
u64 *sptep;
struct rmap_iterator iter;
- /*
- * If there's no access bit in the secondary pte set by the hardware and
- * fast access tracking is also not enabled, it's up to gup-fast/gup to
- * set the access bit in the primary pte or in the page structure.
- */
- if (!shadow_accessed_mask && !shadow_acc_track_mask)
- goto out;
-
for_each_rmap_spte(rmap_head, &iter, sptep)
if (is_accessed_spte(*sptep))
return 1;
-out:
return 0;
}
@@ -1798,18 +1844,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- /*
- * In case of absence of EPT Access and Dirty Bits supports,
- * emulate the accessed bit for EPT, by checking if this page has
- * an EPT mapping, and clearing it if it does. On the next access,
- * a new EPT mapping will be established.
- * This has some overhead, but not as much as the cost of swapping
- * out actively used pages or breaking up actively used hugepages.
- */
- if (!shadow_accessed_mask && !shadow_acc_track_mask)
- return kvm_handle_hva_range(kvm, start, end, 0,
- kvm_unmap_rmapp);
-
return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
}
@@ -2398,7 +2432,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+ shadow_user_mask | shadow_x_mask;
+
+ if (sp_ad_disabled(sp))
+ spte |= shadow_acc_track_value;
+ else
+ spte |= shadow_accessed_mask;
mmu_spte_set(sptep, spte);
@@ -2666,10 +2705,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
{
u64 spte = 0;
int ret = 0;
+ struct kvm_mmu_page *sp;
if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
+ sp = page_header(__pa(sptep));
+ if (sp_ad_disabled(sp))
+ spte |= shadow_acc_track_value;
+
/*
* For the EPT case, shadow_present_mask is 0 if hardware
* supports exec-only page table entries. In that case,
@@ -2678,7 +2722,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
*/
spte |= shadow_present_mask;
if (!speculative)
- spte |= shadow_accessed_mask;
+ spte |= spte_shadow_accessed_mask(spte);
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
@@ -2735,7 +2779,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (pte_access & ACC_WRITE_MASK) {
kvm_vcpu_mark_page_dirty(vcpu, gfn);
- spte |= shadow_dirty_mask;
+ spte |= spte_shadow_dirty_mask(spte);
}
if (speculative)
@@ -2877,16 +2921,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
{
struct kvm_mmu_page *sp;
+ sp = page_header(__pa(sptep));
+
/*
- * Since it's no accessed bit on EPT, it's no way to
- * distinguish between actually accessed translations
- * and prefetched, so disable pte prefetch if EPT is
- * enabled.
+ * Without accessed bits, there's no way to distinguish between
+ * actually accessed translations and prefetched, so disable pte
+ * prefetch if accessed bits aren't available.
*/
- if (!shadow_accessed_mask)
+ if (sp_ad_disabled(sp))
return;
- sp = page_header(__pa(sptep));
if (sp->role.level > PT_PAGE_TABLE_LEVEL)
return;
@@ -4290,6 +4334,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->base_role.word = 0;
context->base_role.smm = is_smm(vcpu);
+ context->base_role.ad_disabled = (shadow_accessed_mask == 0);
context->page_fault = tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
@@ -4377,6 +4422,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->root_level = context->shadow_root_level;
context->root_hpa = INVALID_PAGE;
context->direct_map = false;
+ context->base_role.ad_disabled = !accessed_dirty;
update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
@@ -4636,6 +4682,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mask.smep_andnot_wp = 1;
mask.smap_andnot_wp = 1;
mask.smm = 1;
+ mask.ad_disabled = 1;
/*
* If we don't have indirect shadow pages, it means no page is
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 330bf3a811fb..a276834950c1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e)
return ((1ULL << (e - s + 1)) - 1) << s;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 5a24b846a1cb..8b97a6cba8d1 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -30,8 +30,9 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \
- " %snxe root %u %s%c", __entry->mmu_valid_gen, \
+ trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \
+ " %snxe %sad root %u %s%c", \
+ __entry->mmu_valid_gen, \
__entry->gfn, role.level, \
role.cr4_pae ? " pae" : "", \
role.quadrant, \
@@ -39,6 +40,7 @@
access_str[role.access], \
role.invalid ? " invalid" : "", \
role.nxe ? "" : "!", \
+ role.ad_disabled ? "!" : "", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
saved_ptr; \
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 33460fcdeef9..905ea6052517 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -190,6 +190,7 @@ struct vcpu_svm {
struct nested_state nested;
bool nmi_singlestep;
+ u64 nmi_singlestep_guest_rflags;
unsigned int3_injected;
unsigned long int3_rip;
@@ -964,6 +965,18 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
+static void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+ svm->nmi_singlestep = false;
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+ /* Clear our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+ }
+}
+
/* Note:
* This hash table is used to map VM_ID to a struct kvm_arch,
* when handling AMD IOMMU GALOG notification to schedule in
@@ -1713,11 +1726,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
- return to_svm(vcpu)->vmcb->save.rflags;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long rflags = svm->vmcb->save.rflags;
+
+ if (svm->nmi_singlestep) {
+ /* Hide our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ rflags &= ~X86_EFLAGS_RF;
+ }
+ return rflags;
}
static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
+ if (to_svm(vcpu)->nmi_singlestep)
+ rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
/*
* Any change of EFLAGS.VM is accompanied by a reload of SS
* (caused by either a task switch or an inter-privilege IRET),
@@ -2112,10 +2138,7 @@ static int db_interception(struct vcpu_svm *svm)
}
if (svm->nmi_singlestep) {
- svm->nmi_singlestep = false;
- if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
- svm->vmcb->save.rflags &=
- ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+ disable_nmi_singlestep(svm);
}
if (svm->vcpu.guest_debug &
@@ -2370,8 +2393,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
static int nested_svm_check_permissions(struct vcpu_svm *svm)
{
- if (!(svm->vcpu.arch.efer & EFER_SVME)
- || !is_paging(&svm->vcpu)) {
+ if (!(svm->vcpu.arch.efer & EFER_SVME) ||
+ !is_paging(&svm->vcpu)) {
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
@@ -2381,7 +2404,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
return 1;
}
- return 0;
+ return 0;
}
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
@@ -2534,6 +2557,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
+/* DB exceptions for our internal use must not cause vmexit */
+static int nested_svm_intercept_db(struct vcpu_svm *svm)
+{
+ unsigned long dr6;
+
+ /* if we're not singlestepping, it's not ours */
+ if (!svm->nmi_singlestep)
+ return NESTED_EXIT_DONE;
+
+ /* if it's not a singlestep exception, it's not ours */
+ if (kvm_get_dr(&svm->vcpu, 6, &dr6))
+ return NESTED_EXIT_DONE;
+ if (!(dr6 & DR6_BS))
+ return NESTED_EXIT_DONE;
+
+ /* if the guest is singlestepping, it should get the vmexit */
+ if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
+ disable_nmi_singlestep(svm);
+ return NESTED_EXIT_DONE;
+ }
+
+ /* it's ours, the nested hypervisor must not see this one */
+ return NESTED_EXIT_HOST;
+}
+
static int nested_svm_exit_special(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
@@ -2589,8 +2637,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
}
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (svm->nested.intercept_exceptions & excp_bits)
- vmexit = NESTED_EXIT_DONE;
+ if (svm->nested.intercept_exceptions & excp_bits) {
+ if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
+ vmexit = nested_svm_intercept_db(svm);
+ else
+ vmexit = NESTED_EXIT_DONE;
+ }
/* async page fault always cause vmexit */
else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
svm->apf_reason != 0)
@@ -4627,10 +4679,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
== HF_NMI_MASK)
return; /* IRET will cause a vm exit */
+ if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0)
+ return; /* STGI will cause a vm exit */
+
+ if (svm->nested.exit_required)
+ return; /* we're not going to run the guest yet */
+
/*
* Something prevents NMI from been injected. Single step over possible
* problem (IRET or exception injection or interrupt shadow)
*/
+ svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
@@ -4771,6 +4830,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->nested.exit_required))
return;
+ /*
+ * Disable singlestep if we're injecting an interrupt/exception.
+ * We don't want our modified rflags to be pushed on the stack where
+ * we might not be able to easily reset them if we disabled NMI
+ * singlestep later.
+ */
+ if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+ /*
+ * Event injection happens before external interrupts cause a
+ * vmexit and interrupts are disabled here, so smp_send_reschedule
+ * is enough to force an immediate vmexit.
+ */
+ disable_nmi_singlestep(svm);
+ smp_send_reschedule(vcpu->cpu);
+ }
+
pre_svm_run(svm);
sync_lapic_to_cr8(vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6dcc4873e435..f76efad248ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -913,8 +913,9 @@ static void nested_release_page_clean(struct page *page)
kvm_release_page_clean(page);
}
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
-static u64 construct_eptp(unsigned long root_hpa);
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -2772,7 +2773,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
if (enable_ept_ad_bits) {
vmx->nested.nested_vmx_secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_PML;
- vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
+ vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
}
} else
vmx->nested.nested_vmx_ept_caps = 0;
@@ -3198,7 +3199,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -3280,7 +3282,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
+ return 1;
+ if (is_noncanonical_address(data & PAGE_MASK) ||
+ (data & MSR_IA32_BNDCFGS_RSVD))
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
@@ -4013,7 +4019,7 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
if (enable_ept) {
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+ ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
} else {
vpid_sync_context(vpid);
}
@@ -4188,14 +4194,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx->emulation_required = emulation_required(vcpu);
}
-static u64 construct_eptp(unsigned long root_hpa)
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
{
u64 eptp;
/* TODO write the value reading from MSR */
eptp = VMX_EPT_DEFAULT_MT |
VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
- if (enable_ept_ad_bits)
+ if (enable_ept_ad_bits &&
+ (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
eptp |= VMX_EPT_AD_ENABLE_BIT;
eptp |= (root_hpa & PAGE_MASK);
@@ -4209,7 +4216,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
guest_cr3 = cr3;
if (enable_ept) {
- eptp = construct_eptp(cr3);
+ eptp = construct_eptp(vcpu, cr3);
vmcs_write64(EPT_POINTER, eptp);
if (is_paging(vcpu) || is_guest_mode(vcpu))
guest_cr3 = kvm_read_cr3(vcpu);
@@ -5170,7 +5177,8 @@ static void ept_set_mmio_spte_mask(void)
* EPT Misconfigurations can be generated if the value of bits 2:0
* of an EPT paging-structure entry is 110b (write/execute).
*/
- kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE);
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
+ VMX_EPT_MISCONFIG_WX_VALUE);
}
#define VMX_XSS_EXIT_BITMAP 0
@@ -6220,17 +6228,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- if (is_guest_mode(vcpu)
- && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) {
- /*
- * Fix up exit_qualification according to whether guest
- * page table accesses are reads or writes.
- */
- u64 eptp = nested_ept_get_cr3(vcpu);
- if (!(eptp & VMX_EPT_AD_ENABLE_BIT))
- exit_qualification &= ~EPT_VIOLATION_ACC_WRITE;
- }
-
/*
* EPT violation happened while executing iret from NMI,
* "blocked by NMI" bit has to be set before next VM entry.
@@ -6453,7 +6450,7 @@ void vmx_enable_tdp(void)
enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
0ull, VMX_EPT_EXECUTABLE_MASK,
cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
- enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK);
+ VMX_EPT_RWX_MASK);
ept_set_mmio_spte_mask();
kvm_enable_tdp();
@@ -6557,7 +6554,6 @@ static __init int hardware_setup(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
vmx_msr_bitmap_legacy, PAGE_SIZE);
@@ -7661,7 +7657,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
unsigned long type, types;
gva_t gva;
struct x86_exception e;
- int vpid;
+ struct {
+ u64 vpid;
+ u64 gla;
+ } operand;
if (!(vmx->nested.nested_vmx_secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
@@ -7691,17 +7690,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
vmx_instruction_info, false, &gva))
return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
- sizeof(u32), &e)) {
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+ sizeof(operand), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
+ if (operand.vpid >> 16) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
+ if (is_noncanonical_address(operand.gla)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ /* fall through */
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
- if (!vpid) {
+ if (!operand.vpid) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
return kvm_skip_emulated_instruction(vcpu);
@@ -9394,6 +9404,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
vmcs12->guest_physical_address = fault->address;
}
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
+{
+ return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT;
+}
+
/* Callbacks for nested_ept_init_mmu_context: */
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
@@ -9404,18 +9419,18 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
- u64 eptp;
+ bool wants_ad;
WARN_ON(mmu_is_nested(vcpu));
- eptp = nested_ept_get_cr3(vcpu);
- if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits)
+ wants_ad = nested_ept_ad_enabled(vcpu);
+ if (wants_ad && !enable_ept_ad_bits)
return 1;
kvm_mmu_unload(vcpu);
kvm_init_shadow_ept_mmu(vcpu,
to_vmx(vcpu)->nested.nested_vmx_ept_caps &
VMX_EPT_EXECUTE_ONLY_BIT,
- eptp & VMX_EPT_AD_ENABLE_BIT);
+ wants_ad);
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -10728,8 +10743,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
}
- if (nested_cpu_has_ept(vmcs12))
- vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+ vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
if (nested_cpu_has_vid(vmcs12))
vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
@@ -10754,8 +10768,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
if (kvm_mpx_supported())
vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
- if (nested_cpu_has_xsaves(vmcs12))
- vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
}
/*
@@ -11152,7 +11164,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
vmx->hv_deadline_tsc = tscl + delta_tsc;
vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
PIN_BASED_VMX_PREEMPTION_TIMER);
- return 0;
+
+ return delta_tsc == 0;
}
static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e846f0cb83b..6c7266f7766d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2841,10 +2841,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
- if (kvm_lapic_hv_timer_in_use(vcpu) &&
- kvm_x86_ops->set_hv_timer(vcpu,
- kvm_get_lapic_target_expiration_tsc(vcpu)))
- kvm_lapic_switch_to_sw_timer(vcpu);
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_restart_hv_timer(vcpu);
+
/*
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
@@ -6011,7 +6011,7 @@ static void kvm_set_mmio_spte_mask(void)
mask &= ~1ull;
#endif
- kvm_mmu_set_mmio_spte_mask(mask);
+ kvm_mmu_set_mmio_spte_mask(mask, mask);
}
#ifdef CONFIG_X86_64
@@ -6733,7 +6733,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
- if (vcpu->requests) {
+ if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -6897,7 +6897,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->sync_pir_to_irr(vcpu);
}
- if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
+ if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
|| need_resched() || signal_pending(current)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index ec008e800b45..53d600217973 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -26,6 +26,7 @@
#include <linux/pci_ids.h>
#include <linux/export.h>
#include <linux/list.h>
+#include <asm/iommu.h>
#define STA2X11_SWIOTLB_SIZE (4*1024*1024)
extern int swiotlb_late_init_with_default_size(size_t default_size);
@@ -191,7 +192,7 @@ static const struct dma_map_ops sta2x11_dma_ops = {
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = swiotlb_sync_sg_for_device,
.mapping_error = swiotlb_dma_mapping_error,
- .dma_supported = NULL, /* FIXME: we should use this instead! */
+ .dma_supported = x86_dma_supported,
};
/* At setup time, we use our own ops if the device is a ConneXt one */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a5ffcbb20cc0..0e7ef69e8531 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -106,15 +106,83 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
return rc >= 0 ? 0 : rc;
}
-static void clamp_max_cpus(void)
+static int xen_vcpu_setup_restore(int cpu)
{
-#ifdef CONFIG_SMP
- if (setup_max_cpus > MAX_VIRT_CPUS)
- setup_max_cpus = MAX_VIRT_CPUS;
-#endif
+ int rc = 0;
+
+ /* Any per_cpu(xen_vcpu) is stale, so reset it */
+ xen_vcpu_info_reset(cpu);
+
+ /*
+ * For PVH and PVHVM, setup online VCPUs only. The rest will
+ * be handled by hotplug.
+ */
+ if (xen_pv_domain() ||
+ (xen_hvm_domain() && cpu_online(cpu))) {
+ rc = xen_vcpu_setup(cpu);
+ }
+
+ return rc;
+}
+
+/*
+ * On restore, set the vcpu placement up again.
+ * If it fails, then we're in a bad state, since
+ * we can't back out from using it...
+ */
+void xen_vcpu_restore(void)
+{
+ int cpu, rc;
+
+ for_each_possible_cpu(cpu) {
+ bool other_cpu = (cpu != smp_processor_id());
+ bool is_up;
+
+ if (xen_vcpu_nr(cpu) == XEN_VCPU_ID_INVALID)
+ continue;
+
+ /* Only Xen 4.5 and higher support this. */
+ is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up,
+ xen_vcpu_nr(cpu), NULL) > 0;
+
+ if (other_cpu && is_up &&
+ HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
+ BUG();
+
+ if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
+ xen_setup_runstate_info(cpu);
+
+ rc = xen_vcpu_setup_restore(cpu);
+ if (rc)
+ pr_emerg_once("vcpu restore failed for cpu=%d err=%d. "
+ "System will hang.\n", cpu, rc);
+ /*
+ * In case xen_vcpu_setup_restore() fails, do not bring up the
+ * VCPU. This helps us avoid the resulting OOPS when the VCPU
+ * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.)
+ * Note that this does not improve the situation much -- now the
+ * VM hangs instead of OOPSing -- with the VCPUs that did not
+ * fail, spinning in stop_machine(), waiting for the failed
+ * VCPUs to come up.
+ */
+ if (other_cpu && is_up && (rc == 0) &&
+ HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
+ BUG();
+ }
}
-void xen_vcpu_setup(int cpu)
+void xen_vcpu_info_reset(int cpu)
+{
+ if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) {
+ per_cpu(xen_vcpu, cpu) =
+ &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
+ } else {
+ /* Set to NULL so that if somebody accesses it we get an OOPS */
+ per_cpu(xen_vcpu, cpu) = NULL;
+ }
+}
+
+int xen_vcpu_setup(int cpu)
{
struct vcpu_register_vcpu_info info;
int err;
@@ -123,11 +191,11 @@ void xen_vcpu_setup(int cpu)
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
/*
- * This path is called twice on PVHVM - first during bootup via
- * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being
- * hotplugged: cpu_up -> xen_hvm_cpu_notify.
- * As we can only do the VCPUOP_register_vcpu_info once lets
- * not over-write its result.
+ * This path is called on PVHVM at bootup (xen_hvm_smp_prepare_boot_cpu)
+ * and at restore (xen_vcpu_restore). Also called for hotplugged
+ * VCPUs (cpu_init -> xen_hvm_cpu_prepare_hvm).
+ * However, the hypercall can only be done once (see below) so if a VCPU
+ * is offlined and comes back online then let's not redo the hypercall.
*
* For PV it is called during restore (xen_vcpu_restore) and bootup
* (xen_setup_vcpu_info_placement). The hotplug mechanism does not
@@ -135,42 +203,44 @@ void xen_vcpu_setup(int cpu)
*/
if (xen_hvm_domain()) {
if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
- return;
+ return 0;
}
- if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
- per_cpu(xen_vcpu, cpu) =
- &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
- if (!xen_have_vcpu_info_placement) {
- if (cpu >= MAX_VIRT_CPUS)
- clamp_max_cpus();
- return;
+ if (xen_have_vcpu_info_placement) {
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
+ info.mfn = arbitrary_virt_to_mfn(vcpup);
+ info.offset = offset_in_page(vcpup);
+
+ /*
+ * Check to see if the hypervisor will put the vcpu_info
+ * structure where we want it, which allows direct access via
+ * a percpu-variable.
+ * N.B. This hypercall can _only_ be called once per CPU.
+ * Subsequent calls will error out with -EINVAL. This is due to
+ * the fact that hypervisor has no unregister variant and this
+ * hypercall does not allow to over-write info.mfn and
+ * info.offset.
+ */
+ err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info,
+ xen_vcpu_nr(cpu), &info);
+
+ if (err) {
+ pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n",
+ cpu, err);
+ xen_have_vcpu_info_placement = 0;
+ } else {
+ /*
+ * This cpu is using the registered vcpu info, even if
+ * later ones fail to.
+ */
+ per_cpu(xen_vcpu, cpu) = vcpup;
+ }
}
- vcpup = &per_cpu(xen_vcpu_info, cpu);
- info.mfn = arbitrary_virt_to_mfn(vcpup);
- info.offset = offset_in_page(vcpup);
-
- /* Check to see if the hypervisor will put the vcpu_info
- structure where we want it, which allows direct access via
- a percpu-variable.
- N.B. This hypercall can _only_ be called once per CPU. Subsequent
- calls will error out with -EINVAL. This is due to the fact that
- hypervisor has no unregister variant and this hypercall does not
- allow to over-write info.mfn and info.offset.
- */
- err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu),
- &info);
+ if (!xen_have_vcpu_info_placement)
+ xen_vcpu_info_reset(cpu);
- if (err) {
- printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
- xen_have_vcpu_info_placement = 0;
- clamp_max_cpus();
- } else {
- /* This cpu is using the registered vcpu info, even if
- later ones fail to. */
- per_cpu(xen_vcpu, cpu) = vcpup;
- }
+ return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0);
}
void xen_reboot(int reason)
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index a6d014f47e52..87d791356ea9 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -1,5 +1,6 @@
#include <linux/cpu.h>
#include <linux/kexec.h>
+#include <linux/memblock.h>
#include <xen/features.h>
#include <xen/events.h>
@@ -10,9 +11,11 @@
#include <asm/reboot.h>
#include <asm/setup.h>
#include <asm/hypervisor.h>
+#include <asm/e820/api.h>
#include <asm/xen/cpuid.h>
#include <asm/xen/hypervisor.h>
+#include <asm/xen/page.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -20,37 +23,34 @@
void __ref xen_hvm_init_shared_info(void)
{
- int cpu;
struct xen_add_to_physmap xatp;
- static struct shared_info *shared_info_page;
+ u64 pa;
+
+ if (HYPERVISOR_shared_info == &xen_dummy_shared_info) {
+ /*
+ * Search for a free page starting at 4kB physical address.
+ * Low memory is preferred to avoid an EPT large page split up
+ * by the mapping.
+ * Starting below X86_RESERVE_LOW (usually 64kB) is fine as
+ * the BIOS used for HVM guests is well behaved and won't
+ * clobber memory other than the first 4kB.
+ */
+ for (pa = PAGE_SIZE;
+ !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) ||
+ memblock_is_reserved(pa);
+ pa += PAGE_SIZE)
+ ;
+
+ memblock_reserve(pa, PAGE_SIZE);
+ HYPERVISOR_shared_info = __va(pa);
+ }
- if (!shared_info_page)
- shared_info_page = (struct shared_info *)
- extend_brk(PAGE_SIZE, PAGE_SIZE);
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
- xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+ xatp.gpfn = virt_to_pfn(HYPERVISOR_shared_info);
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
-
- HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
-
- /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
- * page, we use it in the event channel upcall and in some pvclock
- * related functions. We don't need the vcpu_info placement
- * optimizations because we don't use any pv_mmu or pv_irq op on
- * HVM.
- * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
- * online but xen_hvm_init_shared_info is run at resume time too and
- * in that case multiple vcpus might be online. */
- for_each_online_cpu(cpu) {
- /* Leave it to be NULL. */
- if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS)
- continue;
- per_cpu(xen_vcpu, cpu) =
- &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
- }
}
static void __init init_hvm_pv_info(void)
@@ -106,7 +106,7 @@ static void xen_hvm_crash_shutdown(struct pt_regs *regs)
static int xen_cpu_up_prepare_hvm(unsigned int cpu)
{
- int rc;
+ int rc = 0;
/*
* This can happen if CPU was offlined earlier and
@@ -121,7 +121,9 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
else
per_cpu(xen_vcpu_id, cpu) = cpu;
- xen_vcpu_setup(cpu);
+ rc = xen_vcpu_setup(cpu);
+ if (rc)
+ return rc;
if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_timer(cpu);
@@ -130,9 +132,8 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
if (rc) {
WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
cpu, rc);
- return rc;
}
- return 0;
+ return rc;
}
static int xen_cpu_dead_hvm(unsigned int cpu)
@@ -154,6 +155,13 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_shared_info();
+ /*
+ * xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+ * page, we use it in the event channel upcall and in some pvclock
+ * related functions.
+ */
+ xen_vcpu_info_reset(0);
+
xen_panic_handler_init();
if (xen_feature(XENFEAT_hvm_callback_vector))
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index f33eef4ebd12..811e4ddb3f37 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -89,8 +89,6 @@
void *xen_initial_gdt;
-RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
-
static int xen_cpu_up_prepare_pv(unsigned int cpu);
static int xen_cpu_dead_pv(unsigned int cpu);
@@ -107,35 +105,6 @@ struct tls_descs {
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
-/*
- * On restore, set the vcpu placement up again.
- * If it fails, then we're in a bad state, since
- * we can't back out from using it...
- */
-void xen_vcpu_restore(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- bool other_cpu = (cpu != smp_processor_id());
- bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu),
- NULL);
-
- if (other_cpu && is_up &&
- HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
- BUG();
-
- xen_setup_runstate_info(cpu);
-
- if (xen_have_vcpu_info_placement)
- xen_vcpu_setup(cpu);
-
- if (other_cpu && is_up &&
- HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
- BUG();
- }
-}
-
static void __init xen_banner(void)
{
unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
@@ -960,30 +929,43 @@ void xen_setup_shared_info(void)
HYPERVISOR_shared_info =
(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-#ifndef CONFIG_SMP
- /* In UP this is as good a place as any to set up shared info */
- xen_setup_vcpu_info_placement();
-#endif
-
xen_setup_mfn_list_list();
- /*
- * Now that shared info is set up we can start using routines that
- * point to pvclock area.
- */
- if (system_state == SYSTEM_BOOTING)
+ if (system_state == SYSTEM_BOOTING) {
+#ifndef CONFIG_SMP
+ /*
+ * In UP this is as good a place as any to set up shared info.
+ * Limit this to boot only, at restore vcpu setup is done via
+ * xen_vcpu_restore().
+ */
+ xen_setup_vcpu_info_placement();
+#endif
+ /*
+ * Now that shared info is set up we can start using routines
+ * that point to pvclock area.
+ */
xen_init_time_ops();
+ }
}
/* This is called once we have the cpu_possible_mask */
-void xen_setup_vcpu_info_placement(void)
+void __ref xen_setup_vcpu_info_placement(void)
{
int cpu;
for_each_possible_cpu(cpu) {
/* Set up direct vCPU id mapping for PV guests. */
per_cpu(xen_vcpu_id, cpu) = cpu;
- xen_vcpu_setup(cpu);
+
+ /*
+ * xen_vcpu_setup(cpu) can fail -- in which case it
+ * falls back to the shared_info version for cpus
+ * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS.
+ *
+ * xen_cpu_up_prepare_pv() handles the rest by failing
+ * them in hotplug.
+ */
+ (void) xen_vcpu_setup(cpu);
}
/*
@@ -1332,9 +1314,17 @@ asmlinkage __visible void __init xen_start_kernel(void)
*/
acpi_numa = -1;
#endif
- /* Don't do the full vcpu_info placement stuff until we have a
- possible map and a non-dummy shared_info. */
- per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+ /* Let's presume PV guests always boot on vCPU with id 0. */
+ per_cpu(xen_vcpu_id, 0) = 0;
+
+ /*
+ * Setup xen_vcpu early because start_kernel needs it for
+ * local_irq_disable(), irqs_disabled().
+ *
+ * Don't do the full vcpu_info placement stuff until we have
+ * the cpu_possible_mask and a non-dummy shared_info.
+ */
+ xen_vcpu_info_reset(0);
WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
@@ -1431,9 +1421,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
#endif
xen_raw_console_write("about to get started...\n");
- /* Let's presume PV guests always boot on vCPU with id 0. */
- per_cpu(xen_vcpu_id, 0) = 0;
-
+ /* We need this for printk timestamps */
xen_setup_runstate_info(0);
xen_efi_init();
@@ -1451,6 +1439,9 @@ static int xen_cpu_up_prepare_pv(unsigned int cpu)
{
int rc;
+ if (per_cpu(xen_vcpu, cpu) == NULL)
+ return -ENODEV;
+
xen_setup_timer(cpu);
rc = xen_smp_intr_init(cpu);
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index 42b08f8fc2ca..37c6056a7bba 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -18,20 +18,6 @@
int xen_swiotlb __read_mostly;
-static const struct dma_map_ops xen_swiotlb_dma_ops = {
- .alloc = xen_swiotlb_alloc_coherent,
- .free = xen_swiotlb_free_coherent,
- .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
- .sync_single_for_device = xen_swiotlb_sync_single_for_device,
- .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
- .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
- .map_sg = xen_swiotlb_map_sg_attrs,
- .unmap_sg = xen_swiotlb_unmap_sg_attrs,
- .map_page = xen_swiotlb_map_page,
- .unmap_page = xen_swiotlb_unmap_page,
- .dma_supported = xen_swiotlb_dma_supported,
-};
-
/*
* pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
*
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a5bf7c451435..c81046323ebc 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -499,7 +499,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
void __init xen_remap_memory(void)
{
unsigned long buf = (unsigned long)&xen_remap_buf;
- unsigned long mfn_save, mfn, pfn;
+ unsigned long mfn_save, pfn;
unsigned long remapped = 0;
unsigned int i;
unsigned long pfn_s = ~0UL;
@@ -515,8 +515,7 @@ void __init xen_remap_memory(void)
pfn = xen_remap_buf.target_pfn;
for (i = 0; i < xen_remap_buf.size; i++) {
- mfn = xen_remap_buf.mfns[i];
- xen_update_mem_tables(pfn, mfn);
+ xen_update_mem_tables(pfn, xen_remap_buf.mfns[i]);
remapped++;
pfn++;
}
@@ -530,8 +529,6 @@ void __init xen_remap_memory(void)
pfn_s = xen_remap_buf.target_pfn;
len = xen_remap_buf.size;
}
-
- mfn = xen_remap_mfn;
xen_remap_mfn = xen_remap_buf.next_area_mfn;
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 82ac611f2fc1..e7f02eb73727 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -1,4 +1,5 @@
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
@@ -114,6 +115,36 @@ int xen_smp_intr_init(unsigned int cpu)
return rc;
}
+void __init xen_smp_cpus_done(unsigned int max_cpus)
+{
+ int cpu, rc, count = 0;
+
+ if (xen_hvm_domain())
+ native_smp_cpus_done(max_cpus);
+
+ if (xen_have_vcpu_info_placement)
+ return;
+
+ for_each_online_cpu(cpu) {
+ if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
+ continue;
+
+ rc = cpu_down(cpu);
+
+ if (rc == 0) {
+ /*
+ * Reset vcpu_info so this cpu cannot be onlined again.
+ */
+ xen_vcpu_info_reset(cpu);
+ count++;
+ } else {
+ pr_warn("%s: failed to bring CPU %d down, error %d\n",
+ __func__, cpu, rc);
+ }
+ }
+ WARN(count, "%s: brought %d CPUs offline\n", __func__, count);
+}
+
void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index 8ebb6acca64a..87d3c76cba37 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -14,6 +14,8 @@ extern void xen_smp_intr_free(unsigned int cpu);
int xen_smp_intr_init_pv(unsigned int cpu);
void xen_smp_intr_free_pv(unsigned int cpu);
+void xen_smp_cpus_done(unsigned int max_cpus);
+
void xen_smp_send_reschedule(int cpu);
void xen_smp_send_call_function_ipi(const struct cpumask *mask);
void xen_smp_send_call_function_single_ipi(int cpu);
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index f18561bbf5c9..fd60abedf658 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -12,7 +12,8 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void)
native_smp_prepare_boot_cpu();
/*
- * Setup vcpu_info for boot CPU.
+ * Setup vcpu_info for boot CPU. Secondary CPUs get their vcpu_info
+ * in xen_cpu_up_prepare_hvm().
*/
xen_vcpu_setup(0);
@@ -27,10 +28,20 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void)
static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
{
+ int cpu;
+
native_smp_prepare_cpus(max_cpus);
WARN_ON(xen_smp_intr_init(0));
xen_init_lock_cpu(0);
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == 0)
+ continue;
+
+ /* Set default vcpu_id to make sure that we don't use cpu-0's */
+ per_cpu(xen_vcpu_id, cpu) = XEN_VCPU_ID_INVALID;
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -60,4 +71,5 @@ void __init xen_hvm_smp_init(void)
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu;
+ smp_ops.smp_cpus_done = xen_smp_cpus_done;
}
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index aae32535f4ec..1ea598e5f030 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -371,10 +371,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
return 0;
}
-static void xen_pv_smp_cpus_done(unsigned int max_cpus)
-{
-}
-
#ifdef CONFIG_HOTPLUG_CPU
static int xen_pv_cpu_disable(void)
{
@@ -469,7 +465,7 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
static const struct smp_ops xen_smp_ops __initconst = {
.smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_pv_smp_prepare_cpus,
- .smp_cpus_done = xen_pv_smp_cpus_done,
+ .smp_cpus_done = xen_smp_cpus_done,
.cpu_up = xen_pv_cpu_up,
.cpu_die = xen_pv_cpu_die,
diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c
index 01afcadde50a..484999416d8b 100644
--- a/arch/x86/xen/suspend_hvm.c
+++ b/arch/x86/xen/suspend_hvm.c
@@ -8,15 +8,10 @@
void xen_hvm_post_suspend(int suspend_cancelled)
{
- int cpu;
-
- if (!suspend_cancelled)
+ if (!suspend_cancelled) {
xen_hvm_init_shared_info();
+ xen_vcpu_restore();
+ }
xen_callback_vector();
xen_unplug_emulated_devices();
- if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
- for_each_online_cpu(cpu) {
- xen_setup_runstate_info(cpu);
- }
- }
}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9a440a42c618..0d5004477db6 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -78,7 +78,8 @@ bool xen_vcpu_stolen(int vcpu);
extern int xen_have_vcpu_info_placement;
-void xen_vcpu_setup(int cpu);
+int xen_vcpu_setup(int cpu);
+void xen_vcpu_info_reset(int cpu);
void xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP